diff --git "a/slurm.out" "b/slurm.out" new file mode 100644--- /dev/null +++ "b/slurm.out" @@ -0,0 +1,2860 @@ +0: W1123 14:35:36.265000 1308318 torch/distributed/run.py:792] +0: W1123 14:35:36.265000 1308318 torch/distributed/run.py:792] ***************************************** +0: W1123 14:35:36.265000 1308318 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +0: W1123 14:35:36.265000 1308318 torch/distributed/run.py:792] ***************************************** +3: W1123 14:35:36.265000 3160663 torch/distributed/run.py:792] +3: W1123 14:35:36.265000 3160663 torch/distributed/run.py:792] ***************************************** +3: W1123 14:35:36.265000 3160663 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +3: W1123 14:35:36.265000 3160663 torch/distributed/run.py:792] ***************************************** +1: W1123 14:35:36.265000 364677 torch/distributed/run.py:792] +1: W1123 14:35:36.265000 364677 torch/distributed/run.py:792] ***************************************** +1: W1123 14:35:36.265000 364677 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +1: W1123 14:35:36.265000 364677 torch/distributed/run.py:792] ***************************************** +2: W1123 14:35:36.265000 3283449 torch/distributed/run.py:792] +2: W1123 14:35:36.265000 3283449 torch/distributed/run.py:792] ***************************************** +2: W1123 14:35:36.265000 3283449 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +2: W1123 14:35:36.265000 3283449 torch/distributed/run.py:792] ***************************************** +1: [2025-11-23 14:40:32,705] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:364771] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +1: [2025-11-23 14:40:32,705] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:364771] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +2: [2025-11-23 14:40:32,705] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3283536] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +2: [2025-11-23 14:40:32,705] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3283536] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +0: [2025-11-23 14:40:32,706] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:1308399] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +0: [2025-11-23 14:40:32,706] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:1308399] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +3: [2025-11-23 14:40:32,706] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3160742] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +3: [2025-11-23 14:40:32,706] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3160742] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +0: [2025-11-23 14:41:03,101] [WARNING] [axolotl.utils.config.normalize_config:139] [PID:1308399] [RANK:0] Invalid value for save_steps (1.6666666666666667) from saves_per_epoch and/or num_epochs. Saving at training end only. +0: [2025-11-23 14:41:03,271] [INFO] [axolotl.cli.config.load_cfg:245] [PID:1308399] [RANK:0] config: +0: { +0: "activation_offloading": false, +0: "auto_resume_from_checkpoints": true, +0: "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1763904854545720057.yaml", +0: "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-4b", +0: "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-4b", +0: "batch_size": 16, +0: "bf16": true, +0: "capabilities": { +0: "bf16": true, +0: "compute_capability": "sm_90", +0: "fp8": false, +0: "n_gpu": 16, +0: "n_node": 1 +0: }, +0: "chat_template": "gemma3", +0: "context_parallel_size": 1, +0: "curriculum_sampling": true, +0: "dataloader_num_workers": 2, +0: "dataset_prepared_path": "/lustre/fswork/projects/rech/dgo/udv55np/dataset_gemma/Nemotron-Super-49B-v1_5/split_0.25", +0: "dataset_processes": 32, +0: "datasets": [ +0: { +0: "chat_template": "tokenizer_default", +0: "data_files": [ +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0014.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0010.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0012.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0008.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0001.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0002.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0013.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0015.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0004.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0011.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0000.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0003.jsonl" +0: ], +0: "ds_type": "json", +0: "field_messages": "conversations", +0: "message_property_mappings": { +0: "content": "content", +0: "role": "role" +0: }, +0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking", +0: "trust_remote_code": false, +0: "type": "chat_template" +0: }, +0: { +0: "chat_template": "tokenizer_default", +0: "data_files": [ +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0007.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0009.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0005.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0006.jsonl" +0: ], +0: "ds_type": "json", +0: "field_messages": "conversations", +0: "message_property_mappings": { +0: "content": "content", +0: "role": "role" +0: }, +0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking", +0: "trust_remote_code": false, +0: "type": "chat_template" +0: } +0: ], +0: "ddp": true, +0: "deepspeed": { +0: "bf16": { +0: "enabled": true +0: }, +0: "gradient_accumulation_steps": "auto", +0: "gradient_clipping": "auto", +0: "train_batch_size": "auto", +0: "train_micro_batch_size_per_gpu": "auto", +0: "wall_clock_breakdown": false, +0: "zero_optimization": { +0: "contiguous_gradients": true, +0: "overlap_comm": true, +0: "reduce_bucket_size": "auto", +0: "stage": 3, +0: "stage3_gather_16bit_weights_on_model_save": true, +0: "stage3_param_persistence_threshold": "auto", +0: "stage3_prefetch_bucket_size": "auto", +0: "sub_group_size": 0 +0: } +0: }, +0: "device": "cuda:0", +0: "device_map": { +0: "": 0 +0: }, +0: "dion_rank_fraction": 1.0, +0: "dion_rank_multiple_of": 1, +0: "env_capabilities": { +0: "torch_version": "2.6.0" +0: }, +0: "eot_tokens": [ +0: "" +0: ], +0: "eval_batch_size": 1, +0: "eval_causal_lm_metrics": [ +0: "sacrebleu", +0: "comet", +0: "ter", +0: "chrf" +0: ], +0: "eval_max_new_tokens": 128, +0: "eval_sample_packing": true, +0: "eval_table_size": 0, +0: "evals_per_epoch": 0, +0: "flash_attention": true, +0: "fp16": false, +0: "gradient_accumulation_steps": 1, +0: "gradient_checkpointing": true, +0: "gradient_checkpointing_kwargs": { +0: "use_reentrant": true +0: }, +0: "is_multimodal": true, +0: "learning_rate": 5e-06, +0: "lisa_layers_attribute": "model.layers", +0: "load_best_model_at_end": false, +0: "load_in_4bit": false, +0: "load_in_8bit": false, +0: "local_rank": 0, +0: "logging_steps": 10, +0: "lora_dropout": 0.0, +0: "loraplus_lr_embedding": 1e-06, +0: "lr_scheduler": "warmup_stable_decay", +0: "lr_scheduler_kwargs": { +0: "min_lr_ratio": 0.1, +0: "num_decay_steps": 200 +0: }, +0: "max_prompt_len": 512, +0: "mean_resizing_embeddings": false, +0: "micro_batch_size": 1, +0: "model_config_type": "gemma3", +0: "num_epochs": 0.6, +0: "optimizer": "adamw_torch_fused", +0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-4b/0.25", +0: "pad_to_sequence_len": true, +0: "pretrain_multipack_attn": true, +0: "pretrain_multipack_buffer_size": 10000, +0: "processor_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-4b", +0: "profiler_steps_start": 0, +0: "qlora_sharded_model_loading": false, +0: "ray_num_workers": 1, +0: "resources_per_worker": { +0: "GPU": 1 +0: }, +0: "sample_packing": true, +0: "sample_packing_bin_size": 200, +0: "sample_packing_group_size": 100000, +0: "sample_packing_sequentially": true, +0: "save_only_model": true, +0: "save_safetensors": true, +0: "save_total_limit": 20, +0: "saves_per_epoch": 1, +0: "sequence_len": 16384, +0: "shuffle_before_merging_datasets": true, +0: "shuffle_merged_datasets": false, +0: "skip_prepare_dataset": false, +0: "strict": false, +0: "tensor_parallel_size": 1, +0: "tf32": false, +0: "tiled_mlp_use_original_mlp": true, +0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-27b", +0: "torch_dtype": "torch.bfloat16", +0: "train_on_inputs": false, +0: "trl": { +0: "log_completions": false, +0: "mask_truncated_completions": false, +0: "ref_model_mixup_alpha": 0.9, +0: "ref_model_sync_steps": 64, +0: "scale_rewards": true, +0: "sync_ref_model": false, +0: "use_vllm": false, +0: "vllm_server_host": "0.0.0.0", +0: "vllm_server_port": 8000 +0: }, +0: "use_ray": false, +0: "use_tensorboard": true, +0: "val_set_size": 0.0, +0: "vllm": { +0: "device": "auto", +0: "dtype": "auto", +0: "gpu_memory_utilization": 0.9, +0: "host": "0.0.0.0", +0: "port": 8000 +0: }, +0: "warmup_steps": 100, +0: "weight_decay": 0.0, +0: "world_size": 16 +0: } +0: [2025-11-23 14:41:03,272] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:1308399] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used. +0: [2025-11-23 14:41:05,217] [INFO] [axolotl.utils.data.sft._load_raw_datasets:314] [PID:1308402] [RANK:3] Loading raw datasets... +0: Generating train split: 0 examples [00:00, ? examples/s] Generating train split: 4846 examples [00:00, 15248.57 examples/s] Generating train split: 9519 examples [00:00, 24463.45 examples/s] Generating train split: 18753 examples [00:00, 39086.15 examples/s] Generating train split: 39500 examples [00:00, 59505.62 examples/s] Generating train split: 48267 examples [00:01, 55975.58 examples/s] Generating train split: 66919 examples [00:01, 76634.72 examples/s] Generating train split: 78951 examples [00:01, 65800.19 examples/s] Generating train split: 97624 examples [00:01, 82425.21 examples/s] Generating train split: 109206 examples [00:01, 80541.54 examples/s] Generating train split: 122934 examples [00:01, 78509.21 examples/s] Generating train split: 143957 examples [00:02, 88800.06 examples/s] Generating train split: 157927 examples [00:02, 86949.66 examples/s] Generating train split: 174149 examples [00:02, 97566.29 examples/s] Generating train split: 192711 examples [00:02, 82021.51 examples/s] Generating +0: train split: 208979 examples [00:02, 92570.25 examples/s] Generating train split: 227766 examples [00:03, 65942.46 examples/s] Generating train split: 241828 examples [00:03, 73823.49 examples/s] Generating train split: 253075 examples [00:03, 75034.14 examples/s] Generating train split: 271665 examples [00:03, 88668.01 examples/s] Generating train split: 283314 examples [00:03, 90764.99 examples/s] Generating train split: 297311 examples [00:03, 92080.81 examples/s] Generating train split: 318122 examples [00:04, 97581.81 examples/s] Generating train split: 331996 examples [00:04, 89227.04 examples/s] Generating train split: 348297 examples [00:04, 99720.42 examples/s] Generating train split: 367237 examples [00:04, 87126.05 examples/s] Generating train split: 387742 examples [00:04, 94473.92 examples/s] Generating train split: 401884 examples [00:05, 89252.95 examples/s] Generating train split: 417957 examples [00:05, 99154.28 examples/s] Generating train split: 417957 examples [00:05, 80974.73 examples/s] +0: +0: [2025-11-23 14:41:12,795] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:88] [PID:1308402] [RANK:3] Loading dataset: /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking with base_type: chat_template and prompt_style: None +0: Tokenizing Prompts (num_proc=32): 0%| | 0/417957 [00:0016384) (num_proc=32): 0%| | 0/557277 [00:0016384) (num_proc=32): 0%| | 1000/557277 [00:00<04:50, 1915.99 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 2%|▏ | 13000/557277 [00:00<00:20, 26952.73 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 4%|▍ | 25000/557277 [00:01<00:18, 29215.00 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 6%|▌ | 34000/557277 [00:01<00:13, 39016.04 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 9%|▉ | 50000/557277 [00:01<00:08, 60896.10 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 11%|█ | 60000/557277 [00:01<00:07, 65934.72 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 14%|█▍ | 80000/557277 [00:01<00:05, 95001.62 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 17%|█▋ | 94000/557277 [00:01<00:04, 103389.65 examples/s] Drop +0: ping Long Sequences (>16384) (num_proc=32): 20%|█▉ | 111000/557277 [00:01<00:03, 119697.17 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 22%|██▏ | 125000/557277 [00:01<00:03, 114636.34 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 25%|██▍ | 138000/557277 [00:01<00:03, 111534.76 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 27%|██▋ | 151000/557277 [00:02<00:03, 116098.17 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 29%|██▉ | 164000/557277 [00:02<00:03, 115198.80 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 32%|███▏ | 177000/557277 [00:02<00:03, 118577.80 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 34%|███▍ | 190000/557277 [00:02<00:03, 115713.55 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 37%|███▋ | 206000/557277 [00:02<00:02, 125157.69 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 39%|█ +0: ██▉ | 219000/557277 [00:02<00:02, 125513.77 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 43%|████▎ | 238000/557277 [00:02<00:02, 143336.69 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 45%|████▌ | 253000/557277 [00:02<00:02, 141215.45 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 48%|████▊ | 268000/557277 [00:02<00:02, 133208.71 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 51%|█████ | 282000/557277 [00:03<00:02, 126252.48 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 53%|█████▎ | 295000/557277 [00:03<00:02, 125238.82 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 55%|█████▌ | 308000/557277 [00:03<00:02, 122967.64 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 58%|█████▊ | 322000/557277 [00:03<00:01, 123992.24 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 61%|██████ | +0: 339000/557277 [00:03<00:01, 130517.70 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 64%|██████▍ | 357000/557277 [00:03<00:01, 142108.44 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 67%|██████▋ | 372000/557277 [00:03<00:01, 139753.32 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 69%|██████▉ | 387000/557277 [00:03<00:01, 126869.42 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 72%|███████▏ | 400000/557277 [00:03<00:01, 127258.50 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 74%|███████▍ | 413000/557277 [00:04<00:01, 127123.77 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 77%|███████▋ | 429905/557277 [00:04<00:00, 138365.22 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 80%|███████▉ | 443960/557277 [00:04<00:01, 110398.76 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 82%|███ +0: █████▏ | 456960/557277 [00:05<00:02, 35259.66 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 84%|████████▎ | 465960/557277 [00:06<00:03, 22962.44 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 85%|████████▍ | 472960/557277 [00:07<00:04, 17186.52 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 86%|████████▌ | 477960/557277 [00:07<00:05, 14188.64 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 86%|████████▋ | 481960/557277 [00:07<00:05, 14648.28 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 87%|████████▋ | 485960/557277 [00:08<00:06, 11774.17 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 88%|████████▊ | 488960/557277 [00:08<00:05, 12046.82 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 88%|████████▊ | 491960/557277 [00:09<00:05, 10980.53 examples/s] Dropping Long Sequences (>163 +0: 84) (num_proc=32): 89%|████████▊ | 493960/557277 [00:09<00:05, 10659.56 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 89%|████████▉ | 495960/557277 [00:09<00:06, 9950.64 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 90%|████████▉ | 498960/557277 [00:09<00:05, 10584.62 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 90%|████████▉ | 500960/557277 [00:10<00:05, 10738.66 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 90%|█████████ | 502960/557277 [00:10<00:05, 9213.29 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 91%|█████████ | 504960/557277 [00:10<00:05, 9940.90 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 91%|█████████ | 506960/557277 [00:10<00:04, 10279.89 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 91%|█████████▏| 508960/557277 [00:10<00:04, 10593.40 examples +0: /s] Dropping Long Sequences (>16384) (num_proc=32): 92%|█████████▏| 510960/557277 [00:11<00:05, 8779.40 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 92%|█████████▏| 511960/557277 [00:11<00:05, 8385.73 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 92%|█████████▏| 513960/557277 [00:11<00:04, 10192.01 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 93%|█████████▎| 516960/557277 [00:11<00:03, 11641.89 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 93%|█████████▎| 518960/557277 [00:12<00:04, 8320.90 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 93%|█████████▎| 520960/557277 [00:12<00:04, 9027.68 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 94%|█████████▍| 523960/557277 [00:12<00:02, 11397.96 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 94%|█████████▍| +0: 525960/557277 [00:12<00:04, 7561.36 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 95%|█████████▍| 527960/557277 [00:13<00:03, 8513.27 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 95%|█████████▌| 531960/557277 [00:13<00:02, 11448.42 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 96%|█████████▌| 533960/557277 [00:13<00:02, 8166.66 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 96%|█████████▌| 535960/557277 [00:13<00:02, 8800.14 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 97%|█████████▋| 538960/557277 [00:14<00:01, 11308.67 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 97%|█████████▋| 540960/557277 [00:14<00:01, 11954.27 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 97%|█████████▋| 542960/557277 [00:14<00:01, 7554.51 examples/s] Dropping Long Sequences (>16384) (num_ +0: proc=32): 98%|█████████▊| 545960/557277 [00:14<00:01, 9808.17 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 98%|█████████▊| 548374/557277 [00:15<00:00, 10730.95 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 99%|█████████▊| 550204/557277 [00:15<00:00, 11161.94 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 99%|█████████▉| 552204/557277 [00:15<00:00, 7548.87 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 100%|█████████▉| 554619/557277 [00:15<00:00, 8395.22 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 100%|█████████▉| 556033/557277 [00:16<00:00, 9031.49 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 100%|██████████| 557277/557277 [00:16<00:00, 7709.79 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 100%|██████████| 557277/557277 [00:16<00:00, 33835.09 exampl +0: es/s] +0: Drop Samples with Zero Trainable Tokens (num_proc=32): 0%| | 0/554837 [00:00 +0: jzxh159:1308399:1308399 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +0: jzxh159:1308399:1308399 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +0: jzxh159:1308399:1308399 [0] NCCL INFO NET/Plugin: Using internal network plugin. +0: jzxh159:1308399:1308399 [0] NCCL INFO cudaDriverVersion 12080 +0: NCCL version 2.21.5+cuda12.4 +1: jzxh160:364774:364774 [3] NCCL INFO cudaDriverVersion 12080 +0: jzxh159:1308399:1308399 [0] NCCL INFO Comm config Blocking set to 1 +1: jzxh160:364774:364774 [3] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.125<0> +0: jzxh159:1308400:1308400 [1] NCCL INFO cudaDriverVersion 12080 +1: jzxh160:364774:364774 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +1: jzxh160:364774:364774 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +1: jzxh160:364774:364774 [3] NCCL INFO NET/Plugin: Using internal network plugin. +0: jzxh159:1308400:1308400 [1] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.121<0> +2: jzxh161:3283536:3283536 [0] NCCL INFO cudaDriverVersion 12080 +0: jzxh159:1308400:1308400 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +0: jzxh159:1308400:1308400 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +0: jzxh159:1308400:1308400 [1] NCCL INFO NET/Plugin: Using internal network plugin. +1: jzxh160:364772:364772 [1] NCCL INFO cudaDriverVersion 12080 +2: jzxh161:3283538:3283538 [2] NCCL INFO cudaDriverVersion 12080 +1: jzxh160:364772:364772 [1] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.125<0> +0: jzxh159:1308400:1308400 [1] NCCL INFO Comm config Blocking set to 1 +1: jzxh160:364772:364772 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +1: jzxh160:364772:364772 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +1: jzxh160:364772:364772 [1] NCCL INFO NET/Plugin: Using internal network plugin. +1: jzxh160:364773:364773 [2] NCCL INFO cudaDriverVersion 12080 +1: jzxh160:364774:364774 [3] NCCL INFO Comm config Blocking set to 1 +1: jzxh160:364773:364773 [2] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.125<0> +1: jzxh160:364773:364773 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +1: jzxh160:364773:364773 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +1: jzxh160:364773:364773 [2] NCCL INFO NET/Plugin: Using internal network plugin. +1: jzxh160:364771:364771 [0] NCCL INFO cudaDriverVersion 12080 +1: jzxh160:364772:364772 [1] NCCL INFO Comm config Blocking set to 1 +1: jzxh160:364771:364771 [0] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.125<0> +2: jzxh161:3283539:3283539 [3] NCCL INFO cudaDriverVersion 12080 +1: jzxh160:364771:364771 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +1: jzxh160:364771:364771 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +1: jzxh160:364771:364771 [0] NCCL INFO NET/Plugin: Using internal network plugin. +1: jzxh160:364773:364773 [2] NCCL INFO Comm config Blocking set to 1 +2: jzxh161:3283537:3283537 [1] NCCL INFO cudaDriverVersion 12080 +1: jzxh160:364771:364771 [0] NCCL INFO Comm config Blocking set to 1 +0: jzxh159:1308402:1308402 [3] NCCL INFO cudaDriverVersion 12080 +0: jzxh159:1308401:1308401 [2] NCCL INFO cudaDriverVersion 12080 +2: jzxh161:3283536:3283536 [0] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.129<0> +2: jzxh161:3283538:3283538 [2] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.129<0> +2: jzxh161:3283539:3283539 [3] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.129<0> +2: jzxh161:3283537:3283537 [1] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.129<0> +0: jzxh159:1308402:1308402 [3] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.121<0> +0: jzxh159:1308401:1308401 [2] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.121<0> +0: jzxh159:1308402:1308402 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +0: jzxh159:1308402:1308402 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +0: jzxh159:1308402:1308402 [3] NCCL INFO NET/Plugin: Using internal network plugin. +0: jzxh159:1308401:1308401 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +0: jzxh159:1308401:1308401 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +0: jzxh159:1308401:1308401 [2] NCCL INFO NET/Plugin: Using internal network plugin. +0: jzxh159:1308401:1308401 [2] NCCL INFO Comm config Blocking set to 1 +0: jzxh159:1308402:1308402 [3] NCCL INFO Comm config Blocking set to 1 +2: jzxh161:3283539:3283539 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +2: jzxh161:3283538:3283538 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +2: jzxh161:3283539:3283539 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +2: jzxh161:3283539:3283539 [3] NCCL INFO NET/Plugin: Using internal network plugin. +2: jzxh161:3283538:3283538 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +2: jzxh161:3283537:3283537 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +2: jzxh161:3283538:3283538 [2] NCCL INFO NET/Plugin: Using internal network plugin. +2: jzxh161:3283536:3283536 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +2: jzxh161:3283537:3283537 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +2: jzxh161:3283537:3283537 [1] NCCL INFO NET/Plugin: Using internal network plugin. +2: jzxh161:3283536:3283536 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +2: jzxh161:3283536:3283536 [0] NCCL INFO NET/Plugin: Using internal network plugin. +2: jzxh161:3283538:3283538 [2] NCCL INFO Comm config Blocking set to 1 +2: jzxh161:3283537:3283537 [1] NCCL INFO Comm config Blocking set to 1 +2: jzxh161:3283539:3283539 [3] NCCL INFO Comm config Blocking set to 1 +2: jzxh161:3283536:3283536 [0] NCCL INFO Comm config Blocking set to 1 +3: jzxh162:3160742:3160742 [0] NCCL INFO cudaDriverVersion 12080 +3: jzxh162:3160742:3160742 [0] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.133<0> +3: jzxh162:3160743:3160743 [1] NCCL INFO cudaDriverVersion 12080 +3: jzxh162:3160743:3160743 [1] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.133<0> +3: jzxh162:3160743:3160743 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +3: jzxh162:3160742:3160742 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +3: jzxh162:3160743:3160743 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +3: jzxh162:3160743:3160743 [1] NCCL INFO NET/Plugin: Using internal network plugin. +3: jzxh162:3160742:3160742 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +3: jzxh162:3160742:3160742 [0] NCCL INFO NET/Plugin: Using internal network plugin. +3: jzxh162:3160745:3160745 [3] NCCL INFO cudaDriverVersion 12080 +3: jzxh162:3160742:3160742 [0] NCCL INFO Comm config Blocking set to 1 +3: jzxh162:3160743:3160743 [1] NCCL INFO Comm config Blocking set to 1 +3: jzxh162:3160745:3160745 [3] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.133<0> +3: jzxh162:3160745:3160745 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +3: jzxh162:3160745:3160745 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +3: jzxh162:3160745:3160745 [3] NCCL INFO NET/Plugin: Using internal network plugin. +3: jzxh162:3160744:3160744 [2] NCCL INFO cudaDriverVersion 12080 +3: jzxh162:3160745:3160745 [3] NCCL INFO Comm config Blocking set to 1 +3: jzxh162:3160744:3160744 [2] NCCL INFO Bootstrap : Using ibp24s0:10.100.6.133<0> +3: jzxh162:3160744:3160744 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +3: jzxh162:3160744:3160744 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +3: jzxh162:3160744:3160744 [2] NCCL INFO NET/Plugin: Using internal network plugin. +3: jzxh162:3160744:3160744 [2] NCCL INFO Comm config Blocking set to 1 +2: jzxh161:3283536:3284336 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.129<0> +2: jzxh161:3283538:3284335 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.129<0> +2: jzxh161:3283539:3284338 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.129<0> +2: jzxh161:3283536:3284336 [0] NCCL INFO Using non-device net plugin version 0 +2: jzxh161:3283536:3284336 [0] NCCL INFO Using network IB +2: jzxh161:3283537:3284337 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.129<0> +2: jzxh161:3283538:3284335 [2] NCCL INFO Using non-device net plugin version 0 +2: jzxh161:3283538:3284335 [2] NCCL INFO Using network IB +2: jzxh161:3283539:3284338 [3] NCCL INFO Using non-device net plugin version 0 +2: jzxh161:3283539:3284338 [3] NCCL INFO Using network IB +2: jzxh161:3283537:3284337 [1] NCCL INFO Using non-device net plugin version 0 +2: jzxh161:3283537:3284337 [1] NCCL INFO Using network IB +0: jzxh159:1308401:1310257 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.121<0> +0: jzxh159:1308400:1310256 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.121<0> +0: jzxh159:1308399:1310255 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.121<0> +0: jzxh159:1308401:1310257 [2] NCCL INFO Using non-device net plugin version 0 +0: jzxh159:1308401:1310257 [2] NCCL INFO Using network IB +0: jzxh159:1308402:1310258 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.121<0> +0: jzxh159:1308400:1310256 [1] NCCL INFO Using non-device net plugin version 0 +0: jzxh159:1308400:1310256 [1] NCCL INFO Using network IB +0: jzxh159:1308399:1310255 [0] NCCL INFO Using non-device net plugin version 0 +0: jzxh159:1308399:1310255 [0] NCCL INFO Using network IB +0: jzxh159:1308402:1310258 [3] NCCL INFO Using non-device net plugin version 0 +0: jzxh159:1308402:1310258 [3] NCCL INFO Using network IB +1: jzxh160:364774:365551 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.125<0> +1: jzxh160:364772:365552 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.125<0> +1: jzxh160:364771:365554 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.125<0> +1: jzxh160:364774:365551 [3] NCCL INFO Using non-device net plugin version 0 +1: jzxh160:364774:365551 [3] NCCL INFO Using network IB +1: jzxh160:364772:365552 [1] NCCL INFO Using non-device net plugin version 0 +1: jzxh160:364772:365552 [1] NCCL INFO Using network IB +1: jzxh160:364773:365553 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.125<0> +1: jzxh160:364771:365554 [0] NCCL INFO Using non-device net plugin version 0 +1: jzxh160:364771:365554 [0] NCCL INFO Using network IB +1: jzxh160:364773:365553 [2] NCCL INFO Using non-device net plugin version 0 +1: jzxh160:364773:365553 [2] NCCL INFO Using network IB +3: jzxh162:3160744:3161553 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.133<0> +3: jzxh162:3160745:3161552 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.133<0> +3: jzxh162:3160743:3161551 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.133<0> +3: jzxh162:3160744:3161553 [2] NCCL INFO Using non-device net plugin version 0 +3: jzxh162:3160744:3161553 [2] NCCL INFO Using network IB +3: jzxh162:3160742:3161550 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.6.133<0> +3: jzxh162:3160745:3161552 [3] NCCL INFO Using non-device net plugin version 0 +3: jzxh162:3160745:3161552 [3] NCCL INFO Using network IB +3: jzxh162:3160743:3161551 [1] NCCL INFO Using non-device net plugin version 0 +3: jzxh162:3160743:3161551 [1] NCCL INFO Using network IB +3: jzxh162:3160742:3161550 [0] NCCL INFO Using non-device net plugin version 0 +3: jzxh162:3160742:3161550 [0] NCCL INFO Using network IB +0: jzxh159:1308399:1310255 [0] NCCL INFO ncclCommInitRank comm 0x5650a3b6e180 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x524c94fe7e5cd79a - Init START +0: jzxh159:1308400:1310256 [1] NCCL INFO ncclCommInitRank comm 0x5576720d8870 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x524c94fe7e5cd79a - Init START +0: jzxh159:1308402:1310258 [3] NCCL INFO ncclCommInitRank comm 0x55cdfc201020 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x524c94fe7e5cd79a - Init START +0: jzxh159:1308401:1310257 [2] NCCL INFO ncclCommInitRank comm 0x55cb335512a0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x524c94fe7e5cd79a - Init START +2: jzxh161:3283539:3284338 [3] NCCL INFO ncclCommInitRank comm 0x557efcb6a4d0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x524c94fe7e5cd79a - Init START +2: jzxh161:3283537:3284337 [1] NCCL INFO ncclCommInitRank comm 0x55f84f114b10 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x524c94fe7e5cd79a - Init START +3: jzxh162:3160743:3161551 [1] NCCL INFO ncclCommInitRank comm 0x55dd2bde7570 rank 13 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x524c94fe7e5cd79a - Init START +3: jzxh162:3160745:3161552 [3] NCCL INFO ncclCommInitRank comm 0x55c152c5c3b0 rank 15 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x524c94fe7e5cd79a - Init START +3: jzxh162:3160744:3161553 [2] NCCL INFO ncclCommInitRank comm 0x56427a552c70 rank 14 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x524c94fe7e5cd79a - Init START +3: jzxh162:3160742:3161550 [0] NCCL INFO ncclCommInitRank comm 0x5649a9566180 rank 12 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x524c94fe7e5cd79a - Init START +1: jzxh160:364773:365553 [2] NCCL INFO ncclCommInitRank comm 0x56520ef52d20 rank 6 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x524c94fe7e5cd79a - Init START +1: jzxh160:364771:365554 [0] NCCL INFO ncclCommInitRank comm 0x55a894a256f0 rank 4 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x524c94fe7e5cd79a - Init START +1: jzxh160:364774:365551 [3] NCCL INFO ncclCommInitRank comm 0x557fffcd2ae0 rank 7 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x524c94fe7e5cd79a - Init START +1: jzxh160:364772:365552 [1] NCCL INFO ncclCommInitRank comm 0x556d3b785610 rank 5 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x524c94fe7e5cd79a - Init START +2: jzxh161:3283538:3284335 [2] NCCL INFO ncclCommInitRank comm 0x5643a3152d00 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x524c94fe7e5cd79a - Init START +2: jzxh161:3283536:3284336 [0] NCCL INFO ncclCommInitRank comm 0x55c8d4c4af80 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x524c94fe7e5cd79a - Init START +2: jzxh161:3283536:3284336 [0] NCCL INFO Setting affinity for GPU 0 to ffffff,00000000,00000000,00ffffff +2: jzxh161:3283536:3284336 [0] NCCL INFO NVLS multicast support is not available on dev 0 +0: jzxh159:1308401:1310257 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffff0000,00000000,000000ff,ffff0000,00000000 +0: jzxh159:1308401:1310257 [2] NCCL INFO NVLS multicast support is not available on dev 2 +0: jzxh159:1308402:1310258 [3] NCCL INFO Setting affinity for GPU 3 to ffffff00,00000000,00000000,ffffff00,00000000,00000000 +0: jzxh159:1308402:1310258 [3] NCCL INFO NVLS multicast support is not available on dev 3 +0: jzxh159:1308400:1310256 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ff000000,00000000,0000ffff,ff000000 +0: jzxh159:1308400:1310256 [1] NCCL INFO NVLS multicast support is not available on dev 1 +0: jzxh159:1308399:1310255 [0] NCCL INFO Setting affinity for GPU 0 to ffffff,00000000,00000000,00ffffff +0: jzxh159:1308399:1310255 [0] NCCL INFO NVLS multicast support is not available on dev 0 +2: jzxh161:3283538:3284335 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffff0000,00000000,000000ff,ffff0000,00000000 +2: jzxh161:3283538:3284335 [2] NCCL INFO NVLS multicast support is not available on dev 2 +2: jzxh161:3283539:3284338 [3] NCCL INFO Setting affinity for GPU 3 to ffffff00,00000000,00000000,ffffff00,00000000,00000000 +2: jzxh161:3283537:3284337 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ff000000,00000000,0000ffff,ff000000 +2: jzxh161:3283537:3284337 [1] NCCL INFO NVLS multicast support is not available on dev 1 +2: jzxh161:3283539:3284338 [3] NCCL INFO NVLS multicast support is not available on dev 3 +3: jzxh162:3160742:3161550 [0] NCCL INFO Setting affinity for GPU 0 to ffffff,00000000,00000000,00ffffff +3: jzxh162:3160742:3161550 [0] NCCL INFO NVLS multicast support is not available on dev 0 +3: jzxh162:3160743:3161551 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ff000000,00000000,0000ffff,ff000000 +3: jzxh162:3160743:3161551 [1] NCCL INFO NVLS multicast support is not available on dev 1 +3: jzxh162:3160744:3161553 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffff0000,00000000,000000ff,ffff0000,00000000 +1: jzxh160:364771:365554 [0] NCCL INFO Setting affinity for GPU 0 to ffffff,00000000,00000000,00ffffff +1: jzxh160:364771:365554 [0] NCCL INFO NVLS multicast support is not available on dev 0 +3: jzxh162:3160744:3161553 [2] NCCL INFO NVLS multicast support is not available on dev 2 +3: jzxh162:3160745:3161552 [3] NCCL INFO Setting affinity for GPU 3 to ffffff00,00000000,00000000,ffffff00,00000000,00000000 +3: jzxh162:3160745:3161552 [3] NCCL INFO NVLS multicast support is not available on dev 3 +1: jzxh160:364772:365552 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ff000000,00000000,0000ffff,ff000000 +1: jzxh160:364772:365552 [1] NCCL INFO NVLS multicast support is not available on dev 1 +1: jzxh160:364773:365553 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffff0000,00000000,000000ff,ffff0000,00000000 +1: jzxh160:364773:365553 [2] NCCL INFO NVLS multicast support is not available on dev 2 +1: jzxh160:364774:365551 [3] NCCL INFO Setting affinity for GPU 3 to ffffff00,00000000,00000000,ffffff00,00000000,00000000 +1: jzxh160:364774:365551 [3] NCCL INFO NVLS multicast support is not available on dev 3 +0: jzxh159:1308399:1310255 [0] NCCL INFO comm 0x5650a3b6e180 rank 0 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +0: jzxh159:1308400:1310256 [1] NCCL INFO comm 0x5576720d8870 rank 1 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +0: jzxh159:1308402:1310258 [3] NCCL INFO comm 0x55cdfc201020 rank 3 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 00/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh159:1308401:1310257 [2] NCCL INFO comm 0x55cb335512a0 rank 2 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 01/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 02/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 03/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 04/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 05/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 06/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh159:1308400:1310256 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/9/-1->1->-1 [2] 3/-1/-1->1->0 [3] 0/-1/-1->1->3 [4] -1/-1/-1->1->2 [5] 3/9/-1->1->-1 [6] -1/-1/-1->1->3 [7] 0/-1/-1->1->2 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->5 [10] 3/-1/-1->1->0 [11] 0/-1/-1->1->3 [12] -1/-1/-1->1->2 [13] 3/-1/-1->1->5 [14] -1/-1/-1->1->3 [15] 0/-1/-1->1->2 +0: jzxh159:1308402:1310258 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] 0/-1/-1->3->2 [2] -1/-1/-1->3->1 [3] 1/11/-1->3->-1 [4] 2/-1/-1->3->0 [5] 0/-1/-1->3->1 [6] 1/-1/-1->3->0 [7] 2/11/-1->3->-1 [8] -1/-1/-1->3->2 [9] 0/-1/-1->3->2 [10] -1/-1/-1->3->1 [11] 1/-1/-1->3->7 [12] 2/-1/-1->3->0 [13] 0/-1/-1->3->1 [14] 1/-1/-1->3->0 [15] 2/-1/-1->3->7 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 07/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh159:1308400:1310256 [1] NCCL INFO P2P Chunksize set to 131072 +0: jzxh159:1308402:1310258 [3] NCCL INFO P2P Chunksize set to 131072 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 08/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh159:1308401:1310257 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 0/10/-1->2->-1 [3] -1/-1/-1->2->0 [4] 1/-1/-1->2->3 [5] -1/-1/-1->2->0 [6] 0/10/-1->2->-1 [7] 1/-1/-1->2->3 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 0/-1/-1->2->6 [11] -1/-1/-1->2->0 [12] 1/-1/-1->2->3 [13] -1/-1/-1->2->0 [14] 0/-1/-1->2->6 [15] 1/-1/-1->2->3 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 09/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh159:1308401:1310257 [2] NCCL INFO P2P Chunksize set to 131072 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 10/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 11/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 12/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 13/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 14/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +1: jzxh160:364774:365551 [3] NCCL INFO comm 0x557fffcd2ae0 rank 7 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +1: jzxh160:364773:365553 [2] NCCL INFO comm 0x56520ef52d20 rank 6 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +3: jzxh162:3160744:3161553 [2] NCCL INFO comm 0x56427a552c70 rank 14 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +3: jzxh162:3160745:3161552 [3] NCCL INFO comm 0x55c152c5c3b0 rank 15 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +3: jzxh162:3160743:3161551 [1] NCCL INFO comm 0x55dd2bde7570 rank 13 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +3: jzxh162:3160742:3161550 [0] NCCL INFO comm 0x5649a9566180 rank 12 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 15/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh159:1308399:1310255 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->3 [2] 1/-1/-1->0->2 [3] 2/-1/-1->0->1 [4] 3/8/-1->0->-1 [5] 2/-1/-1->0->3 [6] 3/-1/-1->0->2 [7] -1/-1/-1->0->1 [8] 1/-1/-1->0->4 [9] -1/-1/-1->0->3 [10] 1/-1/-1->0->2 [11] 2/-1/-1->0->1 [12] 3/-1/-1->0->4 [13] 2/-1/-1->0->3 [14] 3/-1/-1->0->2 [15] -1/-1/-1->0->1 +0: jzxh159:1308399:1310255 [0] NCCL INFO P2P Chunksize set to 131072 +1: jzxh160:364774:365551 [3] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 4/-1/-1->7->6 [2] -1/-1/-1->7->5 [3] 5/-1/-1->7->11 [4] 6/-1/-1->7->4 [5] 4/-1/-1->7->5 [6] 5/-1/-1->7->4 [7] 6/-1/-1->7->11 [8] -1/-1/-1->7->6 [9] 4/-1/-1->7->6 [10] -1/-1/-1->7->5 [11] 5/11/3->7->15 [12] 6/-1/-1->7->4 [13] 4/-1/-1->7->5 [14] 5/-1/-1->7->4 [15] 6/11/3->7->15 +1: jzxh160:364774:365551 [3] NCCL INFO P2P Chunksize set to 131072 +3: jzxh162:3160744:3161553 [2] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 12/-1/-1->14->10 [3] -1/-1/-1->14->12 [4] 13/-1/-1->14->15 [5] -1/-1/-1->14->12 [6] 12/-1/-1->14->10 [7] 13/-1/-1->14->15 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 12/6/-1->14->-1 [11] -1/-1/-1->14->12 [12] 13/-1/-1->14->15 [13] -1/-1/-1->14->12 [14] 12/6/-1->14->-1 [15] 13/-1/-1->14->15 +3: jzxh162:3160744:3161553 [2] NCCL INFO P2P Chunksize set to 131072 +3: jzxh162:3160743:3161551 [1] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->9 [2] 15/-1/-1->13->12 [3] 12/-1/-1->13->15 [4] -1/-1/-1->13->14 [5] 15/-1/-1->13->9 [6] -1/-1/-1->13->15 [7] 12/-1/-1->13->14 [8] 14/-1/-1->13->12 [9] 14/5/-1->13->-1 [10] 15/-1/-1->13->12 [11] 12/-1/-1->13->15 [12] -1/-1/-1->13->14 [13] 15/5/-1->13->-1 [14] -1/-1/-1->13->15 [15] 12/-1/-1->13->14 +3: jzxh162:3160743:3161551 [1] NCCL INFO P2P Chunksize set to 131072 +1: jzxh160:364773:365553 [2] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 4/-1/-1->6->10 [3] -1/-1/-1->6->4 [4] 5/-1/-1->6->7 [5] -1/-1/-1->6->4 [6] 4/-1/-1->6->10 [7] 5/-1/-1->6->7 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 4/10/2->6->14 [11] -1/-1/-1->6->4 [12] 5/-1/-1->6->7 [13] -1/-1/-1->6->4 [14] 4/10/2->6->14 [15] 5/-1/-1->6->7 +1: jzxh160:364773:365553 [2] NCCL INFO P2P Chunksize set to 131072 +1: jzxh160:364772:365552 [1] NCCL INFO comm 0x556d3b785610 rank 5 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +1: jzxh160:364771:365554 [0] NCCL INFO comm 0x55a894a256f0 rank 4 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +1: jzxh160:364772:365552 [1] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->9 [2] 7/-1/-1->5->4 [3] 4/-1/-1->5->7 [4] -1/-1/-1->5->6 [5] 7/-1/-1->5->9 [6] -1/-1/-1->5->7 [7] 4/-1/-1->5->6 [8] 6/-1/-1->5->4 [9] 6/9/1->5->13 [10] 7/-1/-1->5->4 [11] 4/-1/-1->5->7 [12] -1/-1/-1->5->6 [13] 7/9/1->5->13 [14] -1/-1/-1->5->7 [15] 4/-1/-1->5->6 +1: jzxh160:364772:365552 [1] NCCL INFO P2P Chunksize set to 131072 +1: jzxh160:364771:365554 [0] NCCL INFO Trees [0] 5/-1/-1->4->8 [1] -1/-1/-1->4->7 [2] 5/-1/-1->4->6 [3] 6/-1/-1->4->5 [4] 7/-1/-1->4->8 [5] 6/-1/-1->4->7 [6] 7/-1/-1->4->6 [7] -1/-1/-1->4->5 [8] 5/8/0->4->12 [9] -1/-1/-1->4->7 [10] 5/-1/-1->4->6 [11] 6/-1/-1->4->5 [12] 7/8/0->4->12 [13] 6/-1/-1->4->7 [14] 7/-1/-1->4->6 [15] -1/-1/-1->4->5 +2: jzxh161:3283539:3284338 [3] NCCL INFO comm 0x557efcb6a4d0 rank 11 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +2: jzxh161:3283538:3284335 [2] NCCL INFO comm 0x5643a3152d00 rank 10 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +2: jzxh161:3283539:3284338 [3] NCCL INFO Trees [0] -1/-1/-1->11->10 [1] 8/-1/-1->11->10 [2] -1/-1/-1->11->9 [3] 9/7/15->11->3 [4] 10/-1/-1->11->8 [5] 8/-1/-1->11->9 [6] 9/-1/-1->11->8 [7] 10/7/15->11->3 [8] -1/-1/-1->11->10 [9] 8/-1/-1->11->10 [10] -1/-1/-1->11->9 [11] 9/-1/-1->11->7 [12] 10/-1/-1->11->8 [13] 8/-1/-1->11->9 [14] 9/-1/-1->11->8 [15] 10/-1/-1->11->7 +2: jzxh161:3283539:3284338 [3] NCCL INFO P2P Chunksize set to 131072 +3: jzxh162:3160742:3161550 [0] NCCL INFO Trees [0] 13/-1/-1->12->8 [1] -1/-1/-1->12->15 [2] 13/-1/-1->12->14 [3] 14/-1/-1->12->13 [4] 15/-1/-1->12->8 [5] 14/-1/-1->12->15 [6] 15/-1/-1->12->14 [7] -1/-1/-1->12->13 [8] 13/4/-1->12->-1 [9] -1/-1/-1->12->15 [10] 13/-1/-1->12->14 [11] 14/-1/-1->12->13 [12] 15/4/-1->12->-1 [13] 14/-1/-1->12->15 [14] 15/-1/-1->12->14 [15] -1/-1/-1->12->13 +3: jzxh162:3160745:3161552 [3] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 12/-1/-1->15->14 [2] -1/-1/-1->15->13 [3] 13/-1/-1->15->11 [4] 14/-1/-1->15->12 [5] 12/-1/-1->15->13 [6] 13/-1/-1->15->12 [7] 14/-1/-1->15->11 [8] -1/-1/-1->15->14 [9] 12/-1/-1->15->14 [10] -1/-1/-1->15->13 [11] 13/7/-1->15->-1 [12] 14/-1/-1->15->12 [13] 12/-1/-1->15->13 [14] 13/-1/-1->15->12 [15] 14/7/-1->15->-1 +3: jzxh162:3160742:3161550 [0] NCCL INFO P2P Chunksize set to 131072 +3: jzxh162:3160745:3161552 [3] NCCL INFO P2P Chunksize set to 131072 +1: jzxh160:364771:365554 [0] NCCL INFO P2P Chunksize set to 131072 +2: jzxh161:3283538:3284335 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 8/6/14->10->2 [3] -1/-1/-1->10->8 [4] 9/-1/-1->10->11 [5] -1/-1/-1->10->8 [6] 8/6/14->10->2 [7] 9/-1/-1->10->11 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 8/-1/-1->10->6 [11] -1/-1/-1->10->8 [12] 9/-1/-1->10->11 [13] -1/-1/-1->10->8 [14] 8/-1/-1->10->6 [15] 9/-1/-1->10->11 +2: jzxh161:3283538:3284335 [2] NCCL INFO P2P Chunksize set to 131072 +2: jzxh161:3283537:3284337 [1] NCCL INFO comm 0x55f84f114b10 rank 9 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +2: jzxh161:3283536:3284336 [0] NCCL INFO comm 0x55c8d4c4af80 rank 8 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +2: jzxh161:3283537:3284337 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/5/13->9->1 [2] 11/-1/-1->9->8 [3] 8/-1/-1->9->11 [4] -1/-1/-1->9->10 [5] 11/5/13->9->1 [6] -1/-1/-1->9->11 [7] 8/-1/-1->9->10 [8] 10/-1/-1->9->8 [9] 10/-1/-1->9->5 [10] 11/-1/-1->9->8 [11] 8/-1/-1->9->11 [12] -1/-1/-1->9->10 [13] 11/-1/-1->9->5 [14] -1/-1/-1->9->11 [15] 8/-1/-1->9->10 +2: jzxh161:3283537:3284337 [1] NCCL INFO P2P Chunksize set to 131072 +2: jzxh161:3283536:3284336 [0] NCCL INFO Trees [0] 9/4/12->8->0 [1] -1/-1/-1->8->11 [2] 9/-1/-1->8->10 [3] 10/-1/-1->8->9 [4] 11/4/12->8->0 [5] 10/-1/-1->8->11 [6] 11/-1/-1->8->10 [7] -1/-1/-1->8->9 [8] 9/-1/-1->8->4 [9] -1/-1/-1->8->11 [10] 9/-1/-1->8->10 [11] 10/-1/-1->8->9 [12] 11/-1/-1->8->4 [13] 10/-1/-1->8->11 [14] 11/-1/-1->8->10 [15] -1/-1/-1->8->9 +2: jzxh161:3283536:3284336 [0] NCCL INFO P2P Chunksize set to 131072 +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 00/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 00/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 00/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 03/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 04/0 : 14[2] -> 15[3] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 00/0 : 6[2] -> 7[3] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 04/0 : 13[1] -> 14[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 03/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 08/0 : 14[2] -> 15[3] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 04/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 07/0 : 13[1] -> 14[2] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 12/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 04/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 08/0 : 13[1] -> 14[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 07/0 : 5[1] -> 6[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 11/0 : 13[1] -> 14[2] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 08/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 08/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 12/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 12/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 15/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 11/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 11/0 : 9[1] -> 10[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 12/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 15/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 00/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 00/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 04/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 04/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 08/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 00/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 04/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 08/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 12/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 12/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 00/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 00/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 04/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 08/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 12/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 03/0 : 4[0] -> 5[1] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 04/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 08/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 12/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 07/0 : 4[0] -> 5[1] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 08/0 : 4[0] -> 5[1] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 00/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 11/0 : 4[0] -> 5[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 00/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 04/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 04/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 08/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 12/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 08/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 12/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 12/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 00/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 15/0 : 4[0] -> 5[1] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 08/0 : 8[0] -> 9[1] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 11/0 : 8[0] -> 9[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 01/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 03/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 02/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 04/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 05/0 : 4[0] -> 7[3] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 15/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 07/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 06/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 01/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 08/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 09/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 02/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 11/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 10/0 : 4[0] -> 7[3] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 01/0 : 8[0] -> 11[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 05/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 13/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 12/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 06/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 02/0 : 8[0] -> 11[3] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 14/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 09/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 15/0 : 12[0] -> 13[1] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 05/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 06/0 : 8[0] -> 11[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 10/0 : 0[0] -> 3[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 13/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 09/0 : 8[0] -> 11[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 14/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 10/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 13/0 : 8[0] -> 11[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 01/0 : 12[0] -> 15[3] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 14/0 : 8[0] -> 11[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 02/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 05/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 06/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 09/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 10/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 13/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 14/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 02/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 02/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 06/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 06/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 10/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 10/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 14/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 02/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 02/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 06/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 06/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 10/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 10/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 14/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 14/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 14/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 02/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 02/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 06/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 06/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 10/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 10/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 14/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 14/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 02/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 02/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 06/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 06/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 10/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 10/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 14/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 14/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 01/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 01/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 05/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 05/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 09/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 09/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 13/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 13/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 01/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 01/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 01/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 01/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 05/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 05/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 09/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 05/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 09/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 05/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 09/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 09/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 13/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 13/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 13/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 13/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 01/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 01/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 01/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 05/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 05/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 09/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 09/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 13/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 13/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 01/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 02/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 09/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 10/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 05/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 01/0 : 13[1] -> 12[0] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 06/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 02/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 14/0 : 9[1] -> 8[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 09/0 : 5[1] -> 4[0] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 05/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 10/0 : 5[1] -> 4[0] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 06/0 : 13[1] -> 12[0] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 13/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 14/0 : 5[1] -> 4[0] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 09/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 10/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 13/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 14/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 03/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 03/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 07/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 07/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 11/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 11/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 15/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 15/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 03/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 03/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 03/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 03/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 07/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 07/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 07/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 07/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 11/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 11/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 11/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 11/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 15/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 15/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 15/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 15/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 03/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 07/0 : 7[3] -> 4[0] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 03/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 11/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 15/0 : 7[3] -> 4[0] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 03/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 07/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 07/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 11/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 11/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 15/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 15/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 02/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 01/0 : 7[3] -> 6[2] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 03/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 03/0 : 15[3] -> 12[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 06/0 : 14[2] -> 13[1] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 05/0 : 7[3] -> 6[2] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 07/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 07/0 : 15[3] -> 12[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 10/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 09/0 : 7[3] -> 6[2] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 11/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 14/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 15/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 11/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 13/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 15/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 02/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 01/0 : 15[3] -> 14[2] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 10/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 05/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 02/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 06/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 09/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 14/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 13/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 10/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 14/0 : 6[2] -> 5[1] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Connected all rings +0: jzxh159:1308399:1310255 [0] NCCL INFO Connected all rings +0: jzxh159:1308402:1310258 [3] NCCL INFO Connected all rings +0: jzxh159:1308400:1310256 [1] NCCL INFO Connected all rings +2: jzxh161:3283536:3284336 [0] NCCL INFO Connected all rings +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Connected all rings +1: jzxh160:364774:365551 [3] NCCL INFO Connected all rings +1: jzxh160:364771:365554 [0] NCCL INFO Connected all rings +1: jzxh160:364772:365552 [1] NCCL INFO Connected all rings +3: jzxh162:3160744:3161553 [2] NCCL INFO Connected all rings +3: jzxh162:3160745:3161552 [3] NCCL INFO Connected all rings +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 02/0 : 4[0] -> 5[1] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Connected all rings +3: jzxh162:3160743:3161551 [1] NCCL INFO Connected all rings +1: jzxh160:364771:365554 [0] NCCL INFO Channel 10/0 : 4[0] -> 5[1] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 02/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Connected all rings +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Connected all rings +2: jzxh161:3283538:3284335 [2] NCCL INFO Connected all rings +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 01/0 : 5[1] -> 6[2] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 10/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 01/0 : 6[2] -> 7[3] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 01/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 09/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 01/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 07/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 07/0 : 6[2] -> 7[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 02/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 02/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 09/0 : 6[2] -> 7[3] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 02/0 : 4[0] -> 6[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 01/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 03/0 : 0[0] -> 2[2] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 15/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 09/0 : 9[1] -> 10[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 03/0 : 1[1] -> 3[3] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 09/0 : 14[2] -> 15[3] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 09/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 05/0 : 0[0] -> 2[2] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 03/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 02/0 : 8[0] -> 10[2] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 09/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 15/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 05/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 05/0 : 4[0] -> 6[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 02/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 15/0 : 10[2] -> 11[3] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 06/0 : 1[1] -> 3[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 06/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 03/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 03/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 10/0 : 0[0] -> 2[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 10/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 06/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 05/0 : 8[0] -> 10[2] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 11/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 02/0 : 9[1] -> 11[3] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 11/0 : 1[1] -> 3[3] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 06/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 05/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 13/0 : 0[0] -> 2[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 13/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 10/0 : 4[0] -> 6[2] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 14/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 11/0 : 4[0] -> 6[2] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 02/0 : 12[0] -> 14[2] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 14/0 : 0[0] -> 2[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 02/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 06/0 : 5[1] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 03/0 : 12[0] -> 14[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 10/0 : 5[1] -> 7[3] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 13/0 : 4[0] -> 6[2] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 14/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 10/0 : 8[0] -> 10[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 03/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 11/0 : 5[1] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 05/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 11/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 13/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 03/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 06/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 13/0 : 8[0] -> 10[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 05/0 : 13[1] -> 15[3] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 05/0 : 9[1] -> 11[3] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 14/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 14/0 : 8[0] -> 10[2] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 04/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 06/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 06/0 : 13[1] -> 15[3] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 10/0 : 2[2] -> 6[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 14/0 : 2[2] -> 6[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 09/0 : 1[1] -> 5[1] [send] via NET/IB/1/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 12/0 : 0[0] -> 3[3] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 13/0 : 1[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 10/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 10/0 : 12[0] -> 14[2] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 04/0 : 4[0] -> 7[3] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 10/0 : 2[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 11/0 : 9[1] -> 11[3] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 09/0 : 1[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 14/0 : 2[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 08/0 : 0[0] -> 4[0] [send] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 11/0 : 3[3] -> 7[3] [send] via NET/IB/3/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 12/0 : 0[0] -> 4[0] [send] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 15/0 : 3[3] -> 7[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 12/0 : 4[0] -> 7[3] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 13/0 : 1[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 13/0 : 9[1] -> 11[3] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 02/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 01/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 06/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 05/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 10/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 09/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 14/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 14/0 : 9[1] -> 11[3] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 04/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 01/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 13/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 08/0 : 0[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 11/0 : 3[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 12/0 : 0[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 15/0 : 3[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 03/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 00/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 07/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 04/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 02/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 05/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 06/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 12/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 09/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 10/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 11/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 08/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 15/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 12/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 14/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 13/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 02/0 : 10[2] -> 14[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 01/0 : 9[1] -> 13[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 06/0 : 10[2] -> 14[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 05/0 : 9[1] -> 13[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 10/0 : 14[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 14/0 : 14[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 09/0 : 13[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 10/0 : 6[2] -> 14[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 03/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 00/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 07/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 04/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 11/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 08/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 15/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 12/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 03/0 : 11[3] -> 15[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 00/0 : 8[0] -> 12[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 07/0 : 11[3] -> 15[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 13/0 : 13[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 14/0 : 6[2] -> 14[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 09/0 : 5[1] -> 13[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 13/0 : 5[1] -> 13[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 11/0 : 15[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 04/0 : 8[0] -> 12[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 15/0 : 15[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 08/0 : 12[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 11/0 : 7[3] -> 15[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 12/0 : 12[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 15/0 : 7[3] -> 15[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 08/0 : 4[0] -> 12[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 12/0 : 4[0] -> 12[0] [send] via NET/IB/0/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 11/0 : 12[0] -> 14[2] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 10/0 : 13[1] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 13/0 : 12[0] -> 14[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 11/0 : 13[1] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 14/0 : 12[0] -> 14[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 13/0 : 13[1] -> 15[3] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 14/0 : 13[1] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 04/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 02/0 : 10[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 06/0 : 10[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 12/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 10/0 : 6[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 14/0 : 6[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 01/0 : 9[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 10/0 : 14[2] -> 6[2] [send] via NET/IB/2/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 05/0 : 9[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 14/0 : 14[2] -> 6[2] [send] via NET/IB/2/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 09/0 : 5[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 02/0 : 14[2] -> 10[2] [send] via NET/IB/2/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 13/0 : 5[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 06/0 : 14[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 02/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 00/0 : 8[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 03/0 : 11[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 09/0 : 13[1] -> 5[1] [send] via NET/IB/1/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 07/0 : 11[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 04/0 : 8[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 13/0 : 13[1] -> 5[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 06/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 01/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 10/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 05/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 14/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 11/0 : 7[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 01/0 : 13[1] -> 9[1] [send] via NET/IB/1/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 08/0 : 4[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 15/0 : 7[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 05/0 : 13[1] -> 9[1] [send] via NET/IB/1/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 12/0 : 4[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 11/0 : 15[3] -> 7[3] [send] via NET/IB/3/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 15/0 : 15[3] -> 7[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 10/0 : 6[2] -> 2[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 09/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 14/0 : 6[2] -> 2[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 13/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 09/0 : 5[1] -> 1[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364772:365552 [1] NCCL INFO Channel 13/0 : 5[1] -> 1[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 03/0 : 15[3] -> 11[3] [send] via NET/IB/3/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 10/0 : 6[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 07/0 : 15[3] -> 11[3] [send] via NET/IB/3/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 08/0 : 12[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 12/0 : 12[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 11/0 : 7[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 14/0 : 6[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 15/0 : 7[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 03/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 07/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 11/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 15/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 00/0 : 12[0] -> 8[0] [send] via NET/IB/0/GDRDMA +3: jzxh162:3160742:3161550 [0] NCCL INFO Channel 04/0 : 12[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 00/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 11/0 : 7[3] -> 3[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364774:365551 [3] NCCL INFO Channel 15/0 : 7[3] -> 3[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 04/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 08/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 02/0 : 14[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 03/0 : 15[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 06/0 : 14[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 01/0 : 13[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 02/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 07/0 : 15[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 05/0 : 13[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 02/0 : 2[2] -> 0[0] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 12/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 02/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 03/0 : 2[2] -> 0[0] via P2P/CUMEM +1: jzxh160:364771:365554 [0] NCCL INFO Channel 08/0 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364771:365554 [0] NCCL INFO Channel 12/0 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364773:365553 [2] NCCL INFO Channel 02/0 : 6[2] -> 4[0] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 06/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 03/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 09/0 : 5[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 01/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 10/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 07/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 05/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 14/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 11/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 09/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 15/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 13/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 05/0 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 13/0 : 5[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 01/0 : 15[3] -> 12[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 03/0 : 14[2] -> 12[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 03/0 : 6[2] -> 4[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 06/0 : 2[2] -> 0[0] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 04/0 : 15[3] -> 12[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 05/0 : 6[2] -> 4[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 06/0 : 6[2] -> 4[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 10/0 : 2[2] -> 0[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 10/0 : 6[2] -> 4[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 05/0 : 14[2] -> 12[0] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 05/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 11/0 : 2[2] -> 0[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 11/0 : 6[2] -> 4[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 06/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 13/0 : 2[2] -> 0[0] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 02/0 : 10[2] -> 8[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 13/0 : 6[2] -> 4[0] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 03/0 : 10[2] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 06/0 : 15[3] -> 12[0] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 01/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 10/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 14/0 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 09/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 01/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 11/0 : 14[2] -> 12[0] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 05/0 : 10[2] -> 8[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 14/0 : 6[2] -> 4[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 13/0 : 14[2] -> 12[0] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 12/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 04/0 : 11[3] -> 8[0] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 08/0 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Channel 12/0 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 14/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 06/0 : 10[2] -> 8[0] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 05/0 : 11[3] -> 8[0] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 00/0 : 12[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 04/0 : 12[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 10/0 : 10[2] -> 8[0] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 06/0 : 11[3] -> 8[0] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 00/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 13/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 04/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 14/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 08/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 11/0 : 10[2] -> 8[0] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 09/0 : 11[3] -> 8[0] via P2P/CUMEM +2: jzxh161:3283536:3284336 [0] NCCL INFO Channel 12/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 13/0 : 10[2] -> 8[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 04/0 : 7[3] -> 4[0] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 12/0 : 11[3] -> 8[0] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 14/0 : 10[2] -> 8[0] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 13/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 05/0 : 7[3] -> 4[0] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 14/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 06/0 : 7[3] -> 4[0] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 09/0 : 7[3] -> 4[0] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 12/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 02/0 : 3[3] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 13/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 02/0 : 15[3] -> 13[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 03/0 : 3[3] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 14/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 03/0 : 15[3] -> 13[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 05/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 05/0 : 15[3] -> 13[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 06/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 06/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 02/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 10/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 10/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 03/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 11/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 11/0 : 15[3] -> 13[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 13/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 05/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 13/0 : 3[3] -> 1[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 14/0 : 3[3] -> 1[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 06/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 14/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 02/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 10/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 00/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 03/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 05/0 : 7[3] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 11/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 04/0 : 15[3] -> 14[2] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 06/0 : 7[3] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 13/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 10/0 : 7[3] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 14/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 07/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 11/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 08/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 13/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh159:1308402:1310258 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 14/0 : 7[3] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 00/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 12/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 04/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh162:3160745:3161552 [3] NCCL INFO Channel 15/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 08/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 07/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 00/0 : 14[2] -> 13[1] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 08/0 : 7[3] -> 6[2] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 12/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 01/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 12/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 04/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh161:3283539:3284338 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 07/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:365551 [3] NCCL INFO Channel 15/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 00/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 08/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 03/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 09/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 07/0 : 13[1] -> 12[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 00/0 : 6[2] -> 5[1] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 12/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 08/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM +0: jzxh159:1308400:1310256 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 01/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 08/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160744:3161553 [2] NCCL INFO Channel 15/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 11/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308401:1310257 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 04/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 08/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 12/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160743:3161551 [1] NCCL INFO Channel 15/0 : 13[1] -> 12[0] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 00/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283538:3284335 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 07/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 03/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh161:3283537:3284337 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 08/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 07/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 09/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 08/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 12/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 11/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh160:364773:365553 [2] NCCL INFO Channel 15/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh160:364772:365552 [1] NCCL INFO Channel 15/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh159:1308399:1310255 [0] NCCL INFO Connected all trees +0: jzxh159:1308400:1310256 [1] NCCL INFO Connected all trees +0: jzxh159:1308400:1310256 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh159:1308400:1310256 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh159:1308402:1310258 [3] NCCL INFO Connected all trees +0: jzxh159:1308401:1310257 [2] NCCL INFO Connected all trees +0: jzxh159:1308399:1310255 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh159:1308402:1310258 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh159:1308399:1310255 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh159:1308402:1310258 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh159:1308401:1310257 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh159:1308401:1310257 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh160:364771:365554 [0] NCCL INFO Connected all trees +3: jzxh162:3160742:3161550 [0] NCCL INFO Connected all trees +3: jzxh162:3160743:3161551 [1] NCCL INFO Connected all trees +2: jzxh161:3283538:3284335 [2] NCCL INFO Connected all trees +2: jzxh161:3283539:3284338 [3] NCCL INFO Connected all trees +3: jzxh162:3160745:3161552 [3] NCCL INFO Connected all trees +3: jzxh162:3160743:3161551 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh162:3160742:3161550 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh162:3160743:3161551 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh162:3160742:3161550 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh162:3160745:3161552 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh162:3160745:3161552 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh162:3160744:3161553 [2] NCCL INFO Connected all trees +3: jzxh162:3160744:3161553 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh162:3160744:3161553 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh160:364772:365552 [1] NCCL INFO Connected all trees +1: jzxh160:364771:365554 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh160:364774:365551 [3] NCCL INFO Connected all trees +1: jzxh160:364771:365554 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh160:364772:365552 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh160:364772:365552 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh160:364773:365553 [2] NCCL INFO Connected all trees +1: jzxh160:364774:365551 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh160:364774:365551 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh160:364773:365553 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh160:364773:365553 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh161:3283538:3284335 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh161:3283538:3284335 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh161:3283539:3284338 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh161:3283539:3284338 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh161:3283536:3284336 [0] NCCL INFO Connected all trees +2: jzxh161:3283537:3284337 [1] NCCL INFO Connected all trees +2: jzxh161:3283536:3284336 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh161:3283536:3284336 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh161:3283537:3284337 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh161:3283537:3284337 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh159:1308401:1310257 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +0: jzxh159:1308401:1310257 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +0: jzxh159:1308401:1310257 [2] NCCL INFO ncclCommInitRank comm 0x55cb335512a0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x524c94fe7e5cd79a - Init COMPLETE +0: jzxh159:1308400:1310256 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +0: jzxh159:1308399:1310255 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +0: jzxh159:1308402:1310258 [3] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +0: jzxh159:1308400:1310256 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +0: jzxh159:1308399:1310255 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +0: jzxh159:1308402:1310258 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +0: jzxh159:1308400:1310256 [1] NCCL INFO ncclCommInitRank comm 0x5576720d8870 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x524c94fe7e5cd79a - Init COMPLETE +0: jzxh159:1308399:1310255 [0] NCCL INFO ncclCommInitRank comm 0x5650a3b6e180 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x524c94fe7e5cd79a - Init COMPLETE +0: jzxh159:1308402:1310258 [3] NCCL INFO ncclCommInitRank comm 0x55cdfc201020 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x524c94fe7e5cd79a - Init COMPLETE +3: jzxh162:3160745:3161552 [3] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +3: jzxh162:3160743:3161551 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +3: jzxh162:3160742:3161550 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +3: jzxh162:3160745:3161552 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +3: jzxh162:3160743:3161551 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +3: jzxh162:3160742:3161550 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +3: jzxh162:3160745:3161552 [3] NCCL INFO ncclCommInitRank comm 0x55c152c5c3b0 rank 15 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x524c94fe7e5cd79a - Init COMPLETE +3: jzxh162:3160742:3161550 [0] NCCL INFO ncclCommInitRank comm 0x5649a9566180 rank 12 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x524c94fe7e5cd79a - Init COMPLETE +3: jzxh162:3160743:3161551 [1] NCCL INFO ncclCommInitRank comm 0x55dd2bde7570 rank 13 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x524c94fe7e5cd79a - Init COMPLETE +3: jzxh162:3160744:3161553 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +3: jzxh162:3160744:3161553 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +3: jzxh162:3160744:3161553 [2] NCCL INFO ncclCommInitRank comm 0x56427a552c70 rank 14 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x524c94fe7e5cd79a - Init COMPLETE +3: jzxh162:3160744:3161582 [2] NCCL INFO Channel 08/1 : 14[2] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +3: jzxh162:3160744:3161582 [2] NCCL INFO Channel 09/1 : 14[2] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +0: jzxh159:1308401:1310287 [2] NCCL INFO Channel 00/1 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh159:1308402:1310288 [3] NCCL INFO Channel 00/1 : 3[3] -> 0[0] via P2P/CUMEM +0: jzxh159:1308400:1310289 [1] NCCL INFO Channel 00/1 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160742:3161583 [0] NCCL INFO Channel 08/1 : 12[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 08/1 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +3: jzxh162:3160745:3161584 [3] NCCL INFO Channel 08/1 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +3: jzxh162:3160742:3161583 [0] NCCL INFO Channel 09/1 : 12[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 09/1 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +3: jzxh162:3160745:3161584 [3] NCCL INFO Channel 09/1 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +0: jzxh159:1308401:1310287 [2] NCCL INFO Channel 01/1 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh159:1308402:1310288 [3] NCCL INFO Channel 01/1 : 3[3] -> 0[0] via P2P/CUMEM +0: jzxh159:1308400:1310289 [1] NCCL INFO Channel 01/1 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 08/1 : 14[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +3: jzxh162:3160743:3161585 [1] NCCL INFO Channel 08/1 : 13[1] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +3: jzxh162:3160743:3161585 [1] NCCL INFO Channel 09/1 : 13[1] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 09/1 : 14[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 08/1 : 13[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 09/1 : 13[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 08/1 : 12[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 09/1 : 12[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 04/1 : 11[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 05/1 : 11[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +1: jzxh160:364772:365552 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +1: jzxh160:364772:365552 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +1: jzxh160:364772:365552 [1] NCCL INFO ncclCommInitRank comm 0x556d3b785610 rank 5 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x524c94fe7e5cd79a - Init COMPLETE +1: jzxh160:364773:365553 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +1: jzxh160:364774:365551 [3] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +1: jzxh160:364771:365554 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +1: jzxh160:364773:365553 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +1: jzxh160:364774:365551 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +1: jzxh160:364771:365554 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +1: jzxh160:364774:365551 [3] NCCL INFO ncclCommInitRank comm 0x557fffcd2ae0 rank 7 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x524c94fe7e5cd79a - Init COMPLETE +2: jzxh161:3283537:3284337 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +2: jzxh161:3283537:3284337 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +2: jzxh161:3283537:3284337 [1] NCCL INFO ncclCommInitRank comm 0x55f84f114b10 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x524c94fe7e5cd79a - Init COMPLETE +1: jzxh160:364773:365553 [2] NCCL INFO ncclCommInitRank comm 0x56520ef52d20 rank 6 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x524c94fe7e5cd79a - Init COMPLETE +1: jzxh160:364771:365554 [0] NCCL INFO ncclCommInitRank comm 0x55a894a256f0 rank 4 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x524c94fe7e5cd79a - Init COMPLETE +2: jzxh161:3283536:3284336 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +2: jzxh161:3283536:3284336 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +2: jzxh161:3283536:3284336 [0] NCCL INFO ncclCommInitRank comm 0x55c8d4c4af80 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x524c94fe7e5cd79a - Init COMPLETE +2: jzxh161:3283539:3284338 [3] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +2: jzxh161:3283538:3284335 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +2: jzxh161:3283538:3284335 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +2: jzxh161:3283539:3284338 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +2: jzxh161:3283538:3284335 [2] NCCL INFO ncclCommInitRank comm 0x5643a3152d00 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x524c94fe7e5cd79a - Init COMPLETE +2: jzxh161:3283539:3284338 [3] NCCL INFO ncclCommInitRank comm 0x557efcb6a4d0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x524c94fe7e5cd79a - Init COMPLETE +1: jzxh160:364771:365583 [0] NCCL INFO Channel 12/1 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +1: jzxh160:364773:365584 [2] NCCL INFO Channel 12/1 : 6[2] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +1: jzxh160:364771:365583 [0] NCCL INFO Channel 13/1 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +1: jzxh160:364773:365584 [2] NCCL INFO Channel 13/1 : 6[2] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +1: jzxh160:364774:365585 [3] NCCL INFO Channel 12/1 : 7[3] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +1: jzxh160:364774:365585 [3] NCCL INFO Channel 13/1 : 7[3] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +1: jzxh160:364772:365586 [1] NCCL INFO Channel 12/1 : 5[1] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +1: jzxh160:364772:365586 [1] NCCL INFO Channel 13/1 : 5[1] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +2: jzxh161:3283539:3284370 [3] NCCL INFO Channel 04/1 : 11[3] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +2: jzxh161:3283537:3284369 [1] NCCL INFO Channel 04/1 : 9[1] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +2: jzxh161:3283539:3284370 [3] NCCL INFO Channel 05/1 : 11[3] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 04/1 : 10[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +2: jzxh161:3283536:3284368 [0] NCCL INFO Channel 04/1 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 05/1 : 10[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +2: jzxh161:3283537:3284369 [1] NCCL INFO Channel 05/1 : 9[1] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +2: jzxh161:3283536:3284368 [0] NCCL INFO Channel 05/1 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +2: jzxh161:3283538:3284367 [2] NCCL INFO Channel 04/1 : 10[2] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +2: jzxh161:3283538:3284367 [2] NCCL INFO Channel 05/1 : 10[2] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 04/1 : 9[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 05/1 : 9[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 04/1 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 05/1 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 12/1 : 7[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 13/1 : 7[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 12/1 : 6[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 13/1 : 6[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 12/1 : 5[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 13/1 : 5[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 12/1 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh159:1308399:1310290 [0] NCCL INFO Channel 13/1 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: [2025-11-23 14:45:05,799] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:1308399] [RANK:0] gather_len_batches: [39614, 39614, 39614, 39614, 39614, 39614, 39614, 39614, 39614, 39614, 39614, 39614, 39614, 39614, 39614, 39614] +0: [2025-11-23 14:45:06,098] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:495] [PID:1308399] [RANK:0] sample_packing_eff_est across ranks: [0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541, 0.895937442779541] +0: [2025-11-23 14:45:06,115] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:127] [PID:1308399] [RANK:0] Maximum number of steps set at 1485 +3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +0: [2025-11-23 14:45:13,869] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:110] [PID:1308399] [RANK:0] Patched Trainer.evaluation_loop with nanmean loss calculation +0: [2025-11-23 14:45:13,870] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:164] [PID:1308399] [RANK:0] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation +2: Loading checkpoint shards: 0%| | 0/2 [00:003->2 [1] 0/-1/-1->3->2 [2] -1/-1/-1->3->1 [3] 1/11/-1->3->-1 [4] 2/-1/-1->3->0 [5] 0/-1/-1->3->1 [6] 1/-1/-1->3->0 [7] 2/11/-1->3->-1 [8] -1/-1/-1->3->2 [9] 0/-1/-1->3->2 [10] -1/-1/-1->3->1 [11] 1/-1/-1->3->7 [12] 2/-1/-1->3->0 [13] 0/-1/-1->3->1 [14] 1/-1/-1->3->0 [15] 2/-1/-1->3->7 +0: jzxh159:1308401:1311134 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 0/10/-1->2->-1 [3] -1/-1/-1->2->0 [4] 1/-1/-1->2->3 [5] -1/-1/-1->2->0 [6] 0/10/-1->2->-1 [7] 1/-1/-1->2->3 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 0/-1/-1->2->6 [11] -1/-1/-1->2->0 [12] 1/-1/-1->2->3 [13] -1/-1/-1->2->0 [14] 0/-1/-1->2->6 [15] 1/-1/-1->2->3 +0: jzxh159:1308402:1311135 [3] NCCL INFO P2P Chunksize set to 131072 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 01/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh159:1308401:1311134 [2] NCCL INFO P2P Chunksize set to 131072 +0: jzxh159:1308400:1311136 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/9/-1->1->-1 [2] 3/-1/-1->1->0 [3] 0/-1/-1->1->3 [4] -1/-1/-1->1->2 [5] 3/9/-1->1->-1 [6] -1/-1/-1->1->3 [7] 0/-1/-1->1->2 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->5 [10] 3/-1/-1->1->0 [11] 0/-1/-1->1->3 [12] -1/-1/-1->1->2 [13] 3/-1/-1->1->5 [14] -1/-1/-1->1->3 [15] 0/-1/-1->1->2 +0: jzxh159:1308400:1311136 [1] NCCL INFO P2P Chunksize set to 131072 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 02/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 03/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 04/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 05/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 06/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 07/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 08/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 09/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 10/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 11/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 12/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 13/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 14/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 15/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh159:1308399:1311133 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->3 [2] 1/-1/-1->0->2 [3] 2/-1/-1->0->1 [4] 3/8/-1->0->-1 [5] 2/-1/-1->0->3 [6] 3/-1/-1->0->2 [7] -1/-1/-1->0->1 [8] 1/-1/-1->0->4 [9] -1/-1/-1->0->3 [10] 1/-1/-1->0->2 [11] 2/-1/-1->0->1 [12] 3/-1/-1->0->4 [13] 2/-1/-1->0->3 [14] 3/-1/-1->0->2 [15] -1/-1/-1->0->1 +0: jzxh159:1308399:1311133 [0] NCCL INFO P2P Chunksize set to 131072 +1: jzxh160:364774:366440 [3] NCCL INFO comm 0x1457001250c0 rank 7 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +1: jzxh160:364773:366439 [2] NCCL INFO comm 0x149098133e00 rank 6 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +1: jzxh160:364772:366441 [1] NCCL INFO comm 0x144548117000 rank 5 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +1: jzxh160:364771:366438 [0] NCCL INFO comm 0x15301c1476c0 rank 4 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +3: jzxh162:3160745:3162425 [3] NCCL INFO comm 0x152d14114450 rank 15 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +3: jzxh162:3160744:3162427 [2] NCCL INFO comm 0x151768115ac0 rank 14 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +1: jzxh160:364773:366439 [2] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 4/-1/-1->6->10 [3] -1/-1/-1->6->4 [4] 5/-1/-1->6->7 [5] -1/-1/-1->6->4 [6] 4/-1/-1->6->10 [7] 5/-1/-1->6->7 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 4/10/2->6->14 [11] -1/-1/-1->6->4 [12] 5/-1/-1->6->7 [13] -1/-1/-1->6->4 [14] 4/10/2->6->14 [15] 5/-1/-1->6->7 +1: jzxh160:364774:366440 [3] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 4/-1/-1->7->6 [2] -1/-1/-1->7->5 [3] 5/-1/-1->7->11 [4] 6/-1/-1->7->4 [5] 4/-1/-1->7->5 [6] 5/-1/-1->7->4 [7] 6/-1/-1->7->11 [8] -1/-1/-1->7->6 [9] 4/-1/-1->7->6 [10] -1/-1/-1->7->5 [11] 5/11/3->7->15 [12] 6/-1/-1->7->4 [13] 4/-1/-1->7->5 [14] 5/-1/-1->7->4 [15] 6/11/3->7->15 +1: jzxh160:364773:366439 [2] NCCL INFO P2P Chunksize set to 131072 +1: jzxh160:364774:366440 [3] NCCL INFO P2P Chunksize set to 131072 +2: jzxh161:3283536:3285122 [0] NCCL INFO comm 0x14c064143000 rank 8 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +3: jzxh162:3160745:3162425 [3] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 12/-1/-1->15->14 [2] -1/-1/-1->15->13 [3] 13/-1/-1->15->11 [4] 14/-1/-1->15->12 [5] 12/-1/-1->15->13 [6] 13/-1/-1->15->12 [7] 14/-1/-1->15->11 [8] -1/-1/-1->15->14 [9] 12/-1/-1->15->14 [10] -1/-1/-1->15->13 [11] 13/7/-1->15->-1 [12] 14/-1/-1->15->12 [13] 12/-1/-1->15->13 [14] 13/-1/-1->15->12 [15] 14/7/-1->15->-1 +3: jzxh162:3160745:3162425 [3] NCCL INFO P2P Chunksize set to 131072 +3: jzxh162:3160744:3162427 [2] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 12/-1/-1->14->10 [3] -1/-1/-1->14->12 [4] 13/-1/-1->14->15 [5] -1/-1/-1->14->12 [6] 12/-1/-1->14->10 [7] 13/-1/-1->14->15 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 12/6/-1->14->-1 [11] -1/-1/-1->14->12 [12] 13/-1/-1->14->15 [13] -1/-1/-1->14->12 [14] 12/6/-1->14->-1 [15] 13/-1/-1->14->15 +3: jzxh162:3160744:3162427 [2] NCCL INFO P2P Chunksize set to 131072 +2: jzxh161:3283539:3285120 [3] NCCL INFO comm 0x14b15c115410 rank 11 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +2: jzxh161:3283537:3285123 [1] NCCL INFO comm 0x147064122c40 rank 9 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +2: jzxh161:3283538:3285121 [2] NCCL INFO comm 0x14a1d01231c0 rank 10 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +2: jzxh161:3283536:3285122 [0] NCCL INFO Trees [0] 9/4/12->8->0 [1] -1/-1/-1->8->11 [2] 9/-1/-1->8->10 [3] 10/-1/-1->8->9 [4] 11/4/12->8->0 [5] 10/-1/-1->8->11 [6] 11/-1/-1->8->10 [7] -1/-1/-1->8->9 [8] 9/-1/-1->8->4 [9] -1/-1/-1->8->11 [10] 9/-1/-1->8->10 [11] 10/-1/-1->8->9 [12] 11/-1/-1->8->4 [13] 10/-1/-1->8->11 [14] 11/-1/-1->8->10 [15] -1/-1/-1->8->9 +2: jzxh161:3283536:3285122 [0] NCCL INFO P2P Chunksize set to 131072 +3: jzxh162:3160743:3162426 [1] NCCL INFO comm 0x1481bc125340 rank 13 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +3: jzxh162:3160742:3162428 [0] NCCL INFO comm 0x14c174126240 rank 12 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +3: jzxh162:3160743:3162426 [1] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->9 [2] 15/-1/-1->13->12 [3] 12/-1/-1->13->15 [4] -1/-1/-1->13->14 [5] 15/-1/-1->13->9 [6] -1/-1/-1->13->15 [7] 12/-1/-1->13->14 [8] 14/-1/-1->13->12 [9] 14/5/-1->13->-1 [10] 15/-1/-1->13->12 [11] 12/-1/-1->13->15 [12] -1/-1/-1->13->14 [13] 15/5/-1->13->-1 [14] -1/-1/-1->13->15 [15] 12/-1/-1->13->14 +3: jzxh162:3160743:3162426 [1] NCCL INFO P2P Chunksize set to 131072 +3: jzxh162:3160742:3162428 [0] NCCL INFO Trees [0] 13/-1/-1->12->8 [1] -1/-1/-1->12->15 [2] 13/-1/-1->12->14 [3] 14/-1/-1->12->13 [4] 15/-1/-1->12->8 [5] 14/-1/-1->12->15 [6] 15/-1/-1->12->14 [7] -1/-1/-1->12->13 [8] 13/4/-1->12->-1 [9] -1/-1/-1->12->15 [10] 13/-1/-1->12->14 [11] 14/-1/-1->12->13 [12] 15/4/-1->12->-1 [13] 14/-1/-1->12->15 [14] 15/-1/-1->12->14 [15] -1/-1/-1->12->13 +3: jzxh162:3160742:3162428 [0] NCCL INFO P2P Chunksize set to 131072 +2: jzxh161:3283539:3285120 [3] NCCL INFO Trees [0] -1/-1/-1->11->10 [1] 8/-1/-1->11->10 [2] -1/-1/-1->11->9 [3] 9/7/15->11->3 [4] 10/-1/-1->11->8 [5] 8/-1/-1->11->9 [6] 9/-1/-1->11->8 [7] 10/7/15->11->3 [8] -1/-1/-1->11->10 [9] 8/-1/-1->11->10 [10] -1/-1/-1->11->9 [11] 9/-1/-1->11->7 [12] 10/-1/-1->11->8 [13] 8/-1/-1->11->9 [14] 9/-1/-1->11->8 [15] 10/-1/-1->11->7 +2: jzxh161:3283537:3285123 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/5/13->9->1 [2] 11/-1/-1->9->8 [3] 8/-1/-1->9->11 [4] -1/-1/-1->9->10 [5] 11/5/13->9->1 [6] -1/-1/-1->9->11 [7] 8/-1/-1->9->10 [8] 10/-1/-1->9->8 [9] 10/-1/-1->9->5 [10] 11/-1/-1->9->8 [11] 8/-1/-1->9->11 [12] -1/-1/-1->9->10 [13] 11/-1/-1->9->5 [14] -1/-1/-1->9->11 [15] 8/-1/-1->9->10 +2: jzxh161:3283539:3285120 [3] NCCL INFO P2P Chunksize set to 131072 +2: jzxh161:3283537:3285123 [1] NCCL INFO P2P Chunksize set to 131072 +2: jzxh161:3283538:3285121 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 8/6/14->10->2 [3] -1/-1/-1->10->8 [4] 9/-1/-1->10->11 [5] -1/-1/-1->10->8 [6] 8/6/14->10->2 [7] 9/-1/-1->10->11 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 8/-1/-1->10->6 [11] -1/-1/-1->10->8 [12] 9/-1/-1->10->11 [13] -1/-1/-1->10->8 [14] 8/-1/-1->10->6 [15] 9/-1/-1->10->11 +2: jzxh161:3283538:3285121 [2] NCCL INFO P2P Chunksize set to 131072 +1: jzxh160:364772:366441 [1] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->9 [2] 7/-1/-1->5->4 [3] 4/-1/-1->5->7 [4] -1/-1/-1->5->6 [5] 7/-1/-1->5->9 [6] -1/-1/-1->5->7 [7] 4/-1/-1->5->6 [8] 6/-1/-1->5->4 [9] 6/9/1->5->13 [10] 7/-1/-1->5->4 [11] 4/-1/-1->5->7 [12] -1/-1/-1->5->6 [13] 7/9/1->5->13 [14] -1/-1/-1->5->7 [15] 4/-1/-1->5->6 +1: jzxh160:364772:366441 [1] NCCL INFO P2P Chunksize set to 131072 +1: jzxh160:364771:366438 [0] NCCL INFO Trees [0] 5/-1/-1->4->8 [1] -1/-1/-1->4->7 [2] 5/-1/-1->4->6 [3] 6/-1/-1->4->5 [4] 7/-1/-1->4->8 [5] 6/-1/-1->4->7 [6] 7/-1/-1->4->6 [7] -1/-1/-1->4->5 [8] 5/8/0->4->12 [9] -1/-1/-1->4->7 [10] 5/-1/-1->4->6 [11] 6/-1/-1->4->5 [12] 7/8/0->4->12 [13] 6/-1/-1->4->7 [14] 7/-1/-1->4->6 [15] -1/-1/-1->4->5 +1: jzxh160:364771:366438 [0] NCCL INFO P2P Chunksize set to 131072 +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 00/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 00/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 03/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 04/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 04/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 07/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 11/0 : 9[1] -> 10[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 08/0 : 14[2] -> 15[3] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 08/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 00/0 : 6[2] -> 7[3] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 11/0 : 5[1] -> 6[2] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 12/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 12/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 04/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 15/0 : 5[1] -> 6[2] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 08/0 : 6[2] -> 7[3] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 00/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 00/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 12/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 04/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 03/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 08/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 04/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 12/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 07/0 : 13[1] -> 14[2] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 08/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 11/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 12/0 : 13[1] -> 14[2] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 15/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 00/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 00/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 00/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 04/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 04/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 04/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 08/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 08/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 12/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 08/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 00/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 12/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 12/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 03/0 : 4[0] -> 5[1] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 00/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 00/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 04/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 04/0 : 4[0] -> 5[1] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 07/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 04/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 08/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 08/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 12/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 12/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 00/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 01/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 08/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 02/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 11/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 08/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 03/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 05/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 12/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 11/0 : 8[0] -> 9[1] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 15/0 : 4[0] -> 5[1] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 06/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 04/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 09/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 15/0 : 8[0] -> 9[1] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 01/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 07/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 10/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 02/0 : 4[0] -> 7[3] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 01/0 : 8[0] -> 11[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 08/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 13/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 02/0 : 8[0] -> 11[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 05/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 11/0 : 12[0] -> 13[1] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 05/0 : 8[0] -> 11[3] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 14/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 06/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 12/0 : 12[0] -> 13[1] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 06/0 : 8[0] -> 11[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 15/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 09/0 : 4[0] -> 7[3] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 09/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 10/0 : 8[0] -> 11[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 10/0 : 4[0] -> 7[3] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 13/0 : 8[0] -> 11[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 13/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 01/0 : 12[0] -> 15[3] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 14/0 : 8[0] -> 11[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 14/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 02/0 : 12[0] -> 15[3] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 02/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 02/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 06/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 06/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 10/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 10/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 14/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 14/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 05/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 06/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 09/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 10/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 13/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 14/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 02/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 02/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 02/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 02/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 06/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 06/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 10/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 10/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 14/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 14/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 06/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 06/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 10/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 10/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 01/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 01/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 14/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 05/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 14/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 05/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 09/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 09/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 13/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 13/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 02/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 02/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 06/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 06/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 10/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 10/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 14/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 14/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 01/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 01/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 05/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 05/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 01/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 01/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 05/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 05/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 09/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 09/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 09/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 09/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 13/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 13/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 13/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 13/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 01/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 02/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 01/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 01/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 05/0 : 5[1] -> 4[0] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 01/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 05/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 05/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 06/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 09/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 09/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 13/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 09/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 13/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 10/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 13/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 09/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 10/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 14/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 14/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 01/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 02/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 05/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 06/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 09/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 10/0 : 13[1] -> 12[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 13/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 14/0 : 13[1] -> 12[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 03/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 03/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 07/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 07/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 11/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 11/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 15/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 15/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 03/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 03/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 03/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 07/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 07/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 11/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 11/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 03/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 07/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 07/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 15/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 11/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 11/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 15/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 15/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 15/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 03/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 07/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 11/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 15/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 02/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 01/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 06/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 03/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 03/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 05/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 10/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 07/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 07/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 11/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 11/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 15/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 15/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 09/0 : 7[3] -> 6[2] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 14/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 03/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 03/0 : 15[3] -> 12[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 13/0 : 7[3] -> 6[2] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 07/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 07/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 11/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 11/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 15/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 15/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 02/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 01/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 02/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 05/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 06/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 09/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 10/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 10/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 13/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 14/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 14/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Connected all rings +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Connected all rings +0: jzxh159:1308399:1311133 [0] NCCL INFO Connected all rings +0: jzxh159:1308402:1311135 [3] NCCL INFO Connected all rings +0: jzxh159:1308400:1311136 [1] NCCL INFO Connected all rings +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Connected all rings +1: jzxh160:364773:366439 [2] NCCL INFO Connected all rings +1: jzxh160:364771:366438 [0] NCCL INFO Connected all rings +3: jzxh162:3160742:3162428 [0] NCCL INFO Connected all rings +1: jzxh160:364772:366441 [1] NCCL INFO Connected all rings +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 02/0 : 12[0] -> 13[1] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Connected all rings +1: jzxh160:364771:366438 [0] NCCL INFO Channel 02/0 : 4[0] -> 5[1] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Connected all rings +3: jzxh162:3160744:3162427 [2] NCCL INFO Connected all rings +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 10/0 : 4[0] -> 5[1] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 10/0 : 12[0] -> 13[1] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Connected all rings +2: jzxh161:3283539:3285120 [3] NCCL INFO Connected all rings +2: jzxh161:3283538:3285121 [2] NCCL INFO Connected all rings +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 01/0 : 5[1] -> 6[2] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 01/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 01/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 01/0 : 13[1] -> 14[2] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 09/0 : 13[1] -> 14[2] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 07/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 01/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 09/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 09/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 02/0 : 0[0] -> 2[2] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 07/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 02/0 : 12[0] -> 14[2] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 15/0 : 14[2] -> 15[3] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 09/0 : 5[1] -> 6[2] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 09/0 : 6[2] -> 7[3] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 15/0 : 6[2] -> 7[3] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 03/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 02/0 : 5[1] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 03/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 02/0 : 8[0] -> 10[2] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 02/0 : 13[1] -> 15[3] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 05/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 09/0 : 10[2] -> 11[3] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 02/0 : 1[1] -> 3[3] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 06/0 : 0[0] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 03/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 03/0 : 8[0] -> 10[2] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 15/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 03/0 : 13[1] -> 15[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 05/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 05/0 : 8[0] -> 10[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 02/0 : 9[1] -> 11[3] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 06/0 : 8[0] -> 10[2] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 10/0 : 0[0] -> 2[2] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 03/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 05/0 : 5[1] -> 7[3] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 05/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 02/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 10/0 : 8[0] -> 10[2] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 06/0 : 12[0] -> 14[2] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 11/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 03/0 : 9[1] -> 11[3] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 05/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 06/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 11/0 : 8[0] -> 10[2] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 06/0 : 13[1] -> 15[3] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 05/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 10/0 : 12[0] -> 14[2] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 03/0 : 4[0] -> 6[2] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 13/0 : 0[0] -> 2[2] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 10/0 : 5[1] -> 7[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 11/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 13/0 : 8[0] -> 10[2] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 06/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 05/0 : 4[0] -> 6[2] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 10/0 : 13[1] -> 15[3] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 14/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 06/0 : 9[1] -> 11[3] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 10/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 11/0 : 5[1] -> 7[3] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 11/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 06/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 14/0 : 8[0] -> 10[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 10/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 13/0 : 12[0] -> 14[2] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 11/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 10/0 : 4[0] -> 6[2] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 14/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 11/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 13/0 : 13[1] -> 15[3] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 13/0 : 9[1] -> 11[3] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 13/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 11/0 : 4[0] -> 6[2] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 13/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 14/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 14/0 : 13[1] -> 15[3] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 14/0 : 1[1] -> 3[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 14/0 : 4[0] -> 6[2] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 04/0 : 12[0] -> 15[3] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 04/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 02/0 : 10[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 06/0 : 10[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 12/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 04/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 09/0 : 1[1] -> 5[1] [send] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 10/0 : 2[2] -> 6[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 14/0 : 2[2] -> 6[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 13/0 : 1[1] -> 5[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 10/0 : 2[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 13/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 01/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 01/0 : 9[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 14/0 : 2[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 12/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 05/0 : 9[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 12/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 02/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 14/0 : 5[1] -> 7[3] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 06/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 10/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 05/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 14/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 09/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 13/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 04/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 01/0 : 9[1] -> 13[1] [send] via NET/IB/1/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 03/0 : 11[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 00/0 : 8[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 11/0 : 3[3] -> 7[3] [send] via NET/IB/3/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 15/0 : 3[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 05/0 : 9[1] -> 13[1] [send] via NET/IB/1/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 07/0 : 11[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 04/0 : 8[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 02/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 09/0 : 5[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 12/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 06/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 10/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 14/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 09/0 : 1[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 02/0 : 10[2] -> 14[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 08/0 : 0[0] -> 4[0] [send] via NET/IB/0/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 12/0 : 0[0] -> 4[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 13/0 : 1[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 06/0 : 10[2] -> 14[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 01/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 10/0 : 14[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 05/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 14/0 : 14[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 03/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 00/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 07/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 04/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 11/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 08/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 15/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 09/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 12/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 10/0 : 6[2] -> 14[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 03/0 : 11[3] -> 15[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 13/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 14/0 : 6[2] -> 14[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 00/0 : 8[0] -> 12[0] [send] via NET/IB/0/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 10/0 : 6[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 11/0 : 3[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 07/0 : 11[3] -> 15[3] [send] via NET/IB/3/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 08/0 : 0[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 15/0 : 3[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 12/0 : 0[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 04/0 : 8[0] -> 12[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 03/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 02/0 : 14[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 11/0 : 7[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 06/0 : 14[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 14/0 : 6[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 00/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 07/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 15/0 : 7[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 09/0 : 13[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 11/0 : 15[3] -> 7[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 11/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 04/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 15/0 : 15[3] -> 7[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 13/0 : 13[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 15/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 09/0 : 5[1] -> 13[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 08/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 13/0 : 5[1] -> 13[1] [send] via NET/IB/1/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 12/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 02/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 06/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 10/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 09/0 : 5[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 13/0 : 5[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 13/0 : 5[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 10/0 : 6[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 09/0 : 13[1] -> 5[1] [send] via NET/IB/1/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 14/0 : 6[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 13/0 : 13[1] -> 5[1] [send] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 10/0 : 14[2] -> 6[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 11/0 : 15[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 14/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 15/0 : 15[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 11/0 : 7[3] -> 15[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 01/0 : 13[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 11/0 : 7[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 14/0 : 14[2] -> 6[2] [send] via NET/IB/2/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 01/0 : 13[1] -> 9[1] [send] via NET/IB/1/GDRDMA +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 05/0 : 13[1] -> 9[1] [send] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 02/0 : 14[2] -> 10[2] [send] via NET/IB/2/GDRDMA +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 15/0 : 7[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 05/0 : 13[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 01/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 05/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 09/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 03/0 : 15[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 13/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 07/0 : 15[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 15/0 : 7[3] -> 15[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 01/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 05/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 09/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 03/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 02/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh160:364772:366441 [1] NCCL INFO Channel 13/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 06/0 : 14[2] -> 10[2] [send] via NET/IB/2/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 03/0 : 15[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 07/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 07/0 : 15[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 06/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 02/0 : 14[2] -> 12[0] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 09/0 : 5[1] -> 1[1] [send] via NET/IB/1/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 02/0 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 11/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 10/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 03/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 08/0 : 4[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 03/0 : 14[2] -> 12[0] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 13/0 : 5[1] -> 1[1] [send] via NET/IB/1/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 07/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 01/0 : 15[3] -> 12[0] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 12/0 : 4[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 15/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 11/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 14/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 15/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 11/0 : 7[3] -> 3[3] [send] via NET/IB/3/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 08/0 : 12[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 10/0 : 6[2] -> 2[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 15/0 : 7[3] -> 3[3] [send] via NET/IB/3/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 14/0 : 6[2] -> 2[2] [send] via NET/IB/2/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 12/0 : 12[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 08/0 : 4[0] -> 12[0] [send] via NET/IB/0/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 08/0 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 12/0 : 4[0] -> 12[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 02/0 : 6[2] -> 4[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 01/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 08/0 : 12[0] -> 4[0] [send] via NET/IB/0/GDRDMA +0: jzxh159:1308399:1311133 [0] NCCL INFO Channel 12/0 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 00/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 12/0 : 12[0] -> 4[0] [send] via NET/IB/0/GDRDMA +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 00/0 : 12[0] -> 8[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 02/0 : 10[2] -> 8[0] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 01/0 : 11[3] -> 8[0] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 00/0 : 12[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 04/0 : 11[3] -> 8[0] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 04/0 : 12[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 03/0 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 04/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 05/0 : 14[2] -> 12[0] via P2P/CUMEM +3: jzxh162:3160742:3162428 [0] NCCL INFO Channel 04/0 : 12[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 04/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 04/0 : 15[3] -> 12[0] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 08/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 12/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 06/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 05/0 : 2[2] -> 0[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 05/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 03/0 : 10[2] -> 8[0] via P2P/CUMEM +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 00/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 04/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 08/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh161:3283536:3285122 [0] NCCL INFO Channel 12/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364771:366438 [0] NCCL INFO Channel 08/0 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364773:366439 [2] NCCL INFO Channel 03/0 : 6[2] -> 4[0] via P2P/CUMEM +1: jzxh160:364771:366438 [0] NCCL INFO Channel 12/0 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA +1: jzxh160:364774:366440 [3] NCCL INFO Channel 05/0 : 7[3] -> 4[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 05/0 : 10[2] -> 8[0] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 05/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 05/0 : 6[2] -> 4[0] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 10/0 : 14[2] -> 12[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 06/0 : 10[2] -> 8[0] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 06/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 06/0 : 2[2] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 06/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 06/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 06/0 : 6[2] -> 4[0] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 09/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 09/0 : 15[3] -> 12[0] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 11/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 10/0 : 2[2] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 09/0 : 7[3] -> 4[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 10/0 : 10[2] -> 8[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 11/0 : 2[2] -> 0[0] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 12/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 12/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 11/0 : 10[2] -> 8[0] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 10/0 : 6[2] -> 4[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 12/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 13/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 13/0 : 10[2] -> 8[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 13/0 : 2[2] -> 0[0] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 14/0 : 14[2] -> 12[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 13/0 : 15[3] -> 12[0] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 11/0 : 6[2] -> 4[0] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 13/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 13/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 14/0 : 2[2] -> 0[0] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 14/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 14/0 : 10[2] -> 8[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 14/0 : 15[3] -> 12[0] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 13/0 : 6[2] -> 4[0] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 14/0 : 7[3] -> 4[0] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 14/0 : 6[2] -> 4[0] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 02/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 02/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 02/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 03/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 03/0 : 11[3] -> 9[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 02/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 03/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 05/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 05/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 05/0 : 3[3] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 03/0 : 7[3] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 06/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 06/0 : 3[3] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 05/0 : 7[3] -> 5[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 06/0 : 7[3] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 10/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 10/0 : 3[3] -> 1[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 11/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 11/0 : 3[3] -> 1[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 10/0 : 7[3] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 13/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 06/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 11/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 13/0 : 3[3] -> 1[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 14/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 10/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 13/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 14/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 11/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 14/0 : 7[3] -> 5[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 13/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 00/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 14/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 00/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 04/0 : 7[3] -> 6[2] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 04/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 07/0 : 7[3] -> 6[2] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 08/0 : 11[3] -> 10[2] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 07/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 12/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 08/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 08/0 : 15[3] -> 14[2] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh161:3283539:3285120 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 12/0 : 7[3] -> 6[2] via P2P/CUMEM +0: jzxh159:1308402:1311135 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 12/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh160:364774:366440 [3] NCCL INFO Channel 15/0 : 7[3] -> 6[2] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160745:3162425 [3] NCCL INFO Channel 15/0 : 15[3] -> 14[2] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 00/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 01/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 04/0 : 6[2] -> 5[1] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 00/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 00/0 : 5[1] -> 4[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 00/0 : 13[1] -> 12[0] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 01/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 08/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 07/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 03/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 03/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 08/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 07/0 : 5[1] -> 4[0] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 04/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 07/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 08/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 12/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 07/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 08/0 : 13[1] -> 12[0] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 09/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 08/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 08/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 11/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh161:3283537:3285123 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh161:3283538:3285121 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 11/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 12/0 : 6[2] -> 5[1] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 09/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh162:3160743:3162426 [1] NCCL INFO Channel 15/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh159:1308401:1311134 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM +0: jzxh159:1308400:1311136 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM +1: jzxh160:364773:366439 [2] NCCL INFO Channel 15/0 : 6[2] -> 5[1] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 12/0 : 14[2] -> 13[1] via P2P/CUMEM +1: jzxh160:364772:366441 [1] NCCL INFO Channel 15/0 : 5[1] -> 4[0] via P2P/CUMEM +3: jzxh162:3160744:3162427 [2] NCCL INFO Channel 15/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh159:1308399:1311133 [0] NCCL INFO Connected all trees +0: jzxh159:1308402:1311135 [3] NCCL INFO Connected all trees +0: jzxh159:1308400:1311136 [1] NCCL INFO Connected all trees +0: jzxh159:1308402:1311135 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh159:1308402:1311135 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh159:1308401:1311134 [2] NCCL INFO Connected all trees +0: jzxh159:1308400:1311136 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh159:1308400:1311136 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh159:1308401:1311134 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh159:1308401:1311134 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh159:1308399:1311133 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh159:1308399:1311133 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh162:3160742:3162428 [0] NCCL INFO Connected all trees +3: jzxh162:3160745:3162425 [3] NCCL INFO Connected all trees +3: jzxh162:3160742:3162428 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh162:3160742:3162428 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh162:3160745:3162425 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh162:3160745:3162425 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh162:3160744:3162427 [2] NCCL INFO Connected all trees +3: jzxh162:3160743:3162426 [1] NCCL INFO Connected all trees +3: jzxh162:3160744:3162427 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh162:3160744:3162427 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh162:3160743:3162426 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh162:3160743:3162426 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh160:364771:366438 [0] NCCL INFO Connected all trees +2: jzxh161:3283536:3285122 [0] NCCL INFO Connected all trees +2: jzxh161:3283537:3285123 [1] NCCL INFO Connected all trees +1: jzxh160:364772:366441 [1] NCCL INFO Connected all trees +1: jzxh160:364771:366438 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh160:364771:366438 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh160:364774:366440 [3] NCCL INFO Connected all trees +1: jzxh160:364772:366441 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh160:364772:366441 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh160:364774:366440 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh160:364774:366440 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh160:364773:366439 [2] NCCL INFO Connected all trees +1: jzxh160:364773:366439 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh160:364773:366439 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh161:3283536:3285122 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh161:3283536:3285122 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh161:3283537:3285123 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh161:3283537:3285123 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh161:3283539:3285120 [3] NCCL INFO Connected all trees +2: jzxh161:3283539:3285120 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh161:3283539:3285120 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh161:3283538:3285121 [2] NCCL INFO Connected all trees +2: jzxh161:3283538:3285121 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh161:3283538:3285121 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh161:3283537:3285123 [1] NCCL INFO ncclCommInitRank comm 0x147064122c40 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x2b939d813fa1b763 - Init COMPLETE +2: jzxh161:3283539:3285120 [3] NCCL INFO ncclCommInitRank comm 0x14b15c115410 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x2b939d813fa1b763 - Init COMPLETE +2: jzxh161:3283536:3285122 [0] NCCL INFO ncclCommInitRank comm 0x14c064143000 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x2b939d813fa1b763 - Init COMPLETE +2: jzxh161:3283538:3285121 [2] NCCL INFO ncclCommInitRank comm 0x14a1d01231c0 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x2b939d813fa1b763 - Init COMPLETE +3: jzxh162:3160744:3162427 [2] NCCL INFO ncclCommInitRank comm 0x151768115ac0 rank 14 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x2b939d813fa1b763 - Init COMPLETE +3: jzxh162:3160742:3162428 [0] NCCL INFO ncclCommInitRank comm 0x14c174126240 rank 12 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x2b939d813fa1b763 - Init COMPLETE +3: jzxh162:3160745:3162425 [3] NCCL INFO ncclCommInitRank comm 0x152d14114450 rank 15 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x2b939d813fa1b763 - Init COMPLETE +3: jzxh162:3160743:3162426 [1] NCCL INFO ncclCommInitRank comm 0x1481bc125340 rank 13 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x2b939d813fa1b763 - Init COMPLETE +1: jzxh160:364774:366440 [3] NCCL INFO ncclCommInitRank comm 0x1457001250c0 rank 7 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x2b939d813fa1b763 - Init COMPLETE +1: jzxh160:364772:366441 [1] NCCL INFO ncclCommInitRank comm 0x144548117000 rank 5 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x2b939d813fa1b763 - Init COMPLETE +1: jzxh160:364771:366438 [0] NCCL INFO ncclCommInitRank comm 0x15301c1476c0 rank 4 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x2b939d813fa1b763 - Init COMPLETE +1: jzxh160:364773:366439 [2] NCCL INFO ncclCommInitRank comm 0x149098133e00 rank 6 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x2b939d813fa1b763 - Init COMPLETE +0: jzxh159:1308402:1311135 [3] NCCL INFO ncclCommInitRank comm 0x154364116f80 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x2b939d813fa1b763 - Init COMPLETE +0: jzxh159:1308400:1311136 [1] NCCL INFO ncclCommInitRank comm 0x14c618124680 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x2b939d813fa1b763 - Init COMPLETE +0: jzxh159:1308399:1311133 [0] NCCL INFO ncclCommInitRank comm 0x151f9411a7c0 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x2b939d813fa1b763 - Init COMPLETE +0: jzxh159:1308401:1311134 [2] NCCL INFO ncclCommInitRank comm 0x1463c8123fc0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x2b939d813fa1b763 - Init COMPLETE +0: {'loss': 0.7405, 'grad_norm': 2.4418386537020766, 'learning_rate': 9.05e-07, 'memory/max_mem_active(gib)': 57.09, 'memory/max_mem_allocated(gib)': 57.09, 'memory/device_mem_reserved(gib)': 66.93, 'epoch': 0.0} +0: 0%| | 0/1485 [00:00