#!/usr/bin/env python3
"""
Simple RCCL test for multi-GPU communication.
This test verifies that RCCL can initialize and communicate across multiple GPUs.
"""

import os
import sys

import torch
import torch.distributed as dist


def test_rccl_allreduce():
    """Test basic RCCL allreduce operation across all GPUs."""
    if not torch.cuda.is_available():
        print("CUDA/ROCm not available, cannot run test")
        sys.exit(1)

    # Initialize process group with NCCL (RCCL on AMD)
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    print(f"[Rank {rank}/{world_size}] Initialized successfully")

    # Bind this process to its GPU. LOCAL_RANK is set by torchrun; fall
    # back to the global rank for single-node launches without it.
    local_rank = int(os.environ.get("LOCAL_RANK", rank))
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    print(f"[Rank {rank}] Device: {torch.cuda.get_device_name(device)}")
    print(
        f"[Rank {rank}] Device memory: "
        f"{torch.cuda.get_device_properties(device).total_memory / 1e9:.2f} GB"
    )

    # Each rank contributes a tensor filled with its own rank. After a SUM
    # allreduce, every element equals 0 + 1 + ... + (world_size - 1).
    tensor = torch.ones(1000, device=device) * rank
    print(f"[Rank {rank}] Before allreduce: tensor sum = {tensor.sum().item()}")
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

    expected_sum = sum(range(world_size)) * 1000
    actual_sum = tensor.sum().item()
    print(
        f"[Rank {rank}] After allreduce: tensor sum = {actual_sum}, "
        f"expected = {expected_sum}"
    )

    if abs(actual_sum - expected_sum) < 0.1:
        print(f"[Rank {rank}] ✓ RCCL allreduce test PASSED")
        dist.destroy_process_group()
        sys.exit(0)
    else:
        print(f"[Rank {rank}] ✗ RCCL allreduce test FAILED")
        dist.destroy_process_group()
        sys.exit(1)
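

# Worked example of the check above (illustrative): with world_size = 2,
# the ranks contribute tensors of 0s and 1s, so every element becomes
# 0 + 1 = 1 and the 1000-element tensor sums to 1000 on every rank,
# matching expected_sum = sum(range(2)) * 1000.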

if __name__ == "__main__":
    test_rccl_allreduce()