#!/usr/bin/env python3
"""
Simple RCCL test for multi-GPU communication.

This test verifies that RCCL can initialize and communicate
across multiple GPUs.
"""

import os
import sys

import torch
import torch.distributed as dist


def test_rccl_allreduce():
    """Test basic RCCL allreduce operation across all GPUs."""
    if not torch.cuda.is_available():
        print("CUDA/ROCm not available, cannot run RCCL test")
        sys.exit(1)

    # Initialize the process group with the NCCL backend (RCCL on AMD).
    # With the default env:// init method this reads RANK, WORLD_SIZE,
    # MASTER_ADDR, and MASTER_PORT from the environment.
    dist.init_process_group(backend="nccl")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    print(f"[Rank {rank}/{world_size}] Initialized successfully")

    # Map this process to a GPU. Under torchrun, LOCAL_RANK is the
    # per-node process index; fall back to the global rank, which is
    # equivalent for single-node runs.
    local_rank = int(os.environ.get("LOCAL_RANK", rank))
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    print(f"[Rank {rank}] Device: {torch.cuda.get_device_name(device)}")
    print(
        f"[Rank {rank}] Device memory: "
        f"{torch.cuda.get_device_properties(device).total_memory / 1e9:.2f} GB"
    )

    # Create a rank-valued tensor and sum it across all ranks. After the
    # allreduce, every rank holds the element-wise sum 0 + 1 + ... + (N-1).
    tensor = torch.ones(1000, device=device) * rank
    print(f"[Rank {rank}] Before allreduce: tensor sum = {tensor.sum().item()}")

    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

    expected_sum = sum(range(world_size)) * 1000
    actual_sum = tensor.sum().item()
    print(
        f"[Rank {rank}] After allreduce: tensor sum = {actual_sum}, "
        f"expected = {expected_sum}"
    )

    if abs(actual_sum - expected_sum) < 0.1:
        print(f"[Rank {rank}] ✓ RCCL allreduce test PASSED")
        dist.destroy_process_group()
        sys.exit(0)
    else:
        print(f"[Rank {rank}] ✗ RCCL allreduce test FAILED")
        dist.destroy_process_group()
        sys.exit(1)


if __name__ == "__main__":
    test_rccl_allreduce()
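
# Usage note (a sketch, assuming a single node whose GPUs are all visible
# to a ROCm build of PyTorch; the file name "test_rccl.py" is hypothetical):
#
#   torchrun --nproc_per_node=4 test_rccl.py
#
# torchrun launches one process per GPU and sets RANK, WORLD_SIZE,
# LOCAL_RANK, MASTER_ADDR, and MASTER_PORT, which the env:// init method
# used by dist.init_process_group(backend="nccl") reads at startup.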