"""
Simple RCCL test for multi-GPU communication.
This test verifies that RCCL can initialize and communicate across multiple GPUs.
"""

import os
import sys

import torch
import torch.distributed as dist
|
def test_rccl_allreduce():
    """Test basic RCCL allreduce operation across all GPUs.

    Intended to be launched with one process per GPU (e.g. via
    ``torchrun --nproc_per_node=<gpus>``). Each rank fills a 1000-element
    tensor with its rank id, all-reduces with SUM, and checks the result
    against the closed-form expectation.

    Exits with status 0 on success, 1 on failure or when CUDA/ROCm is
    unavailable.
    """
    if not torch.cuda.is_available():
        print("CUDA not available, skipping test")
        sys.exit(1)

    # Pick the GPU from LOCAL_RANK (set by torchrun): the *global* rank is
    # only a valid device index on a single node. Bind the device BEFORE
    # initializing the process group so NCCL/RCCL communicator setup does
    # not default every rank to GPU 0. On single-node launches
    # LOCAL_RANK == rank, so behavior there is unchanged.
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)

    dist.init_process_group(backend="nccl")

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    print(f"[Rank {rank}/{world_size}] Initialized successfully")
    print(f"[Rank {rank}] Device: {torch.cuda.get_device_name(device)}")
    print(
        f"[Rank {rank}] Device memory: {torch.cuda.get_device_properties(device).total_memory / 1e9:.2f} GB"
    )

    # Each rank contributes 1000 copies of its rank id.
    tensor = torch.ones(1000, device=device) * rank
    print(f"[Rank {rank}] Before allreduce: tensor sum = {tensor.sum().item()}")

    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

    # After SUM-allreduce every element equals sum(range(world_size)),
    # so the tensor sum is that times the element count.
    expected_sum = sum(range(world_size)) * 1000
    actual_sum = tensor.sum().item()

    print(
        f"[Rank {rank}] After allreduce: tensor sum = {actual_sum}, expected = {expected_sum}"
    )

    # Values are small integers, so a 0.1 float tolerance is ample.
    if abs(actual_sum - expected_sum) < 0.1:
        print(f"[Rank {rank}] ✓ RCCL allreduce test PASSED")
        dist.destroy_process_group()
        sys.exit(0)
    else:
        print(f"[Rank {rank}] ✗ RCCL allreduce test FAILED")
        dist.destroy_process_group()
        sys.exit(1)
|
|
|
|
# Script entry point — typically launched as:
#   torchrun --nproc_per_node=<num_gpus> this_file.py
if __name__ == "__main__":
    test_rccl_allreduce()
|
|