#!/usr/bin/env python3
"""
Minimal 8-GPU test: DDP init + one small op per GPU + all_reduce.

Run from the repo root:
    torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu

If NCCL segfaults (exit code -11), try the workaround env vars first:
    bash scripts/try_nccl_8gpu.sh

Otherwise, fall back to gloo:
    NANOCHAT_DDP_BACKEND=gloo torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu
"""
import os
import torch
import torch.distributed as dist
def main():
    """Run a minimal multi-GPU smoke test and verify the collective result.

    Each rank initialises the process group, multiplies two n-by-n matrices
    of ones on its own GPU, and contributes the sum of the product to an
    all_reduce. The reduced value is checked against ``world_size * n**3``
    before rank 0 reports success.

    Environment:
        RANK, LOCAL_RANK, WORLD_SIZE: read as integers (torchrun sets these).
        NANOCHAT_DDP_BACKEND: optional, "nccl" (default) or "gloo"; any other
            value falls back to "nccl" and rank 0 prints a notice.

    Raises:
        KeyError: if RANK, LOCAL_RANK or WORLD_SIZE is not set.
        RuntimeError: if the all-reduced sum does not match the expected value.
    """
    # torchrun sets these
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    requested = os.environ.get("NANOCHAT_DDP_BACKEND", "nccl").lower()
    backend = requested if requested in ("nccl", "gloo") else "nccl"
    if backend != requested and rank == 0:
        # Keep the fallback, but make it visible instead of silent.
        print(f"test_8gpu: unknown NANOCHAT_DDP_BACKEND={requested!r}; using nccl")

    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)
    if backend == "nccl":
        dist.init_process_group(backend="nccl", device_id=device)
    else:
        dist.init_process_group(backend="gloo")

    try:
        # One small op on this GPU
        n = 1024
        a = torch.ones(n, n, device=device, dtype=torch.float32)
        b = torch.ones(n, n, device=device, dtype=torch.float32)
        s = (a @ b).sum().item()

        # All-reduce so all ranks agree we got here
        t = torch.tensor([s], dtype=torch.float32, device=device)
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        dist.barrier()

        # Every entry of ones @ ones is n, so each rank contributes n**3.
        total = t.item()
        expected = float(world_size * n ** 3)
        if abs(total - expected) > 1e-6 * expected:
            raise RuntimeError(
                f"test_8gpu FAILED on rank {rank}: "
                f"all_reduce sum={total:.0f}, expected {expected:.0f}"
            )

        if rank == 0:
            # Report the device this rank actually used rather than index 0.
            gpu_name = torch.cuda.get_device_name(device)
            print(f"test_8gpu OK: world_size={world_size} backend={backend} device={gpu_name}")
            print(f" (each rank did {n}x{n} matmul; all_reduce sum={total:.0f})")
    finally:
        # Tear the process group down even when a step above raised.
        dist.destroy_process_group()
# Script entry point: run the smoke test only when this module is executed
# directly (e.g. via ``torchrun ... -m scripts.test_8gpu``), not on import.
if __name__ == "__main__":
    main()
|