nanochat-eos / scripts /test_8gpu.py
ksjpswaroop's picture
Upload folder using huggingface_hub
50ebd92 verified
#!/usr/bin/env python3
"""
Minimal 8-GPU test: DDP init + one small op per GPU + all_reduce.
Run from repo root:
torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu
If NCCL segfaults (exit -11), try workaround env vars first:
bash scripts/try_nccl_8gpu.sh
Else use gloo:
NANOCHAT_DDP_BACKEND=gloo torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu
"""
import os
import torch
import torch.distributed as dist
def main():
    """Run a one-shot multi-GPU smoke test under torchrun.

    Reads RANK, LOCAL_RANK and WORLD_SIZE from the environment, binds this
    process to CUDA device LOCAL_RANK, initialises the default process group,
    multiplies two n x n matrices of ones on that GPU, sums the product, and
    all-reduces the per-rank sums. Rank 0 prints a two-line summary.

    The backend comes from NANOCHAT_DDP_BACKEND (case-insensitive, default
    "nccl"); any value other than "nccl" or "gloo" falls back to "nccl".

    Raises:
        KeyError: if RANK, LOCAL_RANK or WORLD_SIZE is not set.
        ValueError: if one of those variables is not an integer.
        RuntimeError: if the all-reduced sum is not world_size * n**3.
    """
    # torchrun sets these
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    backend = os.environ.get("NANOCHAT_DDP_BACKEND", "nccl").lower()
    if backend not in ("nccl", "gloo"):
        # Unrecognised values deliberately fall back to the default backend.
        backend = "nccl"

    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)

    if backend == "nccl":
        dist.init_process_group(backend="nccl", device_id=device)
    else:
        dist.init_process_group(backend="gloo")

    # From here on the process group exists, so always tear it down, even if
    # the GPU op or a collective raises.
    try:
        # One small op on this GPU
        n = 1024
        a = torch.ones(n, n, device=device, dtype=torch.float32)
        b = torch.ones(n, n, device=device, dtype=torch.float32)
        local_sum = (a @ b).sum().item()

        # All-reduce so all ranks agree we got here
        t = torch.tensor([local_sum], dtype=torch.float32, device=device)
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        dist.barrier()
        total = t.item()

        # Every element of a @ b equals n, so each rank contributes n**3.
        # With n = 1024 all intermediate values are exactly representable in
        # float32, so an exact comparison is safe.
        expected = world_size * n**3
        if total != expected:
            raise RuntimeError(
                f"test_8gpu FAILED on rank {rank}: "
                f"all_reduce sum={total:.0f}, expected {expected}"
            )

        if rank == 0:
            gpu_name = torch.cuda.get_device_name(device)
            print(f"test_8gpu OK: world_size={world_size} backend={backend} device={gpu_name}")
            print(f" (each rank did {n}x{n} matmul; all_reduce sum={total:.0f})")
    finally:
        dist.destroy_process_group()
# Script entry point: run the smoke test only when this module is executed
# directly (e.g. `torchrun ... -m scripts.test_8gpu`), not when it is imported.
if __name__ == "__main__":
    main()