ksjpswaroop
/

nanochat-eos

Model card Files Files and versions

nanochat-eos / scripts /test_8gpu.py

ksjpswaroop's picture

Upload folder using huggingface_hub

50ebd92 verified 4 months ago

history blame contribute delete

1.67 kB

	#!/usr/bin/env python3
	"""
	Minimal 8-GPU test: DDP init + one small op per GPU + all_reduce.
	Run from repo root:
	torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu
	If NCCL segfaults (exit -11), try workaround env vars first:
	bash scripts/try_nccl_8gpu.sh
	Else use gloo:
	NANOCHAT_DDP_BACKEND=gloo torchrun --standalone --nproc_per_node=8 -m scripts.test_8gpu
	"""
	import os
	import torch
	import torch.distributed as dist

	def main():
	    """Run a one-shot multi-GPU smoke test; meant to be launched by torchrun.

	    Each rank binds to the CUDA device given by ``LOCAL_RANK``, multiplies
	    two ``n x n`` float32 matrices of ones, and feeds the sum of the product
	    into a SUM all-reduce. The reduced value is compared with the known
	    answer ``world_size * n**3`` so that a broken collective is reported as
	    a failure rather than as "OK".

	    The backend is read from ``NANOCHAT_DDP_BACKEND`` ("nccl" or "gloo");
	    any other value falls back to "nccl".

	    Raises:
	        KeyError: if ``RANK``, ``LOCAL_RANK`` or ``WORLD_SIZE`` is missing
	            from the environment (i.e. not launched via torchrun).
	        RuntimeError: if the all-reduced sum differs from the expected value.
	    """
	    # torchrun sets these
	    rank = int(os.environ["RANK"])
	    local_rank = int(os.environ["LOCAL_RANK"])
	    world_size = int(os.environ["WORLD_SIZE"])

	    # Unrecognized backend names silently fall back to NCCL.
	    backend = os.environ.get("NANOCHAT_DDP_BACKEND", "nccl").lower()
	    if backend not in ("nccl", "gloo"):
	        backend = "nccl"

	    device = torch.device("cuda", local_rank)
	    torch.cuda.set_device(device)

	    if backend == "nccl":
	        dist.init_process_group(backend="nccl", device_id=device)
	    else:
	        dist.init_process_group(backend="gloo")

	    # One small op on this GPU: ones @ ones gives a matrix filled with n,
	    # so its sum is n**3.
	    n = 1024
	    a = torch.ones(n, n, device=device, dtype=torch.float32)
	    b = torch.ones(n, n, device=device, dtype=torch.float32)
	    s = (a @ b).sum().item()

	    # All-reduce so all ranks agree we got here
	    t = torch.tensor([s], dtype=torch.float32, device=device)
	    dist.all_reduce(t, op=dist.ReduceOp.SUM)
	    dist.barrier()

	    # Every intermediate value is an integer multiple of n (a power of two)
	    # and is exactly representable in float32, so exact comparison is safe.
	    total = t.item()
	    expected = float(world_size * n ** 3)
	    passed = total == expected

	    if rank == 0 and passed:
	        gpu_name = torch.cuda.get_device_name(device)
	        print(f"test_8gpu OK: world_size={world_size} backend={backend} device={gpu_name}")
	        print(f" (each rank did {n}x{n} matmul; all_reduce sum={total:.0f})")

	    # Every rank sees the same reduced value, so all ranks reach this point
	    # together; tear the group down before reporting a mismatch.
	    dist.destroy_process_group()

	    if not passed:
	        raise RuntimeError(
	            f"test_8gpu FAILED on rank {rank}: all_reduce sum={total:.0f}, "
	            f"expected {expected:.0f} (world_size={world_size}, backend={backend})"
	        )

	# Script entry point: run the smoke test only when executed directly
	# (e.g. via `torchrun ... -m scripts.test_8gpu`), not when imported.
	if __name__ == "__main__":
	main()