| | """
|
| | Distributed basic functions.
|
| | """
|
| |
|
| | import os
|
| | import torch
|
| | from torch import nn
|
| | import torch.distributed as dist
|
| | from torch.nn.parallel import DistributedDataParallel
|
| | from torch.distributed.fsdp._common_utils import _is_fsdp_flattened


def get_global_rank() -> int:
    """
    Get the global rank, the global index of the GPU.
    """
    return int(os.environ.get("RANK", "0"))


def get_local_rank() -> int:
    """
    Get the local rank, the local index of the GPU.
    """
    return int(os.environ.get("LOCAL_RANK", "0"))


def get_world_size() -> int:
    """
    Get the world size, the total number of GPUs.
    """
    return int(os.environ.get("WORLD_SIZE", "1"))


def get_device() -> torch.device:
    """
    Get the device of the current rank.
    """
    return torch.device("cuda", get_local_rank())
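
# NOTE: RANK, LOCAL_RANK and WORLD_SIZE are expected to be injected by the
# launcher (e.g. `torchrun --nproc_per_node=8 train.py`); the defaults above
# keep these helpers usable in a plain single-process run.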


def barrier_if_distributed(*args, **kwargs):
    """
    Synchronize all processes when running in a distributed context.
    """
    if dist.is_initialized():
        return dist.barrier(*args, **kwargs)


def init_torch(cudnn_benchmark=True):
    """
    Common PyTorch initialization configuration.
    """
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = cudnn_benchmark
    torch.cuda.set_device(get_local_rank())
    dist.init_process_group(
        backend="nccl",
        rank=get_global_rank(),
        world_size=get_world_size(),
    )
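
# A typical launch is a sketch like `torchrun --nproc_per_node=8 train.py`.
# Because no init_method is passed, init_process_group falls back to the
# "env://" rendezvous, so the launcher must also export MASTER_ADDR and
# MASTER_PORT.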


def convert_to_ddp(module: torch.nn.Module, **kwargs) -> DistributedDataParallel:
    """
    Wrap a module with DistributedDataParallel on the current local rank.
    """
    return DistributedDataParallel(
        module=module,
        device_ids=[get_local_rank()],
        output_device=get_local_rank(),
        **kwargs,
    )
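
# Usage sketch (assumes init_torch() has already created the process group;
# `MyModel` is a placeholder, not part of this module):
#
#     init_torch()
#     model = MyModel().to(get_device())
#     model = convert_to_ddp(model, find_unused_parameters=False)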


def meta_param_init_fn(module: nn.Module) -> None:
    """
    Used for a model initialized on the meta device.
    Initializes meta params/buffers with empty tensors.
    Numerical correctness does not matter here: FSDP will sync the
    param/buffer state from rank 0 to the other ranks.
    """
    with torch.no_grad():
        for submodule in module.modules():
            for param_name, param in submodule.named_parameters(recurse=False):
                if not _is_fsdp_flattened(param) and param.is_meta:
                    materialized_param = nn.Parameter(torch.empty_like(param, device="cpu"))
                    setattr(submodule, param_name, materialized_param)
            for buffer_name, buffer in submodule.named_buffers(recurse=False):
                if not _is_fsdp_flattened(buffer) and buffer.is_meta:
                    materialized_buffer = torch.empty_like(buffer, device="cpu")
                    setattr(submodule, buffer_name, materialized_buffer)
    torch.cuda.empty_cache()
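
# Sketch of how this hook can be wired into FSDP (assumes the model was built
# under torch.device("meta") and that rank 0 holds the real weights):
#
#     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
#
#     model = FSDP(
#         model,
#         param_init_fn=meta_param_init_fn,
#         sync_module_states=True,   # broadcast rank-0 states to other ranks
#         device_id=get_local_rank(),
#     )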


def meta_non_persistent_buffer_init_fn(module: nn.Module) -> nn.Module:
    """
    Materialize meta-device buffers that are not persistent in the state_dict.
    Handles special cases such as RotaryEmbedding.freqs.
    """
    with torch.no_grad():
        for submodule in module.modules():
            if hasattr(submodule, "freqs"):
                freqs = getattr(submodule, "freqs")
                if isinstance(freqs, torch.Tensor) and freqs.is_meta:
                    dim = submodule.dim

                    def rope_params(max_seq_len, dim, theta=10000):
                        assert dim % 2 == 0
                        freqs = torch.outer(
                            torch.arange(max_seq_len),
                            1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float32).div(dim)),
                        )
                        freqs = torch.polar(torch.ones_like(freqs), freqs)
                        return freqs

                    # NOTE: hidden size and head count are hard-coded here and
                    # override the `submodule.dim` read above.
                    dim = 5120
                    num_heads = 40
                    d = dim // num_heads
                    freqs_tensor = torch.cat([
                        rope_params(1024, d - 4 * (d // 6)),
                        rope_params(1024, 2 * (d // 6)),
                        rope_params(1024, 2 * (d // 6)),
                    ], dim=1).to(dtype=torch.cfloat, device="cpu")

                    setattr(submodule, "freqs", freqs_tensor)
                    print(f"Successfully materialized freqs for {submodule.__class__.__name__}")

    assert not any(b.is_meta for n, b in module.named_buffers())
    return module
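
# Usage sketch: in loading paths where non-persistent buffers (e.g.
# RotaryEmbedding.freqs) are not covered by the checkpoint, they can remain on
# the meta device and must be rebuilt explicitly:
#
#     model = meta_non_persistent_buffer_init_fn(model)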