HY-Video-PRFL / diffusers_lite /utils /communication.py

Upload folder using huggingface_hub

e14f899 verified 20 days ago

24.7 kB

	# Copyright (c) Microsoft Corporation.
	# SPDX-License-Identifier: Apache-2.0

	# DeepSpeed Team

	from typing import Any, Tuple

	import os
	import torch
	import functools
	import torch.distributed as dist
	from torch import Tensor

	from ..utils.parallel_states import nccl_info, get_teacher_student_parallel_state


	def broadcast(input_: torch.Tensor):
	src = nccl_info.group_id * nccl_info.sp_size
	dist.broadcast(input_, src=src, group=nccl_info.group)

	def broadcast_within_ts_unit(input_):
	src = nccl_info.ts_unit_group_id * nccl_info.ts_unit_size
	dist.broadcast(input_, src=src, group=nccl_info.ts_unit_group)

	def broadcast_global(input_: torch.Tensor):
	dist.broadcast(input_, src=0, group=None)

	def broadcast_dict(input_: dict):
	src = nccl_info.group_id * nccl_info.sp_size
	for k, v in input_.items():
	if isinstance(input_[k], torch.Tensor):
	dist.broadcast(input_[k], src=src, group=nccl_info.group)

	def broadcast_dict_within_ts_unit(input_: dict):
	src = nccl_info.ts_unit_group_id * nccl_info.ts_unit_size
	for k, v in input_.items():
	if isinstance(input_[k], torch.Tensor):
	dist.broadcast(input_[k], src=src, group=nccl_info.ts_unit_group)

	def _all_to_all_4D(
	input: torch.tensor, scatter_idx: int = 2, gather_idx: int = 1, group=None
	) -> torch.tensor:
	"""
	all-to-all for QKV

	Args:
	input (torch.tensor): a tensor sharded along dim scatter dim
	scatter_idx (int): default 1
	gather_idx (int): default 2
	group : torch process group

	Returns:
	torch.tensor: resharded tensor (bs, seqlen/P, hc, hs)
	"""
	assert (
	input.dim() == 4
	), f"input must be 4D tensor, got {input.dim()} and shape {input.shape}"

	seq_world_size = dist.get_world_size(group)

	if scatter_idx == 2 and gather_idx == 1:
	# input (torch.tensor): a tensor sharded along dim 1 (bs, seqlen/P, hc, hs) output: (bs, seqlen, hc/P, hs)
	bs, shard_seqlen, hc, hs = input.shape
	seqlen = shard_seqlen * seq_world_size
	shard_hc = hc // seq_world_size

	# transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
	# (bs, seqlen/P, hc, hs) -reshape-> (bs, seq_len/P, P, hc/P, hs) -transpose(0,2)-> (P, seq_len/P, bs, hc/P, hs)
	input_t = (
	input.reshape(bs, shard_seqlen, seq_world_size, shard_hc, hs)
	.transpose(0, 2)
	.contiguous()
	)

	output = torch.empty_like(input_t)
	# https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single
	# (P, seq_len/P, bs, hc/P, hs) scatter seqlen -all2all-> (P, seq_len/P, bs, hc/P, hs) scatter head
	if seq_world_size > 1:
	dist.all_to_all_single(output, input_t, group=group)
	torch.cuda.synchronize()
	else:
	output = input_t
	# if scattering the seq-dim, transpose the heads back to the original dimension
	output = output.reshape(seqlen, bs, shard_hc, hs)

	# (seq_len, bs, hc/P, hs) -reshape-> (bs, seq_len, hc/P, hs)
	output = output.transpose(0, 1).contiguous().reshape(bs, seqlen, shard_hc, hs)

	return output

	elif scatter_idx == 1 and gather_idx == 2:
	# input (torch.tensor): a tensor sharded along dim 1 (bs, seqlen, hc/P, hs) output: (bs, seqlen/P, hc, hs)
	bs, seqlen, shard_hc, hs = input.shape
	hc = shard_hc * seq_world_size
	shard_seqlen = seqlen // seq_world_size
	seq_world_size = dist.get_world_size(group)

	# transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
	# (bs, seqlen, hc/P, hs) -reshape-> (bs, P, seq_len/P, hc/P, hs) -transpose(0, 3)-> (hc/P, P, seqlen/P, bs, hs) -transpose(0, 1) -> (P, hc/P, seqlen/P, bs, hs)
	input_t = (
	input.reshape(bs, seq_world_size, shard_seqlen, shard_hc, hs)
	.transpose(0, 3)
	.transpose(0, 1)
	.contiguous()
	.reshape(seq_world_size, shard_hc, shard_seqlen, bs, hs)
	)

	output = torch.empty_like(input_t)
	# https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single
	# (P, bs x hc/P, seqlen/P, hs) scatter seqlen -all2all-> (P, bs x seq_len/P, hc/P, hs) scatter head
	if seq_world_size > 1:
	dist.all_to_all_single(output, input_t, group=group)
	torch.cuda.synchronize()
	else:
	output = input_t

	# if scattering the seq-dim, transpose the heads back to the original dimension
	output = output.reshape(hc, shard_seqlen, bs, hs)

	# (hc, seqlen/N, bs, hs) -tranpose(0,2)-> (bs, seqlen/N, hc, hs)
	output = output.transpose(0, 2).contiguous().reshape(bs, shard_seqlen, hc, hs)

	return output
	else:
	raise RuntimeError("scatter_idx must be 1 or 2 and gather_idx must be 1 or 2")


	class SeqAllToAll4D(torch.autograd.Function):
	@staticmethod
	def forward(
	ctx: Any,
	group: dist.ProcessGroup,
	input: Tensor,
	scatter_idx: int,
	gather_idx: int,
	) -> Tensor:
	ctx.group = group
	ctx.scatter_idx = scatter_idx
	ctx.gather_idx = gather_idx

	return _all_to_all_4D(input, scatter_idx, gather_idx, group=group)

	@staticmethod
	def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]:
	return (
	None,
	SeqAllToAll4D.apply(
	ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx
	),
	None,
	None,
	)


	def all_to_all_4D(
	input_: torch.Tensor,
	scatter_dim: int = 2,
	gather_dim: int = 1,
	):
	return SeqAllToAll4D.apply(nccl_info.group, input_, scatter_dim, gather_dim)


	def _all_to_all(
	input_: torch.Tensor,
	world_size: int,
	group: dist.ProcessGroup,
	scatter_dim: int,
	gather_dim: int,
	):
	input_list = [
	t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)
	]
	output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
	dist.all_to_all(output_list, input_list, group=group)
	return torch.cat(output_list, dim=gather_dim).contiguous()


	class _AllToAll(torch.autograd.Function):
	"""All-to-all communication.

	Args:
	input_: input matrix
	process_group: communication group
	scatter_dim: scatter dimension
	gather_dim: gather dimension
	"""

	@staticmethod
	def forward(ctx, input_, process_group, scatter_dim, gather_dim):
	ctx.process_group = process_group
	ctx.scatter_dim = scatter_dim
	ctx.gather_dim = gather_dim
	ctx.world_size = dist.get_world_size(process_group)
	output = _all_to_all(
	input_, ctx.world_size, process_group, scatter_dim, gather_dim
	)
	return output

	@staticmethod
	def backward(ctx, grad_output):
	grad_output = _all_to_all(
	grad_output,
	ctx.world_size,
	ctx.process_group,
	ctx.gather_dim,
	ctx.scatter_dim,
	)
	return (
	grad_output,
	None,
	None,
	None,
	)


	def all_to_all(
	input_: torch.Tensor,
	scatter_dim: int = 2,
	gather_dim: int = 1,
	):
	return _AllToAll.apply(input_, nccl_info.group, scatter_dim, gather_dim)


	class _AllGather(torch.autograd.Function):
	"""All-gather communication with autograd support.

	Args:
	input_: input tensor
	dim: dimension along which to concatenate
	"""

	@staticmethod
	def forward(ctx, input_, dim):
	ctx.dim = dim
	world_size = nccl_info.sp_size
	group = nccl_info.group
	input_size = list(input_.size())

	ctx.input_size = input_size[dim]

	tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
	input_ = input_.contiguous()
	dist.all_gather(tensor_list, input_, group=group)

	output = torch.cat(tensor_list, dim=dim)
	return output

	@staticmethod
	def backward(ctx, grad_output):
	world_size = nccl_info.sp_size
	rank = nccl_info.rank_within_group
	dim = ctx.dim
	input_size = ctx.input_size

	sizes = [input_size] * world_size

	grad_input_list = torch.split(grad_output, sizes, dim=dim)
	grad_input = grad_input_list[rank]

	return grad_input, None


	def all_gather(input_: torch.Tensor, dim: int = 1):
	"""Performs an all-gather operation on the input tensor along the specified dimension.

	Args:
	input_ (torch.Tensor): Input tensor of shape [B, H, S, D].
	dim (int, optional): Dimension along which to concatenate. Defaults to 1.

	Returns:
	torch.Tensor: Output tensor after all-gather operation, concatenated along 'dim'.
	"""
	return _AllGather.apply(input_, dim)

	class _AllGather_TeacherStudent(torch.autograd.Function):
	"""All-gather communication with autograd support.

	Args:
	input_: input tensor
	dim: dimension along which to concatenate
	"""

	@staticmethod
	def forward(ctx, input_, dim):
	ctx.dim = dim
	world_size = nccl_info.ts_unit_size
	group = nccl_info.ts_unit_group
	input_size = list(input_.size())

	ctx.input_size = input_size[dim]

	tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
	input_ = input_.contiguous()
	dist.all_gather(tensor_list, input_, group=group)

	output = torch.cat(tensor_list, dim=dim)
	return output

	@staticmethod
	def backward(ctx, grad_output):
	world_size = nccl_info.ts_unit_size
	rank = nccl_info.rank_within_ts_unit_group
	dim = ctx.dim
	input_size = ctx.input_size

	sizes = [input_size] * world_size
	grad_input_list = torch.split(grad_output, sizes, dim=dim)
	grad_input = grad_input_list[rank]
	return grad_input, None

	def all_gather_ts(input_: torch.Tensor, dim: int = 1):
	"""Performs an all-gather operation on the input tensor along the specified dimension.

	Args:
	input_ (torch.Tensor): Input tensor of shape [B, H, S, D].
	dim (int, optional): Dimension along which to concatenate. Defaults to 1.

	Returns:
	torch.Tensor: Output tensor after all-gather operation, concatenated along 'dim'.
	"""
	return _AllGather_TeacherStudent.apply(input_, dim)


	def prepare_sequence_parallel_data_wanx(
	hidden_states, encoder_hidden_states, uncond_text_states, image_embeds, latents_condition
	):
	if nccl_info.sp_size == 1:
	return (
	hidden_states,
	encoder_hidden_states,
	uncond_text_states,
	image_embeds,
	latents_condition,
	)

	def prepare(hidden_states, encoder_hidden_states, uncond_text_states, image_embeds, latents_condition):
	hidden_states = all_to_all(hidden_states, scatter_dim=2, gather_dim=0)
	encoder_hidden_states = all_to_all(
	encoder_hidden_states, scatter_dim=1, gather_dim=0
	)
	uncond_text_states = all_to_all(
	uncond_text_states, scatter_dim=1, gather_dim=0
	)
	image_embeds = all_to_all(image_embeds, scatter_dim=1, gather_dim=0)
	latents_condition = all_to_all(latents_condition, scatter_dim=2, gather_dim=0)

	return (
	hidden_states,
	encoder_hidden_states,
	uncond_text_states,
	image_embeds,
	latents_condition,
	)

	sp_size = nccl_info.sp_size
	frame = hidden_states.shape[2]
	assert frame % sp_size == 0, "frame should be a multiple of sp_size"

	(
	hidden_states,
	encoder_hidden_states,
	uncond_text_states,
	image_embeds,
	latents_condition,
	) = prepare(
	hidden_states,
	encoder_hidden_states.repeat(1, sp_size, 1),
	uncond_text_states.repeat(1, sp_size, 1),
	image_embeds.repeat(1, sp_size, 1),
	latents_condition,
	)

	return hidden_states, encoder_hidden_states, uncond_text_states, image_embeds, latents_condition


	def sp_parallel_dataloader_wrapper_wanx(
	dataloader, device, train_batch_size, sp_size, train_sp_batch_size
	):
	while True:
	for data_item in dataloader:
	latents, text_states, uncond_text_states, image_embeds, latents_condition = data_item
	latents = latents.to(device)
	text_states = text_states.to(device)
	uncond_text_states = uncond_text_states.to(device)
	image_embeds = image_embeds.to(device)
	latents_condition = latents_condition.to(device)
	frame = latents.shape[2]
	if frame == 1:
	yield latents, text_states, uncond_text_states, image_embeds, latents_condition
	else:
	latents, text_states, uncond_text_states, image_embeds, latents_condition = (
	prepare_sequence_parallel_data_wanx(
	latents, text_states, uncond_text_states, image_embeds, latents_condition
	)
	)
	assert (
	train_batch_size * sp_size >= train_sp_batch_size
	), "train_batch_size * sp_size should be greater than train_sp_batch_size"
	for iter in range(train_batch_size * sp_size // train_sp_batch_size):
	st_idx = iter * train_sp_batch_size
	ed_idx = (iter + 1) * train_sp_batch_size
	yield (
	latents[st_idx:ed_idx],
	text_states[st_idx:ed_idx],
	uncond_text_states[st_idx:ed_idx],
	image_embeds[st_idx:ed_idx],
	latents_condition[st_idx:ed_idx],
	)

	def prepare_sequence_parallel_data_wanx_dpo(
	hidden_states, encoder_hidden_states, uncond_text_states, image_embeds, latents_condition,latents_lose
	):
	if nccl_info.sp_size == 1:
	return (
	hidden_states,
	encoder_hidden_states,
	uncond_text_states,
	image_embeds,
	latents_condition,
	latents_lose,
	)

	def prepare(hidden_states, encoder_hidden_states, uncond_text_states, image_embeds, latents_condition, latents_lose):
	hidden_states = all_to_all(hidden_states, scatter_dim=2, gather_dim=0)
	latents_lose = all_to_all(latents_lose, scatter_dim=2, gather_dim=0)
	encoder_hidden_states = all_to_all(
	encoder_hidden_states, scatter_dim=1, gather_dim=0
	)
	uncond_text_states = all_to_all(
	uncond_text_states, scatter_dim=1, gather_dim=0
	)
	image_embeds = all_to_all(image_embeds, scatter_dim=1, gather_dim=0)
	latents_condition = all_to_all(latents_condition, scatter_dim=2, gather_dim=0)

	return (
	hidden_states,
	encoder_hidden_states,
	uncond_text_states,
	image_embeds,
	latents_condition,
	latents_lose,
	)

	sp_size = nccl_info.sp_size
	frame = hidden_states.shape[2]
	assert frame % sp_size == 0, "frame should be a multiple of sp_size"

	(
	hidden_states,
	encoder_hidden_states,
	uncond_text_states,
	image_embeds,
	latents_condition,latents_lose,
	) = prepare(
	hidden_states,
	encoder_hidden_states.repeat(1, sp_size, 1),
	uncond_text_states.repeat(1, sp_size, 1),
	image_embeds.repeat(1, sp_size, 1),
	latents_condition,
	latents_lose
	)

	return hidden_states, encoder_hidden_states, uncond_text_states, image_embeds, latents_condition,latents_lose

	def sp_parallel_dataloader_wrapper_wanx_dpo(
	dataloader, device, train_batch_size, sp_size, train_sp_batch_size
	):
	while True:
	for data_item in dataloader:
	latents, text_states, uncond_text_states, image_embeds, latents_condition,latent_lose = data_item
	latents = latents.to(device)
	latents_lose = latents.to(device)
	text_states = text_states.to(device)
	uncond_text_states = uncond_text_states.to(device)
	image_embeds = image_embeds.to(device)
	latents_condition = latents_condition.to(device)
	frame = latents.shape[2]
	if frame == 1:
	yield latents, text_states, uncond_text_states, image_embeds, latents_condition,latent_lose
	else:
	latents, text_states, uncond_text_states, image_embeds, latents_condition, latents_lose = (
	prepare_sequence_parallel_data_wanx_dpo(
	latents, text_states, uncond_text_states, image_embeds, latents_condition,latent_lose
	)
	)
	assert (
	train_batch_size * sp_size >= train_sp_batch_size
	), "train_batch_size * sp_size should be greater than train_sp_batch_size"
	for iter in range(train_batch_size * sp_size // train_sp_batch_size):
	st_idx = iter * train_sp_batch_size
	ed_idx = (iter + 1) * train_sp_batch_size
	yield (
	latents[st_idx:ed_idx],
	text_states[st_idx:ed_idx],
	uncond_text_states[st_idx:ed_idx],
	image_embeds[st_idx:ed_idx],
	latents_condition[st_idx:ed_idx],
	latents_lose[st_idx:ed_idx],
	)

	def prepare_sequence_parallel_data_ltx(
	hidden_states, encoder_hidden_states, text_mask, uncond_text_states, uncond_text_mask
	):
	if nccl_info.sp_size == 1:
	return (
	hidden_states,
	encoder_hidden_states,
	text_mask,
	uncond_text_states,
	uncond_text_mask,
	)

	def prepare(hidden_states, encoder_hidden_states, text_mask, uncond_text_states, uncond_text_mask):
	hidden_states = all_to_all(hidden_states, scatter_dim=2, gather_dim=0)
	encoder_hidden_states = all_to_all(
	encoder_hidden_states, scatter_dim=1, gather_dim=0
	)
	text_mask = all_to_all(text_mask, scatter_dim=1, gather_dim=0)
	uncond_text_states = all_to_all(
	uncond_text_states, scatter_dim=1, gather_dim=0
	)
	uncond_text_mask = all_to_all(
	uncond_text_mask, scatter_dim=1, gather_dim=0
	)

	return (
	hidden_states,
	encoder_hidden_states,
	text_mask,
	uncond_text_states,
	uncond_text_mask,
	)

	sp_size = nccl_info.sp_size
	frame = hidden_states.shape[2]
	assert frame % sp_size == 0, "frame should be a multiple of sp_size"

	(
	hidden_states,
	encoder_hidden_states,
	text_mask,
	uncond_text_states,
	uncond_text_mask,
	) = prepare(
	hidden_states,
	encoder_hidden_states.repeat(1, sp_size, 1),
	text_mask.repeat(1, sp_size),
	uncond_text_states.repeat(1, sp_size, 1),
	uncond_text_mask.repeat(1, sp_size)
	)

	return hidden_states, encoder_hidden_states, text_mask, uncond_text_states, uncond_text_mask


	def sp_parallel_dataloader_wrapper_ltx(
	dataloader, device, train_batch_size, sp_size, train_sp_batch_size
	):
	while True:
	for data_item in dataloader:
	latents, text_states, text_mask, uncond_text_states, uncond_text_mask = data_item
	latents = latents.to(device)
	text_states = text_states.to(device)
	text_mask = text_mask.to(device)
	uncond_text_states = uncond_text_states.to(device)
	uncond_text_mask = uncond_text_mask.to(device)
	frame = latents.shape[2]
	if frame == 1:
	yield latents, text_states, text_mask, uncond_text_states, uncond_text_mask
	else:
	latents, text_states, text_mask, uncond_text_states, uncond_text_mask = (
	prepare_sequence_parallel_data_ltx(
	latents, text_states, text_mask, uncond_text_states, uncond_text_mask
	)
	)
	assert (
	train_batch_size * sp_size >= train_sp_batch_size
	), "train_batch_size * sp_size should be greater than train_sp_batch_size"
	for iter in range(train_batch_size * sp_size // train_sp_batch_size):
	st_idx = iter * train_sp_batch_size
	ed_idx = (iter + 1) * train_sp_batch_size
	yield (
	latents[st_idx:ed_idx],
	text_states[st_idx:ed_idx],
	text_mask[st_idx:ed_idx],
	uncond_text_states[st_idx:ed_idx],
	uncond_text_mask[st_idx:ed_idx],
	)


	def parallelize_model(model):
	original_forward = model.forward

	@functools.wraps(model.__class__.forward)
	def new_forward(
	self,
	hidden_states: torch.Tensor,
	timestep: torch.LongTensor,
	text_states: torch.Tensor,
	text_states_2: torch.Tensor,
	encoder_attention_mask: torch.Tensor,
	output_features=False,
	output_features_stride=8,
	attention_kwargs=None,
	freqs_cos=None,
	freqs_sin=None,
	return_dict=False,
	guidance=None,
	):
	x = hidden_states
	sp_size = nccl_info.sp_size
	sp_rank = nccl_info.rank_within_group

	if x.shape[-2] // 2 % sp_size == 0:
	# try to split x by height
	split_dim = -2
	elif x.shape[-1] // 2 % sp_size == 0:
	# try to split x by width
	split_dim = -1
	else:
	raise ValueError(f"Cannot split video sequence into ulysses_degree ({sp_size}) parts evenly")

	_, _, ot, oh, ow = x.shape
	tt, th, tw = (
	ot // self.patch_size[0],
	oh // self.patch_size[1],
	ow // self.patch_size[2],
	)
	freqs_cos, freqs_sin = self.get_rotary_pos_embed((tt, th, tw))
	# patch sizes for the temporal, height, and width dimensions are 1, 2, and 2.
	temporal_size, h, w = x.shape[2], x.shape[3] // 2, x.shape[4] // 2

	x = torch.chunk(x, sp_size,dim=split_dim)[sp_rank]

	dim_thw = freqs_cos.shape[-1]
	freqs_cos = freqs_cos.reshape(temporal_size, h, w, dim_thw)
	freqs_cos = torch.chunk(freqs_cos, sp_size,dim=split_dim - 1)[sp_rank]
	freqs_cos = freqs_cos.reshape(-1, dim_thw)
	dim_thw = freqs_sin.shape[-1]
	freqs_sin = freqs_sin.reshape(temporal_size, h, w, dim_thw)
	freqs_sin = torch.chunk(freqs_sin, sp_size,dim=split_dim - 1)[sp_rank]
	freqs_sin = freqs_sin.reshape(-1, dim_thw)

	output = original_forward(
	x,
	timestep,
	text_states,
	text_states_2,
	encoder_attention_mask,
	output_features,
	output_features_stride,
	attention_kwargs,
	freqs_cos,
	freqs_sin,
	return_dict,
	guidance,
	)

	return_dict = not isinstance(output, tuple)
	shape = (tt, th, tw)
	if return_dict:
	assert not output_features, "output_feature is not compatible with return_dict"
	sample = output["x"]
	sample = all_gather(sample, dim=split_dim)
	output["x"] = sample
	else:
	sample = output[0]
	sample = all_gather(sample, dim=split_dim)
	if output_features:
	features_list = output[1]
	features_list = all_gather(features_list, dim=split_dim)
	else:
	features_list = None

	output = (sample, features_list, shape)
	return output

	new_forward = new_forward.__get__(model)
	model.forward = new_forward


	def all_reduce_tensor_item(item):
	world_size = int(os.environ["WORLD_SIZE"])
	item = item.detach().clone()
	dist.all_reduce(item, op=dist.ReduceOp.SUM)
	item = item / nccl_info.ts_group_size if get_teacher_student_parallel_state() else item / world_size
	return item

	def broadcast_item(item, idx):
	item_list = [item]
	dist.broadcast_object_list(item_list, src=idx)
	return item_list[0]