# SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from physicsnemo.distributed.manager import DistributedManager
from physicsnemo.distributed.mappings import (
    gather_from_parallel_region,
    scatter_to_parallel_region,
)
from physicsnemo.distributed.utils import (
    distributed_transpose,
    pad_helper,
    truncate_helper,
)


def conj_pad_helper_2d(tensor, pad_dim, other_dim, new_size):
    """Pad ``tensor`` along ``pad_dim`` from its current size up to ``new_size``
    using conjugate (Hermitian) symmetry. The tensor is gathered along
    ``other_dim`` so that the padded region can be mirrored globally, then
    scattered back."""
    ndim = tensor.ndim
    pad_dim = (pad_dim + ndim) % ndim
    other_dim = (other_dim + ndim) % ndim

    # pad with the complex conjugate of the kept half
    orig_size = tensor.shape[pad_dim]
    tensor_pad = pad_helper(tensor, pad_dim, new_size, mode="conj")

    # gather so the mirror flip below sees the full extent of other_dim
    tensor_pad_gather = gather_from_parallel_region(
        tensor_pad, dim=other_dim, group="spatial_parallel"
    )

    # mirror the padded slab: flip along other_dim, skipping the DC entry at
    # index 0 (which maps to itself under conjugate symmetry)
    flip_slice = tuple(
        slice(orig_size, new_size)
        if idx == pad_dim
        else slice(1, x)
        if idx == other_dim
        else slice(0, x)
        for idx, x in enumerate(tensor_pad_gather.shape)
    )
    tensor_pad_gather[flip_slice] = torch.flip(
        tensor_pad_gather[flip_slice], dims=[other_dim]
    )

    # scatter back across other_dim
    result = scatter_to_parallel_region(
        tensor_pad_gather, dim=other_dim, group="spatial_parallel"
    )
    return result
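

# Single-process sketch (not part of the library API) of the symmetry that
# conj_pad_helper_2d exploits: for a real input, the full 2D spectrum obeys
# X[i, j] = conj(X[-i % H, -j % W]), so the columns dropped by rfft2 can be
# rebuilt from the kept half by a conjugate plus a flip along the other dim.
# The helper below is hypothetical and only illustrates the local (world size
# 1) equivalent of the distributed pad/gather/flip above.
def _check_hermitian_reconstruction():
    H, W = 4, 6
    x = torch.randn(H, W)
    full = torch.fft.fft2(x)
    half = torch.fft.rfft2(x)  # keeps columns 0 .. W // 2 only
    # rebuild columns W//2+1 .. W-1: conjugate, flip both dims; row 0 is the
    # DC entry and maps to itself, hence the roll by one
    rebuilt = torch.conj(
        torch.roll(torch.flip(half[:, 1 : W // 2], dims=[0, 1]), shifts=1, dims=0)
    )
    assert torch.allclose(rebuilt, full[:, W // 2 + 1 :], atol=1e-5)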


class DistributedRFFT2(torch.autograd.Function):
    """
    Autograd wrapper for a distributed 2D real-to-complex FFT primitive.

    It is based on the idea of a single global tensor which is distributed
    along a specified dimension into chunks of equal size.
    This primitive computes a 1D FFT first along dim[0], then performs
    an AllToAll transpose before computing a 1D FFT along dim[1].
    The backward pass performs an IFFT operation with communication
    in the opposite order as in the forward pass.

    For the forward method, data should be split along dim[1] across the
    "spatial_parallel" process group. The output is data split along dim[0].
    """
| def forward(ctx, x, s, dim, norm="ortho"): | |
| """ | |
| Forward pass for DistributedRFFT2. | |
| """ | |
| # NVTX marker | |
| torch.cuda.nvtx.range_push("DistributedRFFT2.forward") | |
| # save: | |
| ctx.s = s | |
| ctx.dim = dim | |
| ctx.norm = norm | |
| # assume last dim is split (second to last is contiguous): | |
| x1 = torch.fft.fft(x, n=s[0], dim=dim[0], norm=norm) | |
| torch.cuda.nvtx.range_pop() | |
| # transpose | |
| x1_recv, _ = distributed_transpose( | |
| x1, | |
| dim[0], | |
| dim[1], | |
| group=DistributedManager().group("spatial_parallel"), | |
| async_op=False, | |
| ) | |
| x1_tran = torch.cat(x1_recv, dim=dim[1]) | |
| torch.cuda.nvtx.range_pop() | |
| # another fft: | |
| x2 = torch.fft.fft(x1_tran, n=s[1], dim=dim[1], norm=norm) | |
| torch.cuda.nvtx.range_pop() | |
| # truncate in last dim: | |
| ctx.last_dim_size = x2.shape[dim[1]] | |
| last_dim_size_trunc = ctx.last_dim_size // 2 + 1 | |
| output = truncate_helper(x2, dim[1], last_dim_size_trunc) | |
| # pop range | |
| torch.cuda.nvtx.range_pop() | |
| return output | |

    @staticmethod
    def backward(ctx, grad_output):
        """
        Backward pass for DistributedRFFT2.
        """
        # load saved values
        dim = ctx.dim
        norm = ctx.norm
        s = ctx.s
        last_dim_size = ctx.last_dim_size

        # zero-pad the truncated spectrum back to its full size
        g_pad = pad_helper(grad_output, dim[1], last_dim_size)

        # inverse FFT along dim[1]
        g1 = torch.fft.ifft(g_pad, n=s[1], dim=dim[1], norm=norm)

        # AllToAll transpose: communication in the opposite order of forward
        g1_recv, _ = distributed_transpose(
            g1,
            dim[1],
            dim[0],
            group=DistributedManager().group("spatial_parallel"),
            async_op=False,
        )
        g1_tran = torch.cat(g1_recv, dim=dim[0])

        # inverse FFT along dim[0]; the forward input was real, so keep the real part
        grad_input = torch.real(torch.fft.ifft(g1_tran, n=s[0], dim=dim[0], norm=norm))

        return grad_input, None, None, None
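

# Hedged usage sketch (illustrative only, not part of this module): invoking
# the primitive through torch.autograd.Function.apply. It assumes
# DistributedManager has been initialized, a "spatial_parallel" group of size
# P exists, and `local_x` is the rank-local shard of a real (..., H, W)
# tensor split along the last dim; `H` and `W` are the global spatial sizes.
def _example_distributed_rfft2(local_x, H, W):
    # local_x: real shard of shape (..., H, W // P), split along dim -1;
    # the returned spectrum shard is split along dim -2 instead
    return DistributedRFFT2.apply(local_x, (H, W), (-2, -1), "ortho")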


class DistributedIRFFT2(torch.autograd.Function):
    """
    Autograd wrapper for a distributed 2D complex-to-real IFFT primitive.

    It is based on the idea of a single global tensor which is distributed
    along a specified dimension into chunks of equal size.
    This primitive computes a 1D IFFT first along dim[1], then performs
    an AllToAll transpose before computing a 1D IFFT along dim[0].
    The backward pass performs an FFT operation with communication
    in the opposite order as in the forward pass.

    For the forward method, data should be split along dim[0] across the
    "spatial_parallel" process group. The output is data split along dim[1].
    """
| def forward(ctx, x, s, dim, norm="ortho"): | |
| """ | |
| Forward pass for DistributedIRFFT2. | |
| """ | |
| # NVTX marker | |
| torch.cuda.nvtx.range_push("DistributedIRFFT2.forward") | |
| # save: | |
| ctx.s = s | |
| ctx.dim = dim | |
| ctx.norm = norm | |
| ctx.orig_dim_size = x.shape[dim[1]] | |
| if s is not None: | |
| first_dim_size = s[0] | |
| ctx.last_dim_size = s[1] | |
| else: | |
| first_dim_size = x.shape[dim[0]] | |
| ctx.last_dim_size = 2 * (ctx.orig_dim_size - 1) | |
| # fft in contig contig dim | |
| x_pad = conj_pad_helper_2d(x, dim[1], dim[0], ctx.last_dim_size) | |
| x1 = torch.fft.ifft(x_pad, n=ctx.last_dim_size, dim=dim[1], norm=norm) | |
| # transpose | |
| x1_recv, _ = distributed_transpose( | |
| x1, | |
| dim[1], | |
| dim[0], | |
| group=DistributedManager().group("spatial_parallel"), | |
| async_op=False, | |
| ) | |
| x1_tran = torch.cat(x1_recv, dim=dim[0]) | |
| # ifft in contig dim | |
| x2 = torch.fft.ifft(x1_tran, n=first_dim_size, dim=dim[0], norm=norm) | |
| # take real part | |
| output = torch.real(x2).contiguous() | |
| # pop range | |
| torch.cuda.nvtx.range_pop() | |
| return output | |

    @staticmethod
    def backward(ctx, grad_output):
        """
        Backward pass for DistributedIRFFT2.
        """
        # load saved values
        dim = ctx.dim
        norm = ctx.norm
        orig_dim_size = ctx.orig_dim_size

        # FFT along dim[0]
        g1 = torch.fft.fft(grad_output, dim=dim[0], norm=norm)

        # AllToAll transpose: communication in the opposite order of forward
        g1_recv, _ = distributed_transpose(
            g1,
            dim[0],
            dim[1],
            group=DistributedManager().group("spatial_parallel"),
            async_op=False,
        )
        g1_tran = torch.cat(g1_recv, dim=dim[1])

        # FFT along dim[1]:
        x2 = torch.fft.fft(g1_tran, dim=dim[1], norm=norm)

        # truncate back to the one-sided spectrum size
        grad_input = truncate_helper(x2, dim[1], orig_dim_size)

        return grad_input, None, None, None
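

# Hedged round-trip sketch (illustrative only): with norm="ortho" the two
# primitives should invert each other up to floating-point error. Assumes the
# same "spatial_parallel" setup as above; `local_x` is a real shard split
# along dim -1 and the intermediate spectrum shard comes back split along -2,
# matching the split that DistributedIRFFT2 expects on input.
def _example_roundtrip(local_x, H, W):
    spec = DistributedRFFT2.apply(local_x, (H, W), (-2, -1), "ortho")
    out = DistributedIRFFT2.apply(spec, (H, W), (-2, -1), "ortho")
    return out  # ≈ local_x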