# SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Set,
Tuple,
TypeVar,
Union,
)
import torch
from physicsnemo.utils.version_check import check_module_requirements
check_module_requirements("physicsnemo.distributed.shard_tensor")
from torch.distributed.tensor.placement_types import ( # noqa: E402
Partial,
Shard,
)
from physicsnemo.distributed.shard_tensor import ShardTensor # noqa: E402
aten = torch.ops.aten
# Type variable for dimension parameter
DimT = TypeVar("DimT", None, int, Iterable[int])
def normalize_dim(
dim: DimT, tensor_ndim: int, as_set: bool = False, handle_negatives: bool = True
) -> Union[Optional[Tuple[int, ...]], Set[int]]:
"""
Normalize dimension argument to a consistent form.
Args:
dim: The dimension(s) to normalize. Can be None, int, or iterable of ints.
tensor_ndim: Number of dimensions in the tensor.
as_set: If True, return a set of dimensions instead of a tuple.
handle_negatives: If True, convert negative dimensions to positive ones.
Returns:
- None if dim is None and as_set is False
- A set of all dimensions if dim is None and as_set is True
- A tuple of dimensions (or set if as_set is True)
"""
if dim is None:
if as_set:
return set(range(tensor_ndim))
return None
# Convert to tuple if iterable
if isinstance(dim, Iterable) and not isinstance(dim, torch.Tensor):
dims = tuple(dim)
else:
dims = (dim,)
# Handle negative dimensions
if handle_negatives:
dims = tuple(d % tensor_ndim for d in dims)
# Return as set or tuple based on as_set flag
if as_set:
return set(dims)
return dims
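
# Illustrative behavior of normalize_dim (a hedged sketch of the intended
# semantics, not an exhaustive spec):
#   normalize_dim(None, 3)                 -> None
#   normalize_dim(None, 3, as_set=True)    -> {0, 1, 2}
#   normalize_dim(-1, 3)                   -> (2,)   # negatives wrapped via d % ndim
#   normalize_dim((0, -1), 4, as_set=True) -> {0, 3}
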
def is_full_reduction(dim: DimT, tensor_ndim: int) -> bool:
"""
Determine if this is a full reduction.
Args:
dim: The dimension(s) to check. Can be None, int, or iterable of ints.
tensor_ndim: Number of dimensions in the tensor.
Returns:
bool: True if all dimensions are being reduced, False otherwise.
"""
if dim is None:
return True
if isinstance(dim, Iterable) and len(dim) == tensor_ndim:
return True
return False
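
# Example (sketch): is_full_reduction(None, 3) and is_full_reduction((0, 1, 2), 3)
# are both True, while is_full_reduction(1, 3) is False. Note that a single int
# dim is never treated as a full reduction by this helper, even when
# tensor_ndim == 1; only dim=None or an iterable covering every dim qualifies.
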
def compute_result_placements(
tensor: ShardTensor, dim: DimT, reduction_name: str, keepdim: bool = False
) -> List[Union[Partial, Shard]]:
"""
Compute placement info for reduction result.
Args:
tensor: The input ShardTensor being reduced.
dim: The dimension(s) to reduce. Can be None, int, or iterable of ints.
reduction_name: Type of reduction operation ("sum", "avg", etc.).
keepdim: Whether to preserve reduced dimensions with size 1.
Returns:
List[Union[Partial, Shard]]: Placement specifications for the result tensor.
"""
if is_full_reduction(dim, tensor.ndim):
return [
p
if p.is_replicate()
else Partial("sum" if reduction_name != "avg" else "avg")
for p in tensor._spec.placements
]
# Use enhanced normalize_dim to get dimensions as a set
dims = normalize_dim(dim, tensor.ndim, as_set=True)
placements = []
for p in tensor._spec.placements:
if isinstance(p, Shard):
shard_dim = p.dim
# Count how many reduction dims are less than this shard dim
num_lower = sum(1 for d in dims if d < shard_dim)
# If this sharded dim is being reduced, it becomes Partial
if shard_dim in dims:
placements.append(Partial(reduction_name))
else:
# If keepdim is False, dims to the left are removed, so shift left
new_dim = shard_dim - num_lower if not keepdim else shard_dim
placements.append(Shard(new_dim))
else:
placements.append(p)
return placements
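
# Worked example (assumes a 1-D mesh; a sketch only): for a rank-3 tensor with
# placements [Shard(1)], reducing over dim=0 with keepdim=False shifts the shard
# dimension left to Shard(0), while reducing over dim=1 instead turns the
# placement into Partial("sum"), since each rank then holds only a partial
# result along that axis.
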
def reduction_shape(
S: torch.Size, dim: DimT = None, keepdim: bool = False
) -> torch.Size:
"""
Calculate the resulting shape after a reduction operation.
Args:
S: Original shape of the tensor.
dim: The dimension(s) to reduce. Can be None, int, or iterable of ints.
keepdim: Whether to preserve reduced dimensions with size 1.
Returns:
torch.Size: The shape after reduction.
"""
shape = list(S)
if dim is None:
return torch.Size([1] * len(shape)) if keepdim else torch.Size([])
# Use enhanced normalize_dim to handle iterable and negative dims
dim = normalize_dim(dim, len(shape), handle_negatives=True)
if keepdim:
for d in dim:
shape[d] = 1
else:
for d in sorted(dim, reverse=True):
del shape[d]
return torch.Size(shape)
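
# Illustrative shapes (sketch):
#   reduction_shape(torch.Size([2, 3, 4]), dim=1)               -> torch.Size([2, 4])
#   reduction_shape(torch.Size([2, 3, 4]), dim=1, keepdim=True) -> torch.Size([2, 1, 4])
#   reduction_shape(torch.Size([2, 3, 4]))                      -> torch.Size([])
#   reduction_shape(torch.Size([2, 3, 4]), keepdim=True)        -> torch.Size([1, 1, 1])
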
def compute_result_sharding_shapes(
tensor: ShardTensor, dim: DimT, keepdim: bool
) -> Dict[int, List[torch.Size]]:
"""
Compute sharding sizes for the result of a reduction operation.
Args:
tensor: The input ShardTensor being reduced.
dim: The dimension(s) to reduce. Can be None, int, or iterable of ints.
keepdim: Whether to preserve reduced dimensions with size 1.
Returns:
Dict[int, List[torch.Size]]: Mapping of mesh dimensions to sharding shapes.
"""
if is_full_reduction(dim, tensor.ndim):
return {}
else:
# Create a dictionary to store sharding sizes for dimensions that remain in the output
result_sharding_shapes = {}
# Get the original sharding sizes
original_sharding_shapes = tensor._spec.sharding_shapes()
# Use normalize_dim directly
normalized_dim = normalize_dim(dim, tensor.ndim)
for mesh_dim, sharding_shapes in original_sharding_shapes.items():
result_sharding_shapes[mesh_dim] = [
reduction_shape(shape, normalized_dim, keepdim)
for shape in sharding_shapes
]
return result_sharding_shapes
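
# Example (sketch): if the input's sharding shapes on mesh dim 0 are
# [torch.Size([2, 3]), torch.Size([2, 5])] (a rank-2 tensor sharded unevenly on
# dim 1), then reducing over dim=0 with keepdim=False yields
# {0: [torch.Size([3]), torch.Size([5])]}; a full reduction returns {} instead.
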
def create_sharded_grad_input(
local_grad_input: torch.Tensor, original_spec: Any
) -> ShardTensor:
"""
Create a ShardTensor from local gradient input.
Args:
local_grad_input: The local gradient tensor.
original_spec: The original ShardTensor's spec to use for placement.
Returns:
ShardTensor: A distributed tensor with the same sharding as the original input.
"""
return ShardTensor.from_local(
local_grad_input,
device_mesh=original_spec.mesh,
placements=original_spec.placements,
sharding_shapes=original_spec.sharding_shapes(),
)
# Base class for sharded reductions
class ShardedReductionBase(torch.autograd.Function):
"""Base class for implementing custom autograd functions for sharded tensor reductions."""
@staticmethod
def setup_ctx(
ctx: Any, tensor: ShardTensor, dim: DimT, keepdim: bool
) -> Tuple[Optional[Tuple[int, ...]], bool]:
"""
Save common context information for backward pass.
Args:
ctx: The autograd context object.
tensor: The input ShardTensor being reduced.
dim: The dimension(s) to reduce.
keepdim: Whether to preserve reduced dimensions with size 1.
Returns:
Tuple[Optional[Tuple[int, ...]], bool]: Normalized dimension and keepdim flag.
"""
ctx.original_spec = tensor._spec
ctx.output_requires_grad = tensor.requires_grad
# Normalize dim to tuple form
dim = normalize_dim(dim, tensor.ndim)
# Ensure keepdim is a boolean
keepdim = bool(keepdim)
ctx.dim = dim
ctx.keepdim = keepdim
ctx.is_full_reduction = is_full_reduction(dim, tensor.ndim)
# Save the shape of the local tensor
ctx.local_grad_shape = tensor._local_tensor.shape
return dim, keepdim
# Specific reduction implementations
class ShardedSum(ShardedReductionBase):
"""
Custom autograd function for sum reduction of sharded tensors.
Handles both forward and backward passes with proper gradient computation.
"""
@staticmethod
def forward(
ctx: Any,
tensor: ShardTensor,
dim: DimT = None,
keepdim: bool = False,
dtype: Optional[torch.dtype] = None,
) -> ShardTensor:
"""
Forward pass for sum reduction on ShardTensor.
Args:
ctx: The autograd context object.
tensor: The input ShardTensor to be reduced.
dim: The dimension(s) to reduce.
keepdim: Whether to preserve reduced dimensions with size 1.
dtype: Output data type (optional).
Returns:
ShardTensor: The result of sum reduction.
"""
dim, keepdim = ShardedReductionBase.setup_ctx(ctx, tensor, dim, keepdim)
# Get local tensor
local_tensor = tensor._local_tensor
# Perform local sum
local_result = aten.sum(local_tensor, dim=dim, keepdim=keepdim, dtype=dtype)
# Compute placements for the result
placements = compute_result_placements(tensor, dim, "sum")
output_sharding_shapes = compute_result_sharding_shapes(tensor, dim, keepdim)
# Create result ShardTensor
result = ShardTensor.from_local(
local_result,
tensor.device_mesh,
placements,
sharding_shapes=output_sharding_shapes,
)
return result
@staticmethod
def backward(
ctx: Any, grad_output: ShardTensor
) -> Tuple[ShardTensor, None, None, None]:
"""
Backward pass for sum reduction.
Args:
ctx: The autograd context object.
grad_output: Gradient of the loss with respect to the output.
Returns:
Tuple containing gradients for each input in the forward pass.
"""
original_spec = ctx.original_spec
dim = ctx.dim
is_full_reduction = ctx.is_full_reduction
keepdim = ctx.keepdim
local_grad_shape = ctx.local_grad_shape
# Get local grad output
local_grad_output = grad_output._local_tensor
if is_full_reduction:
# For full reduction, broadcast to original size
grad_input = local_grad_output.expand(local_grad_shape)
else:
# For dimension-specific reduction
if keepdim:
# Just expand along reduced dimensions
expand_shape = list(local_grad_shape)
grad_input = local_grad_output.expand(expand_shape)
else:
# Need to unsqueeze first
grad_shape = list(local_grad_output.shape)
for d in sorted(dim):
if d < 0:
d += original_spec.tensor_meta.ndim
grad_shape.insert(d, 1)
grad_expanded = local_grad_output.reshape(grad_shape)
expand_shape = list(local_grad_shape)
grad_input = grad_expanded.expand(expand_shape)
# Create ShardTensor from local grad
grad_input = create_sharded_grad_input(grad_input, original_spec)
# Return gradients for all inputs
return grad_input, None, None, None
class ShardedMean(ShardedReductionBase):
"""
Custom autograd function for mean reduction of sharded tensors.
Handles both forward and backward passes with proper gradient computation and scaling.
"""
@staticmethod
def forward(
ctx: Any,
tensor: ShardTensor,
dim: DimT = None,
keepdim: bool = False,
dtype: Optional[torch.dtype] = None,
) -> ShardTensor:
"""
Forward pass for mean reduction on ShardTensor.
Args:
ctx: The autograd context object.
tensor: The input ShardTensor to be reduced.
dim: The dimension(s) to reduce.
keepdim: Whether to preserve reduced dimensions with size 1.
dtype: Output data type (optional).
Returns:
ShardTensor: The result of mean reduction.
"""
dim, keepdim = ShardedReductionBase.setup_ctx(ctx, tensor, dim, keepdim)
# Get local tensor
local_tensor = tensor._local_tensor
# Compute proper weighting for mean
weight = 1.0
# Normalize dimensions for consistent handling
if is_full_reduction(dim, tensor.ndim):
# For full reduction, use all dimensions
reduction_dims = set(range(tensor.ndim))
else:
# Only use the normalized dimensions for partial reduction
reduction_dims = dim
# Calculate weight based on local vs global shape ratio for reduction dimensions
local_shape = local_tensor.shape
global_shape = tensor.shape
for d in reduction_dims:
weight *= local_shape[d] / global_shape[d]
# Perform local mean
local_result = aten.mean(local_tensor, dim=dim, keepdim=keepdim, dtype=dtype)
# Apply weighting
local_result = local_result * weight
placements = compute_result_placements(tensor, dim, "sum")
output_sharding_shapes = compute_result_sharding_shapes(tensor, dim, keepdim)
# Create result ShardTensor
result = ShardTensor.from_local(
local_result,
tensor.device_mesh,
placements,
sharding_shapes=output_sharding_shapes,
)
return result
@staticmethod
def backward(
ctx: Any, grad_output: ShardTensor
) -> Tuple[ShardTensor, None, None, None]:
"""
Backward pass for mean reduction.
Args:
ctx: The autograd context object.
grad_output: Gradient of the loss with respect to the output.
Returns:
Tuple containing gradients for each input in the forward pass.
"""
original_spec = ctx.original_spec
dim = ctx.dim
is_full_reduction = ctx.is_full_reduction
keepdim = ctx.keepdim
local_grad_shape = ctx.local_grad_shape
global_shape = original_spec.tensor_meta.shape
# Get local grad output
local_grad_output = grad_output._local_tensor
if is_full_reduction:
# For full reduction, broadcast to original size with scaling
factor = 1.0 / torch.prod(torch.tensor(global_shape))
grad_input = local_grad_output.expand(local_grad_shape) * factor
else:
# For dimension-specific reduction
if keepdim:
# Just expand along reduced dimensions
expand_shape = list(local_grad_shape)
grad_input = local_grad_output.expand(expand_shape)
else:
# Need to unsqueeze first
grad_shape = list(local_grad_output.shape)
for d in sorted(dim):
if d < 0:
d += original_spec.tensor_meta.ndim
grad_shape.insert(d, 1)
grad_expanded = local_grad_output.reshape(grad_shape)
expand_shape = list(local_grad_shape)
grad_input = grad_expanded.expand(expand_shape)
# Apply scaling factor for mean
factor = 1.0
for d in dim:
if d < 0:
d += original_spec.tensor_meta.ndim
factor /= global_shape[d]
grad_input = grad_input * factor
# Create ShardTensor from local grad
grad_input = create_sharded_grad_input(grad_input, original_spec)
# Return gradients for all inputs
return grad_input, None, None, None
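
# Gradient-scaling sketch for ShardedMean.backward: for a mean over dim=1 of a
# global [4, 6] tensor, each input element receives grad_output / 6, i.e. the
# *global* size of the reduced dimension is used even when that dimension is
# sharded across ranks.
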
def sum_wrapper(
func: Callable, types: Any, args: Tuple[Any, ...], kwargs: Dict[str, Any]
) -> ShardTensor:
"""
Wrapper function for ShardTensor sum reduction.
In Args and kwargs:
tensor: Input ShardTensor to reduce.
dim: The dimension(s) to reduce.
keepdim: Whether to preserve reduced dimensions with size 1.
*args: Additional positional arguments.
**kwargs: Additional keyword arguments.
Returns:
ShardTensor: Result of sum reduction.
"""
tensor, dim, keepdim, extra_args, extra_kwargs = unpack_args(*args, **kwargs)
return ShardedSum.apply(tensor, dim, keepdim, *extra_args, **extra_kwargs)
# TODO - accept func, types, args, kwargs instead
def mean_wrapper(
func: Callable, types: Any, args: Tuple[Any, ...], kwargs: Dict[str, Any]
) -> ShardTensor:
"""
Wrapper function for ShardTensor mean reduction.
Args:
tensor: Input ShardTensor to reduce.
dim: The dimension(s) to reduce.
keepdim: Whether to preserve reduced dimensions with size 1.
*args: Additional positional arguments.
**kwargs: Additional keyword arguments.
Returns:
ShardTensor: Result of mean reduction.
"""
tensor, dim, keepdim, extra_args, extra_kwargs = unpack_args(*args, **kwargs)
return ShardedMean.apply(tensor, dim, keepdim, *extra_args, **extra_kwargs)
def unpack_args(
tensor: ShardTensor,
dim: DimT = None,
keepdim: bool = False,
*args: Any,
**kwargs: Any,
) -> Tuple[ShardTensor, DimT, bool, Tuple[Any, ...], Dict[str, Any]]:
"""
Unpack arguments for reduction functions. Maps default args from torch.
Returns:
tensor: Input ShardTensor to reduce.
dim: The dimension(s) to reduce.
"""
return tensor, dim, keepdim, args, kwargs
# Map the reduction ops to their handlers
reduction_mapping: Dict[str, Callable] = {
"sum": sum_wrapper,
"avg": mean_wrapper,
}
# Register handlers for standalone functions and methods
ShardTensor.register_function_handler(torch.mean, mean_wrapper)
ShardTensor.register_function_handler(torch.Tensor.mean, mean_wrapper)
ShardTensor.register_function_handler(torch.sum, sum_wrapper)
ShardTensor.register_function_handler(torch.Tensor.sum, sum_wrapper)
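
# Illustrative end-to-end usage (a sketch only; assumes an initialized
# torch.distributed process group and a 1-D DeviceMesh named `mesh`, neither of
# which is created by this module):
#
#   st = ShardTensor.from_local(local_chunk, mesh, [Shard(0)])
#   total = torch.sum(st)        # dispatches to sum_wrapper -> ShardedSum
#   row_mean = st.mean(dim=1)    # dispatches to mean_wrapper -> ShardedMean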