fix: fix rms norm sharding strategy

Browse files

Files changed (3) hide show

tests/test_rms_norm_sequence_parallel.py +21 -21
torch-ext/activation/rms_norm.py +0 -1
torch-ext/activation/rms_norm_meta.py +36 -16

tests/test_rms_norm_sequence_parallel.py CHANGED Viewed

@@ -6,6 +6,10 @@ import pytest
 import torch
 import torch.distributed as dist
 from packaging import version
 from torch.distributed.tensor.placement_types import (Partial, Placement,
                                                       Replicate, Shard)
@@ -13,17 +17,6 @@ import activation
 from .utils import assert_close, opcheck
-DTYPES = [torch.float32]
-NUM_TOKENS = [512]  # Arbitrary values for testing
-SEQUENCE_DIMS = [0, 1]  # 0 is for [T, D] (packed), 1 is for [B, S, D]
-D = [16]  # Arbitrary values for testing
-SEEDS = [0]
-from torch.distributed._tensor import DTensor
-from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
-from torch.distributed.tensor.parallel import (SequenceParallel,
-                                               parallelize_module)
 @pytest.fixture(scope="session", autouse=True)
 def init_dist(request):
@@ -58,6 +51,13 @@ class Model(torch.nn.Module):
         return self.rms_norm(x)
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -106,12 +106,16 @@ def test_rms_norm_sequence_parallel(
     parallelize_module(
         model_sharded, mesh,
         {"rms_norm": SequenceParallel(sequence_dim=sequence_dim)})
-    x_sharded = DTensor.from_local(
-        x.chunk(num_ranks, dim=sequence_dim)[rank].contiguous(),
-        placements=(Shard(sequence_dim), ),
         device_mesh=mesh,
     )
-    y = model_sharded(x_sharded)
     y_from_sharded = y.full_tensor()
     model_unsharded = Model(num_tokens, d).to(dtype=dtype).cuda()
@@ -123,15 +127,11 @@ def test_rms_norm_sequence_parallel(
     # Backward
     y_grad = torch.randn_like(y_from_unsharded)
-    y_from_sharded.backward(y_grad)
     y_from_unsharded.backward(y_grad)
-    weight_grad_from_sharded = model_sharded.rms_norm.weight.grad._local_tensor
     weight_grad_from_unsharded = model_unsharded.rms_norm.weight.grad
-    torch.distributed.all_reduce(x.grad, op=torch.distributed.ReduceOp.SUM)
-    torch.distributed.all_reduce(weight_grad_from_sharded,
-                                 op=torch.distributed.ReduceOp.SUM)
     assert_close(x.grad, x_ref.grad)
     assert_close(weight_grad_from_sharded, weight_grad_from_unsharded)

 import torch
 import torch.distributed as dist
 from packaging import version
+from torch.distributed._tensor import DTensor
+from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
+from torch.distributed.tensor.parallel import (SequenceParallel,
+                                               parallelize_module)
 from torch.distributed.tensor.placement_types import (Partial, Placement,
                                                       Replicate, Shard)
 from .utils import assert_close, opcheck
 @pytest.fixture(scope="session", autouse=True)
 def init_dist(request):
         return self.rms_norm(x)
+DTYPES = [torch.float32]
+NUM_TOKENS = [512]  # Arbitrary values for testing
+SEQUENCE_DIMS = [0, 1]  # 0 is for [T, D] (packed), 1 is for [B, S, D]
+D = [16]  # Arbitrary values for testing
+SEEDS = [0]
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
     parallelize_module(
         model_sharded, mesh,
         {"rms_norm": SequenceParallel(sequence_dim=sequence_dim)})
+    x_replicate = DTensor.from_local(
+        x,
+        placements=(Replicate(), ),
         device_mesh=mesh,
     )
+    # Input will be redistributed in SequenceParallel
+    y = model_sharded(x_replicate)
     y_from_sharded = y.full_tensor()
     model_unsharded = Model(num_tokens, d).to(dtype=dtype).cuda()
     # Backward
     y_grad = torch.randn_like(y_from_unsharded)
     y_from_unsharded.backward(y_grad)
+    y_from_sharded.backward(y_grad)
+    weight_grad_from_sharded = model_sharded.rms_norm.weight.grad.full_tensor()
     weight_grad_from_unsharded = model_unsharded.rms_norm.weight.grad
     assert_close(x.grad, x_ref.grad)
     assert_close(weight_grad_from_sharded, weight_grad_from_unsharded)

torch-ext/activation/rms_norm.py CHANGED Viewed

@@ -29,7 +29,6 @@ class RMSNormFunction(torch.autograd.Function):
         input_grad, weight_grad = ops.rms_norm_backward(
             output_grad, input, weight, eps)
         return input_grad, weight_grad, None

         input_grad, weight_grad = ops.rms_norm_backward(
             output_grad, input, weight, eps)
         return input_grad, weight_grad, None

torch-ext/activation/rms_norm_meta.py CHANGED Viewed

@@ -4,6 +4,9 @@ import torch
 from torch.distributed.tensor._dtensor_spec import DTensorSpec
 from torch.distributed.tensor._op_schema import (OpSchema, OpSpec, OpStrategy,
                                                  RuntimeSchemaInfo)
 from torch.distributed.tensor._ops.utils import (generate_redistribute_costs,
                                                  register_op_strategy)
 from torch.distributed.tensor.placement_types import (Placement, Replicate,
@@ -19,17 +22,6 @@ def register_rms_norm_meta():
     pass
-def _replicate_dims_start_at(placements: Sequence[Placement],
-                             start_dim: int = 0) -> tuple[Placement, ...]:
-    new_placements: list[Placement] = []
-    for p in placements:
-        if p.is_partial() or (isinstance(p, Shard) and p.dim >= start_dim):
-            new_placements.append(Replicate())  # make it replicate
-        else:
-            new_placements.append(p)  # keep the placement
-    return tuple(new_placements)
 @register_op_strategy(ops.rms_norm.default, schema_info=RuntimeSchemaInfo(1))
 def rms_norm_strategy(op_schema: OpSchema) -> OpStrategy:
     mesh = op_schema.get_mesh_from_args()
@@ -71,7 +63,7 @@ def rms_norm_strategy(op_schema: OpSchema) -> OpStrategy:
         # Weight cannot be sharded, so always replicate it.
         weight_tgt = DTensorSpec(
             mesh=mesh,
-            placements=(Replicate(), ),
             tensor_meta=weight_src.tensor_meta,
         )
         redistribute_costs.append(
@@ -119,6 +111,8 @@ def rms_norm_backward_strategy(op_schema: OpSchema) -> OpStrategy:
     )
     last_dim = input_strategy.ndim - 1
     strategy = OpStrategy([])
     for output_grad, input, weight in zipped:
         output_grad_src = output_grad.output_spec
@@ -134,7 +128,7 @@ def rms_norm_backward_strategy(op_schema: OpSchema) -> OpStrategy:
         # Output grad can be sharded in any dim except the last dim.
         output_grad_tgt = DTensorSpec(
             mesh=mesh,
-            placements=_replicate_dims_start_at(output_grad_src.placements,
                                                 last_dim),
             tensor_meta=output_grad_src.tensor_meta,
         )
@@ -142,22 +136,48 @@ def rms_norm_backward_strategy(op_schema: OpSchema) -> OpStrategy:
             generate_redistribute_costs(output_grad_strategy, output_grad_tgt))
         # Input must have the same sharding as output grad.
-        input_tgt = output_grad_tgt
         redistribute_costs.append(
             generate_redistribute_costs(input_strategy, input_tgt))
         # Weight cannot be sharded, so always replicate it.
         weight_tgt = DTensorSpec(
             mesh=mesh,
-            placements=(Replicate(), ),
             tensor_meta=weight_src.tensor_meta,
         )
         redistribute_costs.append(
             generate_redistribute_costs(weight_strategy, weight_tgt))
         strategy.strategies.append(
             OpSpec(
-                output_specs=[input_tgt, weight_tgt],
                 input_specs=[output_grad_tgt, input_tgt, weight_tgt],
                 redistribute_cost=redistribute_costs,
             ))

 from torch.distributed.tensor._dtensor_spec import DTensorSpec
 from torch.distributed.tensor._op_schema import (OpSchema, OpSpec, OpStrategy,
                                                  RuntimeSchemaInfo)
+from torch.distributed.tensor._ops._math_ops import (
+    _infer_reduce_dims_map, _replicate_dims_start_at,
+    map_placements_after_reduction)
 from torch.distributed.tensor._ops.utils import (generate_redistribute_costs,
                                                  register_op_strategy)
 from torch.distributed.tensor.placement_types import (Placement, Replicate,
     pass
 @register_op_strategy(ops.rms_norm.default, schema_info=RuntimeSchemaInfo(1))
 def rms_norm_strategy(op_schema: OpSchema) -> OpStrategy:
     mesh = op_schema.get_mesh_from_args()
         # Weight cannot be sharded, so always replicate it.
         weight_tgt = DTensorSpec(
             mesh=mesh,
+            placements=_replicate_dims_start_at(weight_src.placements),
             tensor_meta=weight_src.tensor_meta,
         )
         redistribute_costs.append(
     )
     last_dim = input_strategy.ndim - 1
+    outer_dims = list(range(last_dim))
     strategy = OpStrategy([])
     for output_grad, input, weight in zipped:
         output_grad_src = output_grad.output_spec
         # Output grad can be sharded in any dim except the last dim.
         output_grad_tgt = DTensorSpec(
             mesh=mesh,
+            placements=_replicate_dims_start_at(input_src.placements,
                                                 last_dim),
             tensor_meta=output_grad_src.tensor_meta,
         )
             generate_redistribute_costs(output_grad_strategy, output_grad_tgt))
         # Input must have the same sharding as output grad.
+        input_tgt = DTensorSpec(
+            mesh=mesh,
+            placements=_replicate_dims_start_at(input_src.placements,
+                                                last_dim),
+            tensor_meta=input_src.tensor_meta,
+        )
         redistribute_costs.append(
             generate_redistribute_costs(input_strategy, input_tgt))
         # Weight cannot be sharded, so always replicate it.
         weight_tgt = DTensorSpec(
             mesh=mesh,
+            placements=_replicate_dims_start_at(weight_src.placements),
             tensor_meta=weight_src.tensor_meta,
         )
         redistribute_costs.append(
             generate_redistribute_costs(weight_strategy, weight_tgt))
+        # from torch/distributed/tensor/_ops/_math_ops.py::layer_norm_bwd_strategy()
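+        # Weight cannot be sharded, so it is always replicated; the weight
+        # grad placements are derived from the input placements after a "sum"
+        # reduction over the outer dims.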
+        # TODO: now d_weight spec follows input spec w/ a reduction.
+        # we may need to change to a pointwise rule over grad_out and
+        # input, then apply a reduction.
+        inp_placements = _replicate_dims_start_at(input_src.placements,
+                                                  last_dim)
+        reduce_dims_map = _infer_reduce_dims_map(outer_dims, input_src.ndim,
+                                                 False)
+        out_placements = map_placements_after_reduction(
+            inp_placements, outer_dims, reduce_dims_map, "sum")
+        weight_grad_tgt = DTensorSpec(
+            mesh=mesh,
+            placements=out_placements,
+            tensor_meta=weight_src.tensor_meta,
+        )
+        input_grad_tgt = output_grad_tgt
         strategy.strategies.append(
             OpSpec(
+                output_specs=[input_grad_tgt, weight_grad_tgt],
                 input_specs=[output_grad_tgt, input_tgt, weight_tgt],
                 redistribute_cost=redistribute_costs,
             ))