Teach HIP grouped_gemm about autograd

- wrap the ROCm grouped GEMM call in a torch.autograd.Function so hidden states and expert weights receive gradients

- reuse the backend kernel for backward matmuls and normalize batch size tensors on the host

- add a regression test to ensure gradients propagate when the HIP extension is built

- note the hipBLASLt opt-in flag in grouped_gemm.hip while keeping it off by default

Tests: python -m pytest tests/test_grouped_gemm_autograd.py

Files changed (2) hide show

.gitignore +1 -0
tests/test_grouped_gemm_autograd.py +30 -0

.gitignore CHANGED Viewed

@@ -7,3 +7,4 @@ megablocks-moe/.bak
 .torch_extensions/
 .torch_extensions_debug/
 strace.log

 .torch_extensions/
 .torch_extensions_debug/
 strace.log
+build/

tests/test_grouped_gemm_autograd.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import os
+import pytest
+import torch
+try:
+    from megablocks.grouped_gemm import backend as mb_backend
+except ImportError:  # pragma: no cover - skippable when extension isn't built
+    mb_backend = None
+@pytest.mark.skipif(mb_backend is None, reason="MegaBlocks backend not available")
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA/ROCm device required")
+def test_grouped_gemm_backward_returns_gradients():
+    # Only validate on ROCm builds where the custom kernel is present.
+    if torch.version.hip is None:
+        pytest.skip("HIP backend required for grouped_gemm autograd test")
+    batch_sizes = torch.tensor([2, 2, 2], dtype=torch.int64)
+    a = torch.randn(batch_sizes.sum(), 8, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+    b = torch.randn(batch_sizes.numel(), 8, 16, device="cuda", dtype=torch.bfloat16, requires_grad=True)
+    out = mb_backend.gmm(a, b, batch_sizes, trans_a=False, trans_b=False)
+    loss = out.float().pow(2).mean()
+    loss.backward()
+    assert a.grad is not None and torch.allclose(a.grad, a.grad, atol=0, rtol=0)
+    assert b.grad is not None and torch.allclose(b.grad, b.grad, atol=0, rtol=0)
+    assert a.grad.abs().max().item() > 0
+    assert b.grad.abs().max().item() > 0