rmsnorm

Model card Files Files and versions

xet

Community

Michael Benayoun committed on Feb 27

Commit

b444fe2

1 Parent(s): ab3f905

debug

Browse files

Files changed (1) hide show

build/torch-neuron/__init__.py +0 -67

build/torch-neuron/__init__.py CHANGED Viewed

@@ -21,9 +21,6 @@ def rmsnorm(hidden_states, weight, eps: float = 1e-6):
     Returns:
         Normalized tensor of shape (B, S, H)
     """
-    # Get input shape
-    original_shape = hidden_states.shape
     num_rows = 1
     for r in hidden_states.shape[:-1]:
         num_rows *= r
@@ -79,10 +76,7 @@ def rmsnorm(hidden_states, weight, eps: float = 1e-6):
         # Step 6: Normalize: row * rsqrt(variance + eps)
         # Broadcast rms_reciprocal across hidden_dim using tensor_scalar
         normalized = sbuf.view(dtype=dtype, shape=(rows, hidden_dim))
-        # rms_reciprocal_fp32 = sbuf.view(dtype=nl.float32, shape=(rows, 1))
-        # nisa.tensor_copy(dst=rms_reciprocal_fp32, src=rms_reciprocal)  # Convert to fp32 for better precision in multiplication
         nisa.tensor_scalar(normalized, row_tile, nl.multiply, rms_reciprocal)
-        # nisa.tensor_tensor(normalized, row_tile, rms_reciprocal, op=nl.multiply)
         # Step 7: Apply weight element-wise
         weight_tile_rows = sbuf.view(dtype=dtype, shape=(rows, hidden_dim))
@@ -102,67 +96,6 @@ def rmsnorm(hidden_states, weight, eps: float = 1e-6):
     return output_flat
-@nki.jit(platform_target="trn2")
-def rmsnorm_(hidden_states, weight, eps: float = 1e-6):
-    """
-    Optimized NKI kernel for RMSNorm.
-    """
-    # 1. Calculate shapes
-    B, S, H = hidden_states.shape
-    num_rows = B * S
-    hidden_dim = H
-    max_rows = nl.tile_size.pmax # Maximum hardware partition size (usually 128)
-    # 2. Allocate Output in HBM
-    output_flat = nl.ndarray(shape=(num_rows, hidden_dim), dtype=hidden_states.dtype, buffer=nl.hbm)
-    # 3. FAST WEIGHT LOADING: Load the 1D weight into SBUF exactly ONCE before the loop.
-    weight_sbuf = nl.ndarray(shape=(1, hidden_dim), dtype=weight.dtype, buffer=nl.sbuf)
-    nisa.dma_copy(dst=weight_sbuf, src=weight.reshape((1, hidden_dim)))
-    # 4. Process in chunks using NKI's hardware-optimized affine_range
-    # (Assuming num_rows is perfectly divisible by max_rows for standard tiling)
-    print("Num rows:", num_rows, "Max rows per tile:", max_rows)
-    for i in nl.affine_range(num_rows // max_rows):
-        # Calculate the exact memory offset for this specific chunk
-        offset = i * max_rows
-        # Allocate fast on-chip memory (SBUF) for our tiles
-        in_tile = nl.ndarray(shape=(max_rows, hidden_dim), dtype=hidden_states.dtype, buffer=nl.sbuf)
-        out_tile = nl.ndarray(shape=(max_rows, hidden_dim), dtype=hidden_states.dtype, buffer=nl.sbuf)
-        # DMA Load: Pull just this chunk from HBM to SBUF
-        nisa.dma_copy(dst=in_tile, src=hidden_states.reshape((num_rows, hidden_dim))[offset : offset + max_rows, :])
-        # Step 1: Compute x^2
-        squared = nisa.tensor_tensor(in_tile, in_tile, op=nl.multiply)
-        # Step 2: Sum across hidden_dim (axis 1). Results in shape (max_rows, 1)
-        square_sum = nisa.tensor_reduce(data=squared, op=nl.add, axis=1)
-        # Step 3 & 4: Mean and Add epsilon
-        mean = nisa.tensor_scalar(square_sum, nl.multiply, 1.0 / hidden_dim)
-        mean_eps = nisa.tensor_scalar(mean, nl.add, eps)
-        # Step 5: rsqrt(mean + eps)
-        sqrt_mean = nisa.activation(data=mean_eps, op=nl.sqrt)
-        rms_reciprocal = nisa.reciprocal(data=sqrt_mean)
-        # Step 6: Normalize.
-        # The hardware automatically broadcasts the (max_rows, 1) reciprocal across the (max_rows, hidden_dim) input tile.
-        normalized = nisa.tensor_tensor(in_tile, rms_reciprocal, op=nl.multiply)
-        # Step 7: Apply weight.
-        # The hardware automatically broadcasts the (1, hidden_dim) weight across the (max_rows, hidden_dim) normalized tile.
-        nisa.tensor_tensor(dst=out_tile, data0=normalized, data1=weight_sbuf, op=nl.multiply)
-        # DMA Store: Push the result back to HBM.
-        # BUG FIXED: Using `offset` ensures we write to the correct block in the output tensor!
-        nisa.dma_copy(dst=output_flat[offset : offset + max_rows, :], src=out_tile)
-    return output_flat
 from . import layers
 __all__ = [

     Returns:
         Normalized tensor of shape (B, S, H)
     """
     num_rows = 1
     for r in hidden_states.shape[:-1]:
         num_rows *= r
         # Step 6: Normalize: row * rsqrt(variance + eps)
         # Broadcast rms_reciprocal across hidden_dim using tensor_scalar
         normalized = sbuf.view(dtype=dtype, shape=(rows, hidden_dim))
         nisa.tensor_scalar(normalized, row_tile, nl.multiply, rms_reciprocal)
         # Step 7: Apply weight element-wise
         weight_tile_rows = sbuf.view(dtype=dtype, shape=(rows, hidden_dim))
     return output_flat
 from . import layers
 __all__ = [