Kernels
wyldecat Claude Opus 4.6 (1M context) committed on
Commit
a5e85e1
·
1 Parent(s): 4bb42a5

feat: add RMSNorm benchmark scripts and K8s job

Browse files

- run_rms_bench.py: custom RMS benchmark for dims 512/1024/4096/16384
- run_bench.sh / run_and_wait.sh: K8s job apply + log streaming helpers
- benchmark_rms_optim.yaml: TrainJob manifest for B200 benchmarks
- bench_framework.py: minor fixes for benchmark runner

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

benchmarks/benchmark_rms_optim.yaml ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: trainer.kubeflow.org/v1alpha1
2
+ kind: TrainJob
3
+ metadata:
4
+ name: jeesoo-rms-optim-v10
5
+ namespace: kbm-g-np-motif
6
+ spec:
7
+ managedBy: trainer.kubeflow.org/trainjob-controller
8
+ podTemplateOverrides:
9
+ - spec:
10
+ containers:
11
+ - name: node
12
+ volumeMounts:
13
+ - mountPath: /dev/shm
14
+ name: shm
15
+ - mountPath: /mair
16
+ name: mair
17
+ volumes:
18
+ - emptyDir:
19
+ medium: Memory
20
+ sizeLimit: 64Gi
21
+ name: shm
22
+ - name: mair
23
+ persistentVolumeClaim:
24
+ claimName: mair
25
+ targetJobs:
26
+ - name: node
27
+ runtimeRef:
28
+ apiGroup: trainer.kubeflow.org
29
+ kind: ClusterTrainingRuntime
30
+ name: torch-distributed
31
+ suspend: false
32
+ trainer:
33
+ args:
34
+ - /bin/bash
35
+ - '-c'
36
+ - >
37
+ ACTIVATIONPATH=/mair/team-sys/jeesoo/activation
38
+
39
+ pip install triton matplotlib pandas
40
+
41
+ echo "=== Building ==="
42
+
43
+ cd $ACTIVATIONPATH
44
+
45
+ pip uninstall -y activation 2>/dev/null; true
46
+ rm -rf $ACTIVATIONPATH/build/temp.linux-x86_64-cpython-312 $ACTIVATIONPATH/_activation*.so $ACTIVATIONPATH/*.egg-info
47
+
48
+ pip install --no-build-isolation --no-cache-dir -e . -v 2>&1 | tail -100
49
+
50
+ python -c "import _activation; print('Build OK:', _activation)" || { echo "BUILD FAILED"; exit 0; }
51
+
52
+ echo "=== Running RMS tests ==="
53
+
54
+ cd $ACTIVATIONPATH
55
+
56
+ python -m pytest tests/test_rms_norm.py -v 2>&1 | tail -40
57
+
58
+ echo "=== Warmup ==="
59
+
60
+ python -c "import torch; x=torch.randn(8192,1280,device='cuda',dtype=torch.bfloat16); [torch.mm(x.T,x) for _ in range(100)]; torch.cuda.synchronize(); print('warmup done')"
61
+
62
+ echo "=== RMS Benchmark ==="
63
+
64
+ cd $ACTIVATIONPATH/benchmarks
65
+
66
+ python run_rms_bench.py 2>&1 | tee results/rms_optim_log.txt
67
+
68
+ echo "=== Done ==="
69
+
70
+ exit 0;
71
+ env:
72
+ - name: PYTHONUNBUFFERED
73
+ value: '1'
74
+ - name: PYTORCH_ALLOC_CONF
75
+ value: expandable_segments:True
76
+ - name: CUDA_LAUNCH_BLOCKING
77
+ value: '0'
78
+ - name: OMP_NUM_THREADS
79
+ value: '1'
80
+ - name: HF_HOME
81
+ value: /mair/llm-dataset/hf_cache
82
+ image: ghcr.io/motiftechnologies/llm-training:v0.1.3
83
+ numNodes: 1
84
+ numProcPerNode: 1
85
+ resourcesPerNode:
86
+ limits:
87
+ cpu: '16'
88
+ memory: 128Gi
89
+ nvidia.com/gpu: '1'
90
+ requests:
91
+ cpu: '16'
92
+ memory: 128Gi
93
+ nvidia.com/gpu: '1'
benchmarks/common/bench_framework.py CHANGED
@@ -4,8 +4,8 @@ import re
4
  from typing import Any, Dict, Sequence
5
 
6
  import torch
7
- import triton
8
  from torch.profiler import ProfilerActivity, profile
 
9
 
10
  from .diff_engine import DiffCase
11
 
@@ -42,8 +42,8 @@ def _compute_bytes(inputs, forward_fn, obj):
42
  if isinstance(output, torch.Tensor):
43
  output_bytes = output.nbytes
44
  elif isinstance(output, (tuple, list)):
45
- output_bytes = sum(o.nbytes for o in output
46
- if isinstance(o, torch.Tensor))
47
  else:
48
  output_bytes = 0
49
  return input_bytes + output_bytes
@@ -158,9 +158,7 @@ def make_fwd_benchmark_for_case(
158
  key = make_fwd_key(dim, batch_size, seq_len)
159
  I = case.build_inputs(batch_size, seq_len, dim, dtype, eps)
160
  if provider == "speedup":
161
- return round(
162
- timings_ms["naive"][key] /
163
- _get_best_cuda_timing(timings_ms, key), 2)
164
  if provider.endswith("_bw"):
165
  base = provider[:-3]
166
  ms = timings_ms[base][key]
@@ -229,8 +227,7 @@ def make_fwd_benchmark_plot_for_case(
229
  ms = profile_bench(run, total_bytes=nbytes)
230
  timings_ms[provider][config] = ms
231
  if provider == "cuda":
232
- ratio = timings_ms["naive"][config] / _get_best_cuda_timing(
233
- timings_ms, config)
234
  spdup_ratio.append(ratio)
235
  return round(ratio, 2)
236
  else:
@@ -270,9 +267,7 @@ def make_bwd_benchmark_for_case(
270
  key = make_bwd_key(dim, batch_size, seq_len)
271
  I = case.build_inputs(batch_size, seq_len, dim, dtype, eps)
272
  if provider == "speedup":
273
- return round(
274
- timings_ms["naive"][key] /
275
- _get_best_cuda_timing(timings_ms, key), 2)
276
  if provider.endswith("_bw"):
277
  base = provider[:-3]
278
  ms = timings_ms[base][key]
@@ -365,8 +360,7 @@ def make_bwd_benchmark_plot_for_case(
365
  ms = profile_bench(run, total_bytes=nbytes)
366
  timings_ms[provider][config] = ms
367
  if provider == "cuda":
368
- ratio = timings_ms["naive"][config] / _get_best_cuda_timing(
369
- timings_ms, config)
370
  spdup_ratio.append(ratio)
371
  return round(ratio, 2)
372
  else:
 
4
  from typing import Any, Dict, Sequence
5
 
6
  import torch
 
7
  from torch.profiler import ProfilerActivity, profile
8
+ import triton
9
 
10
  from .diff_engine import DiffCase
11
 
 
42
  if isinstance(output, torch.Tensor):
43
  output_bytes = output.nbytes
44
  elif isinstance(output, (tuple, list)):
45
+ output_bytes = sum(
46
+ o.nbytes for o in output if isinstance(o, torch.Tensor))
47
  else:
48
  output_bytes = 0
49
  return input_bytes + output_bytes
 
158
  key = make_fwd_key(dim, batch_size, seq_len)
159
  I = case.build_inputs(batch_size, seq_len, dim, dtype, eps)
160
  if provider == "speedup":
161
+ return round(timings_ms["naive"][key] / _get_best_cuda_timing(timings_ms, key), 2)
 
 
162
  if provider.endswith("_bw"):
163
  base = provider[:-3]
164
  ms = timings_ms[base][key]
 
227
  ms = profile_bench(run, total_bytes=nbytes)
228
  timings_ms[provider][config] = ms
229
  if provider == "cuda":
230
+ ratio = timings_ms["naive"][config] / _get_best_cuda_timing(timings_ms, config)
 
231
  spdup_ratio.append(ratio)
232
  return round(ratio, 2)
233
  else:
 
267
  key = make_bwd_key(dim, batch_size, seq_len)
268
  I = case.build_inputs(batch_size, seq_len, dim, dtype, eps)
269
  if provider == "speedup":
270
+ return round(timings_ms["naive"][key] / _get_best_cuda_timing(timings_ms, key), 2)
 
 
271
  if provider.endswith("_bw"):
272
  base = provider[:-3]
273
  ms = timings_ms[base][key]
 
360
  ms = profile_bench(run, total_bytes=nbytes)
361
  timings_ms[provider][config] = ms
362
  if provider == "cuda":
363
+ ratio = timings_ms["naive"][config] / _get_best_cuda_timing(timings_ms, config)
 
364
  spdup_ratio.append(ratio)
365
  return round(ratio, 2)
366
  else:
benchmarks/run_and_wait.sh ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Usage: ./run_and_wait.sh <yaml-file> [poll-interval-seconds]
3
+ # Deletes existing job, applies yaml, waits for build+run, prints results.
4
+
5
+ set -euo pipefail
6
+
7
+ YAML="${1:?Usage: $0 <yaml-file> [poll-interval]}"
8
+ POLL="${2:-10}"
9
+
10
+ JOB_NAME=$(grep -m1 '^\s*name:' "$YAML" | awk '{print $2}')
11
+ NAMESPACE=$(grep -m1 '^\s*namespace:' "$YAML" | awk '{print $2}')
12
+ LABEL="batch.kubernetes.io/job-name=${JOB_NAME}-node-0"
13
+
14
+ echo "=== $JOB_NAME | $NAMESPACE ==="
15
+
16
+ # Remember old pods to ignore them
17
+ OLD_PODS=$(kubectl get pods -n "$NAMESPACE" -l "$LABEL" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || true)
18
+
19
+ # Delete if exists
20
+ kubectl delete -f "$YAML" 2>/dev/null && {
21
+ echo "Deleted old job. Waiting for cleanup..."
22
+ while kubectl get trainjob -n "$NAMESPACE" "$JOB_NAME" &>/dev/null; do sleep 2; done
23
+ sleep 3
24
+ }
25
+
26
+ # Apply
27
+ kubectl apply -f "$YAML"
28
+
29
+ # Wait for NEW pod (not in OLD_PODS)
30
+ echo -n "Waiting for new pod"
31
+ POD=""
32
+ while true; do
33
+ ALL_PODS=$(kubectl get pods -n "$NAMESPACE" -l "$LABEL" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || true)
34
+ for p in $ALL_PODS; do
35
+ if [[ ! " $OLD_PODS " =~ " $p " ]]; then
36
+ PHASE=$(kubectl get pod -n "$NAMESPACE" "$p" -o jsonpath='{.status.phase}' 2>/dev/null || true)
37
+ if [[ -n "$PHASE" ]]; then
38
+ POD="$p"
39
+ echo " $POD ($PHASE)"
40
+ break 2
41
+ fi
42
+ fi
43
+ done
44
+ echo -n "."
45
+ sleep "$POLL"
46
+ done
47
+
48
+ # Wait for pod to complete
49
+ echo -n "Running"
50
+ while true; do
51
+ PHASE=$(kubectl get pod -n "$NAMESPACE" "$POD" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Gone")
52
+ if [[ "$PHASE" != "Running" && "$PHASE" != "Pending" ]]; then
53
+ echo " ($PHASE)"
54
+ break
55
+ fi
56
+ echo -n "."
57
+ sleep "$POLL"
58
+ done
59
+
60
+ # Print logs
61
+ echo ""
62
+ echo "=== LOGS ==="
63
+ kubectl logs -n "$NAMESPACE" "$POD" 2>/dev/null || echo "(no logs available)"
64
+
65
+ echo ""
66
+ echo "=== STATUS: $PHASE ==="
benchmarks/run_bench.sh ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Usage: ./run_bench.sh <yaml-file> [poll-interval-seconds]
# Example: ./run_bench.sh benchmark_rms_optim.yaml 10
#
# Deletes any existing TrainJob of the same name, applies the manifest,
# waits for its pod to start, streams the logs, then prints the job's
# final status condition.

set -euo pipefail

YAML="${1:?Usage: $0 <yaml-file> [poll-interval]}"
POLL="${2:-10}"

if [[ ! -f "$YAML" ]]; then
    echo "Error: $YAML not found"
    exit 1
fi

# Extract job name and namespace from yaml.  The first `name:` /
# `namespace:` lines are assumed to be metadata.name / metadata.namespace.
JOB_NAME=$(grep -m1 '^\s*name:' "$YAML" | awk '{print $2}')
NAMESPACE=$(grep -m1 '^\s*namespace:' "$YAML" | awk '{print $2}')

echo "=== Job: $JOB_NAME | Namespace: $NAMESPACE ==="

# Delete if exists.  The failing command is on the left of `&&`, which
# errexit exempts, so a missing job does not abort the script.
kubectl delete trainjob -n "$NAMESPACE" "$JOB_NAME" 2>/dev/null && {
    echo "Deleted existing job, waiting 5s..."
    sleep 5
}

# Apply
kubectl apply -f "$YAML"
echo "Applied. Polling every ${POLL}s..."

# Wait for pod to exist and reach a reportable phase.
echo -n "Waiting for pod..."
while true; do
    POD=$(kubectl get pods -n "$NAMESPACE" -l "batch.kubernetes.io/job-name=${JOB_NAME}-node-0" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
    if [[ -n "$POD" ]]; then
        STATUS=$(kubectl get pod -n "$NAMESPACE" "$POD" -o jsonpath='{.status.phase}' 2>/dev/null || true)
        if [[ "$STATUS" == "Running" || "$STATUS" == "Succeeded" || "$STATUS" == "Failed" ]]; then
            echo " $POD ($STATUS)"
            break
        fi
    fi
    echo -n "."
    sleep "$POLL"
done

# Stream logs until completion
echo "=== Streaming logs ==="
kubectl logs -n "$NAMESPACE" "$POD" -f 2>/dev/null || true

# Final status
echo ""
echo "=== Final Status ==="
# `|| true` keeps `set -e` from aborting before the trailing echo when the
# job has no status conditions yet (or was deleted in the meantime).
kubectl get trainjob -n "$NAMESPACE" "$JOB_NAME" -o jsonpath='{.status.conditions[0].reason}' 2>/dev/null || true
echo ""
benchmarks/run_rms_bench.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quick RMS benchmark with custom configs."""
2
+ import os
3
+ import sys
4
+
5
+ import torch
6
+
7
+ from common.bench_framework import (make_bwd_benchmark_for_case,
8
+ make_fwd_benchmark_for_case)
9
+ from common.diff_engine import calculate_diff
10
+
11
+ sys.path.insert(0, os.path.dirname(__file__))
12
+ from cases.rms import CASE
13
+
14
+ torch.set_default_device("cuda")
15
+
16
+ configs = [
17
+ (512, 8, 4096),
18
+ (1024, 8, 4096),
19
+ (4096, 8, 4096),
20
+ (16384, 8, 4096),
21
+ ]
22
+
23
+ # Correctness check
24
+ for dim, bs, sl in configs:
25
+ print(f"Correctness: bs={bs}, sl={sl}, D={dim}...", end=" ")
26
+ calculate_diff(CASE, batch_size=bs, seq_len=sl, hidden_size=dim)
27
+ print("ok")
28
+
29
+ print()
30
+
31
+ line_vals = ("naive", "naive_bw", "cuda", "cuda_bw", "speedup")
32
+ line_names = {
33
+ "naive": "Naive (us)",
34
+ "naive_bw": "Naive (GB/s)",
35
+ "cuda": "CUDA (us)",
36
+ "cuda_bw": "CUDA (GB/s)",
37
+ "speedup": "SpeedUp (ratio)",
38
+ }
39
+
40
+ save_dir = "./results/rms_custom"
41
+ os.makedirs(save_dir, exist_ok=True)
42
+
43
+ bench = make_fwd_benchmark_for_case(
44
+ case=CASE,
45
+ configs=configs,
46
+ plot_name="rms-bf16-fwd",
47
+ dtype=torch.bfloat16,
48
+ line_vals=line_vals,
49
+ line_names=line_names,
50
+ )
51
+ bench.run(print_data=True, save_path=save_dir)
52
+
53
+ bench = make_bwd_benchmark_for_case(
54
+ case=CASE,
55
+ configs=configs,
56
+ plot_name="rms-bf16-bwd",
57
+ dtype=torch.bfloat16,
58
+ line_vals=line_vals,
59
+ line_names=line_names,
60
+ )
61
+ bench.run(print_data=True, save_path=save_dir)