liangsu9988 committed 27 days ago

Commit

a6a49dc

verified ·

1 Parent(s): ed5bd9e

Uploaded using `kernel-builder`.

Browse files

Files changed (31) hide show

benchmarks/benchmark.py +315 -0
build/torch210-cxx11-cu128-x86_64-linux/__init__.py +123 -0
build/torch210-cxx11-cu128-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so +3 -0
build/torch210-cxx11-cu128-x86_64-linux/_ops.py +9 -0
build/torch210-cxx11-cu128-x86_64-linux/flashrt_residual_norm_quant/__init__.py +26 -0
build/torch210-cxx11-cu128-x86_64-linux/metadata.json +23 -0
build/torch210-cxx11-cu130-x86_64-linux/__init__.py +123 -0
build/torch210-cxx11-cu130-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so +3 -0
build/torch210-cxx11-cu130-x86_64-linux/_ops.py +9 -0
build/torch210-cxx11-cu130-x86_64-linux/flashrt_residual_norm_quant/__init__.py +26 -0
build/torch210-cxx11-cu130-x86_64-linux/metadata.json +21 -0
build/torch211-cxx11-cu128-x86_64-linux/__init__.py +123 -0
build/torch211-cxx11-cu128-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so +3 -0
build/torch211-cxx11-cu128-x86_64-linux/_ops.py +9 -0
build/torch211-cxx11-cu128-x86_64-linux/flashrt_residual_norm_quant/__init__.py +26 -0
build/torch211-cxx11-cu128-x86_64-linux/metadata.json +23 -0
build/torch211-cxx11-cu130-x86_64-linux/__init__.py +123 -0
build/torch211-cxx11-cu130-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so +3 -0
build/torch211-cxx11-cu130-x86_64-linux/_ops.py +9 -0
build/torch211-cxx11-cu130-x86_64-linux/flashrt_residual_norm_quant/__init__.py +26 -0
build/torch211-cxx11-cu130-x86_64-linux/metadata.json +21 -0
build/torch212-cxx11-cu130-x86_64-linux/__init__.py +123 -0
build/torch212-cxx11-cu130-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so +3 -0
build/torch212-cxx11-cu130-x86_64-linux/_ops.py +9 -0
build/torch212-cxx11-cu130-x86_64-linux/flashrt_residual_norm_quant/__init__.py +26 -0
build/torch212-cxx11-cu130-x86_64-linux/metadata.json +21 -0
build/torch212-cxx11-cu132-x86_64-linux/__init__.py +123 -0
build/torch212-cxx11-cu132-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so +3 -0
build/torch212-cxx11-cu132-x86_64-linux/_ops.py +9 -0
build/torch212-cxx11-cu132-x86_64-linux/flashrt_residual_norm_quant/__init__.py +26 -0
build/torch212-cxx11-cu132-x86_64-linux/metadata.json +21 -0

benchmarks/benchmark.py ADDED Viewed

	@@ -0,0 +1,315 @@

+#!/usr/bin/env python3
+"""Benchmark flashrt-residual-norm-quant against PyTorch eager references."""
+from __future__ import annotations
+import argparse
+import ctypes
+import ctypes.util
+import importlib
+import json
+import math
+import os
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+import torch
+# Directory two levels above the folder that contains this script.
+ROOT = Path(__file__).resolve().parents[2]
+# Kernel package checkout whose sources load_source_ops() compiles.
+PACKAGE = ROOT / "flashrt-residual-norm-quant"
+# Extra include directory handed to the JIT build; load_source_ops() raises
+# RuntimeError when this directory does not exist.
+REGISTRATION_INCLUDE = (
+    ROOT.parent
+    / "kernels"
+    / "kernel-builder"
+    / "src"
+    / "pyproject"
+    / "templates"
+    / "torch"
+)
+# Benchmark case name -> (rows, dim) of the 2-D tensors built by make_case().
+SHAPES = {
+    "pi05_decoder": (10, 1024),
+    "pi05_vision": (512, 1152),
+    "groot_vl": (1024, 2048),
+    "video_prefill": (2520, 2048),
+}
+# Value of the ``--shapes`` CLI option -> ordered list of SHAPES keys to run.
+SHAPE_GROUPS = {
+    "smoke": ["pi05_decoder"],
+    "headline": ["pi05_decoder", "pi05_vision", "groot_vl"],
+    "all": list(SHAPES.keys()),
+}
+@dataclass
+class Result:
+    """One benchmark row: timings and accuracy metrics for one kernel on one shape."""
+    # Key of the benchmarked case in SHAPES.
+    shape: str
+    # Tensor dimensions used for the case.
+    rows: int
+    dim: int
+    # Name of the benchmarked op.
+    kernel: str
+    # Mean time per call in microseconds, as returned by time_us().
+    flashrt_us: float
+    torch_eager_us: float
+    # torch_eager_us divided by flashrt_us.
+    speedup_vs_eager: float
+    # Statistics of |got - expected| after converting both to float32 (see metrics()).
+    max_abs: float
+    mean_abs: float
+    p99_abs: float
+    # Cosine similarity between the flattened kernel output and the reference.
+    cosine: float
+    # "PASS" when p99_abs is within the configured limit, otherwise "FAIL".
+    status: str
+class SourceOps:
+    def __init__(self, namespace: str) -> None:
+        self._ops = getattr(torch.ops, namespace)
+    def rms_norm_quant_fp8_static_bf16(self, x, weight, scale, eps=1e-6, out=None):
+        if out is None:
+            out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+        self._ops.rms_norm_quant_fp8_static_bf16(x, weight, scale, float(eps), out)
+        return out
+    def residual_add_rms_norm_quant_fp8_static_bf16(
+        self, residual, x, weight, scale, eps=1e-6, out=None
+    ):
+        if out is None:
+            out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+        self._ops.residual_add_rms_norm_quant_fp8_static_bf16(
+            residual, x, weight, scale, float(eps), out
+        )
+        return out
+def _preload_cublaslt() -> None:
+    for parent in Path(torch.__file__).resolve().parents:
+        candidate = parent / "nvidia" / "cublas" / "lib" / "libcublasLt.so.12"
+        if candidate.exists():
+            ctypes.CDLL(str(candidate), mode=ctypes.RTLD_GLOBAL)
+            return
+    library = ctypes.util.find_library("cublasLt")
+    if library:
+        ctypes.CDLL(library, mode=ctypes.RTLD_GLOBAL)
+def _current_arch_list() -> str:
+    major, minor = torch.cuda.get_device_capability(0)
+    return f"{major}.{minor}"
+def load_source_ops() -> SourceOps:
+    from torch.utils.cpp_extension import load
+    if not REGISTRATION_INCLUDE.is_dir():
+        raise RuntimeError(f"missing kernel-builder registration include: {REGISTRATION_INCLUDE}")
+    _preload_cublaslt()
+    os.environ.setdefault("TORCH_CUDA_ARCH_LIST", _current_arch_list())
+    namespace = "flashrt_residual_norm_quant_benchmark"
+    load(
+        name=namespace,
+        sources=[
+            str(PACKAGE / "torch-ext" / "torch_binding.cpp"),
+            str(PACKAGE / "csrc" / "residual_norm_quant.cu"),
+        ],
+        extra_include_paths=[str(PACKAGE / "csrc"), str(REGISTRATION_INCLUDE)],
+        extra_cflags=["-O3", "-DCUDA_KERNEL"],
+        extra_cuda_cflags=["-O3", "--expt-relaxed-constexpr", "-DCUDA_KERNEL"],
+        verbose=False,
+    )
+    return SourceOps(namespace)
+def load_installed_ops(artifact: str | None):
+    if artifact:
+        sys.path.insert(0, artifact)
+    try:
+        return importlib.import_module("flashrt_residual_norm_quant")
+    finally:
+        if artifact:
+            sys.path.remove(artifact)
+def quantize_fp8(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+    return torch.clamp(x.float() / scale.float(), -448.0, 448.0).to(torch.float8_e4m3fn)
+def torch_rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
+    rms = torch.rsqrt(torch.mean(x.float() * x.float(), dim=1, keepdim=True) + eps)
+    return x.float() * rms * weight.float()
+def torch_rms_norm_quant(x, weight, scale, eps) -> torch.Tensor:
+    return quantize_fp8(torch_rms_norm(x, weight, eps), scale)
+def torch_residual_add_rms_norm_quant(residual, x, weight, scale, eps) -> torch.Tensor:
+    added = residual.float() + x.float()
+    residual.copy_(added.to(torch.bfloat16))
+    rms = torch.rsqrt(torch.mean(added * added, dim=1, keepdim=True) + eps)
+    return quantize_fp8(residual.float() * rms * weight.float(), scale)
+def make_case(rows: int, dim: int):
+    x = torch.randn((rows, dim), device="cuda", dtype=torch.bfloat16)
+    residual = torch.randn((rows, dim), device="cuda", dtype=torch.bfloat16)
+    weight = (1.0 + 0.1 * torch.randn((dim,), device="cuda", dtype=torch.bfloat16)).contiguous()
+    scale = torch.tensor([0.04], device="cuda", dtype=torch.float32)
+    out = torch.empty((rows, dim), device="cuda", dtype=torch.float8_e4m3fn)
+    return x, residual, weight, scale, out
+def time_us(fn, warmup: int, iters: int) -> float:
+    for _ in range(warmup):
+        fn()
+    torch.cuda.synchronize()
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    for _ in range(iters):
+        fn()
+    end.record()
+    torch.cuda.synchronize()
+    return start.elapsed_time(end) * 1000.0 / iters
+def percentile(x: torch.Tensor, q: float) -> torch.Tensor:
+    flat = x.flatten()
+    k = max(1, min(flat.numel(), math.ceil(q * flat.numel())))
+    return flat.kthvalue(k).values
+def metrics(got: torch.Tensor, expected: torch.Tensor):
+    diff = (got.float() - expected.float()).abs().flatten()
+    cosine = torch.nn.functional.cosine_similarity(
+        got.float().flatten(), expected.float().flatten(), dim=0
+    )
+    return {
+        "max_abs": float(diff.max().item()),
+        "mean_abs": float(diff.mean().item()),
+        "p99_abs": float(percentile(diff, 0.99).item()),
+        "cosine": float(cosine.item()),
+    }
+def run_one(ops, name: str, rows: int, dim: int, args) -> list[Result]:
+    x, residual, weight, scale, out = make_case(rows, dim)
+    eps = args.eps
+    results = []
+    got = ops.rms_norm_quant_fp8_static_bf16(x, weight, scale, eps, out)
+    expected = torch_rms_norm_quant(x, weight, scale, eps)
+    m = metrics(got, expected)
+    kernel_us = time_us(
+        lambda: ops.rms_norm_quant_fp8_static_bf16(x, weight, scale, eps, out),
+        args.warmup,
+        args.iters,
+    )
+    torch_us = time_us(lambda: torch_rms_norm_quant(x, weight, scale, eps), args.warmup, args.iters)
+    results.append(
+        Result(
+            shape=name,
+            rows=rows,
+            dim=dim,
+            kernel="rms_norm_quant_fp8_static_bf16",
+            flashrt_us=kernel_us,
+            torch_eager_us=torch_us,
+            speedup_vs_eager=torch_us / kernel_us,
+            status="PASS" if m["p99_abs"] <= args.p99_abs_limit else "FAIL",
+            **m,
+        )
+    )
+    residual0 = residual.clone()
+    residual_kernel = residual0.clone()
+    got = ops.residual_add_rms_norm_quant_fp8_static_bf16(
+        residual_kernel, x, weight, scale, eps, out
+    )
+    residual_ref = residual0.clone()
+    expected = torch_residual_add_rms_norm_quant(residual_ref, x, weight, scale, eps)
+    m = metrics(got, expected)
+    residual_kernel = residual0.clone()
+    residual_ref = residual0.clone()
+    kernel_us = time_us(
+        lambda: ops.residual_add_rms_norm_quant_fp8_static_bf16(
+            residual_kernel, x, weight, scale, eps, out
+        ),
+        args.warmup,
+        args.iters,
+    )
+    torch_us = time_us(
+        lambda: torch_residual_add_rms_norm_quant(residual_ref, x, weight, scale, eps),
+        args.warmup,
+        args.iters,
+    )
+    results.append(
+        Result(
+            shape=name,
+            rows=rows,
+            dim=dim,
+            kernel="residual_add_rms_norm_quant_fp8_static_bf16",
+            flashrt_us=kernel_us,
+            torch_eager_us=torch_us,
+            speedup_vs_eager=torch_us / kernel_us,
+            status="PASS" if m["p99_abs"] <= args.p99_abs_limit else "FAIL",
+            **m,
+        )
+    )
+    return results
+def write_markdown(path: Path, results: list[Result]) -> None:
+    lines = [
+        "| Shape | Rows,Dim | Kernel | FlashRT us | Eager us | vs eager | Max abs | Mean abs | P99 abs | Cosine | Status |",
+        "|---|---:|---|---:|---:|---:|---:|---:|---:|---:|---|",
+    ]
+    for r in results:
+        lines.append(
+            f"| {r.shape} | {r.rows},{r.dim} | {r.kernel} | {r.flashrt_us:.3f} | "
+            f"{r.torch_eager_us:.3f} | {r.speedup_vs_eager:.2f}x | "
+            f"{r.max_abs:.6f} | {r.mean_abs:.6f} | {r.p99_abs:.6f} | "
+            f"{r.cosine:.8f} | {r.status} |"
+        )
+    path.write_text("\n".join(lines) + "\n")
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--backend", choices=["source", "installed"], default="source")
+    parser.add_argument("--artifact", default=None)
+    parser.add_argument("--shapes", choices=sorted(SHAPE_GROUPS), default="smoke")
+    parser.add_argument("--warmup", type=int, default=5)
+    parser.add_argument("--iters", type=int, default=20)
+    parser.add_argument("--eps", type=float, default=1e-6)
+    parser.add_argument("--p99-abs-limit", type=float, default=0.5)
+    parser.add_argument("--output", default=None)
+    parser.add_argument("--markdown", default=None)
+    args = parser.parse_args()
+    if not torch.cuda.is_available():
+        raise SystemExit("CUDA is required")
+    torch.manual_seed(29)
+    ops = load_source_ops() if args.backend == "source" else load_installed_ops(args.artifact)
+    results = []
+    for name in SHAPE_GROUPS[args.shapes]:
+        rows, dim = SHAPES[name]
+        results.extend(run_one(ops, name, rows, dim, args))
+    for r in results:
+        print(
+            f"{r.status} {r.shape}/{r.kernel}: flashrt={r.flashrt_us:.3f}us "
+            f"eager={r.torch_eager_us:.3f}us speedup={r.speedup_vs_eager:.2f}x "
+            f"p99_abs={r.p99_abs:.6f} cosine={r.cosine:.8f}"
+        )
+    if args.output:
+        Path(args.output).parent.mkdir(parents=True, exist_ok=True)
+        Path(args.output).write_text(json.dumps([asdict(r) for r in results], indent=2) + "\n")
+    if args.markdown:
+        Path(args.markdown).parent.mkdir(parents=True, exist_ok=True)
+        write_markdown(Path(args.markdown), results)
+    if any(r.status != "PASS" for r in results):
+        raise SystemExit(1)
+if __name__ == "__main__":
+    main()

build/torch210-cxx11-cu128-x86_64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""FlashRT residual/RMSNorm/static-FP8 quantization kernels."""
+from __future__ import annotations
+import torch
+from ._ops import add_op_namespace_prefix, ops
+def _check_rank2_same_shape(x: torch.Tensor, out: torch.Tensor, out_name: str) -> None:
+    if x.dim() != 2:
+        raise RuntimeError("x must be rank-2")
+    if out.shape != x.shape:
+        raise RuntimeError(f"{out_name} must have the same shape as x")
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_bf16"))
+def _rms_norm_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_quant_fp8_static_bf16"))
+def _rms_norm_quant_fp8_static_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+@torch.library.register_fake(
+    add_op_namespace_prefix("residual_add_rms_norm_quant_fp8_static_bf16")
+)
+def _residual_add_rms_norm_quant_fp8_static_bf16_fake(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    if residual.shape != x.shape:
+        raise RuntimeError("residual and x must have the same shape")
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+def rms_norm_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm with affine weight."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.bfloat16)
+    ops.rms_norm_bf16(x, weight, float(eps), out)
+    return out
+def rms_norm_quant_fp8_static_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm followed by static-scale FP8 E4M3 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.rms_norm_quant_fp8_static_bf16(x, weight, scale, float(eps), out)
+    return out
+def residual_add_rms_norm_quant_fp8_static_bf16(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """In-place ``residual += x`` then RMSNorm and static FP8 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.residual_add_rms_norm_quant_fp8_static_bf16(
+        residual,
+        x,
+        weight,
+        scale,
+        float(eps),
+        out,
+    )
+    return out
+__all__ = [
+    "residual_add_rms_norm_quant_fp8_static_bf16",
+    "rms_norm_bf16",
+    "rms_norm_quant_fp8_static_bf16",
+]

build/torch210-cxx11-cu128-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:244000c5b33e1f609987b8b9aef434d0d6bee50bdf5287442ac889b2ac0c0df4
+size 2471360

build/torch210-cxx11-cu128-x86_64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flashrt_residual_norm_quant_cuda_cf903dd
+ops = torch.ops._flashrt_residual_norm_quant_cuda_cf903dd
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flashrt_residual_norm_quant_cuda_cf903dd::{op_name}"

build/torch210-cxx11-cu128-x86_64-linux/flashrt_residual_norm_quant/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch210-cxx11-cu128-x86_64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "name": "flashrt-residual-norm-quant",
+  "id": "_flashrt_residual_norm_quant_cuda_cf903dd",
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}

build/torch210-cxx11-cu130-x86_64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""FlashRT residual/RMSNorm/static-FP8 quantization kernels."""
+from __future__ import annotations
+import torch
+from ._ops import add_op_namespace_prefix, ops
+def _check_rank2_same_shape(x: torch.Tensor, out: torch.Tensor, out_name: str) -> None:
+    if x.dim() != 2:
+        raise RuntimeError("x must be rank-2")
+    if out.shape != x.shape:
+        raise RuntimeError(f"{out_name} must have the same shape as x")
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_bf16"))
+def _rms_norm_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_quant_fp8_static_bf16"))
+def _rms_norm_quant_fp8_static_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+@torch.library.register_fake(
+    add_op_namespace_prefix("residual_add_rms_norm_quant_fp8_static_bf16")
+)
+def _residual_add_rms_norm_quant_fp8_static_bf16_fake(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    if residual.shape != x.shape:
+        raise RuntimeError("residual and x must have the same shape")
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+def rms_norm_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm with affine weight."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.bfloat16)
+    ops.rms_norm_bf16(x, weight, float(eps), out)
+    return out
+def rms_norm_quant_fp8_static_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm followed by static-scale FP8 E4M3 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.rms_norm_quant_fp8_static_bf16(x, weight, scale, float(eps), out)
+    return out
+def residual_add_rms_norm_quant_fp8_static_bf16(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """In-place ``residual += x`` then RMSNorm and static FP8 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.residual_add_rms_norm_quant_fp8_static_bf16(
+        residual,
+        x,
+        weight,
+        scale,
+        float(eps),
+        out,
+    )
+    return out
+__all__ = [
+    "residual_add_rms_norm_quant_fp8_static_bf16",
+    "rms_norm_bf16",
+    "rms_norm_quant_fp8_static_bf16",
+]

build/torch210-cxx11-cu130-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ac048c6ebb52c526e68aa3e2325e0bc28fcd0bf11ceabc084c26b7c1dcb7710
+size 2414152

build/torch210-cxx11-cu130-x86_64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flashrt_residual_norm_quant_cuda_cf903dd
+ops = torch.ops._flashrt_residual_norm_quant_cuda_cf903dd
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flashrt_residual_norm_quant_cuda_cf903dd::{op_name}"

build/torch210-cxx11-cu130-x86_64-linux/flashrt_residual_norm_quant/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch210-cxx11-cu130-x86_64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "name": "flashrt-residual-norm-quant",
+  "id": "_flashrt_residual_norm_quant_cuda_cf903dd",
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}

build/torch211-cxx11-cu128-x86_64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""FlashRT residual/RMSNorm/static-FP8 quantization kernels."""
+from __future__ import annotations
+import torch
+from ._ops import add_op_namespace_prefix, ops
+def _check_rank2_same_shape(x: torch.Tensor, out: torch.Tensor, out_name: str) -> None:
+    if x.dim() != 2:
+        raise RuntimeError("x must be rank-2")
+    if out.shape != x.shape:
+        raise RuntimeError(f"{out_name} must have the same shape as x")
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_bf16"))
+def _rms_norm_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_quant_fp8_static_bf16"))
+def _rms_norm_quant_fp8_static_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+@torch.library.register_fake(
+    add_op_namespace_prefix("residual_add_rms_norm_quant_fp8_static_bf16")
+)
+def _residual_add_rms_norm_quant_fp8_static_bf16_fake(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    if residual.shape != x.shape:
+        raise RuntimeError("residual and x must have the same shape")
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+def rms_norm_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm with affine weight."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.bfloat16)
+    ops.rms_norm_bf16(x, weight, float(eps), out)
+    return out
+def rms_norm_quant_fp8_static_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm followed by static-scale FP8 E4M3 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.rms_norm_quant_fp8_static_bf16(x, weight, scale, float(eps), out)
+    return out
+def residual_add_rms_norm_quant_fp8_static_bf16(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """In-place ``residual += x`` then RMSNorm and static FP8 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.residual_add_rms_norm_quant_fp8_static_bf16(
+        residual,
+        x,
+        weight,
+        scale,
+        float(eps),
+        out,
+    )
+    return out
+__all__ = [
+    "residual_add_rms_norm_quant_fp8_static_bf16",
+    "rms_norm_bf16",
+    "rms_norm_quant_fp8_static_bf16",
+]

build/torch211-cxx11-cu128-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:562ffcdba2e68ce168e6b0be94d3ace7b54ee7f5f3cb701850bcf90b93f2f106
+size 2464400

build/torch211-cxx11-cu128-x86_64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flashrt_residual_norm_quant_cuda_cf903dd
+ops = torch.ops._flashrt_residual_norm_quant_cuda_cf903dd
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flashrt_residual_norm_quant_cuda_cf903dd::{op_name}"

build/torch211-cxx11-cu128-x86_64-linux/flashrt_residual_norm_quant/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch211-cxx11-cu128-x86_64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "name": "flashrt-residual-norm-quant",
+  "id": "_flashrt_residual_norm_quant_cuda_cf903dd",
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}

build/torch211-cxx11-cu130-x86_64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""FlashRT residual/RMSNorm/static-FP8 quantization kernels."""
+from __future__ import annotations
+import torch
+from ._ops import add_op_namespace_prefix, ops
+def _check_rank2_same_shape(x: torch.Tensor, out: torch.Tensor, out_name: str) -> None:
+    if x.dim() != 2:
+        raise RuntimeError("x must be rank-2")
+    if out.shape != x.shape:
+        raise RuntimeError(f"{out_name} must have the same shape as x")
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_bf16"))
+def _rms_norm_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_quant_fp8_static_bf16"))
+def _rms_norm_quant_fp8_static_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+@torch.library.register_fake(
+    add_op_namespace_prefix("residual_add_rms_norm_quant_fp8_static_bf16")
+)
+def _residual_add_rms_norm_quant_fp8_static_bf16_fake(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    if residual.shape != x.shape:
+        raise RuntimeError("residual and x must have the same shape")
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+def rms_norm_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm with affine weight."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.bfloat16)
+    ops.rms_norm_bf16(x, weight, float(eps), out)
+    return out
+def rms_norm_quant_fp8_static_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm followed by static-scale FP8 E4M3 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.rms_norm_quant_fp8_static_bf16(x, weight, scale, float(eps), out)
+    return out
+def residual_add_rms_norm_quant_fp8_static_bf16(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """In-place ``residual += x`` then RMSNorm and static FP8 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.residual_add_rms_norm_quant_fp8_static_bf16(
+        residual,
+        x,
+        weight,
+        scale,
+        float(eps),
+        out,
+    )
+    return out
+# Public API: only the three user-facing wrappers are exported; the fake
+# registrations and the shape-check helper stay private.
+__all__ = [
+    "residual_add_rms_norm_quant_fp8_static_bf16",
+    "rms_norm_bf16",
+    "rms_norm_quant_fp8_static_bf16",
+]

build/torch211-cxx11-cu130-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b197c38ab8ee5e98974c81fe7a883057c027bb0238c737e2494ceb627872af6f
+size 2398992

build/torch211-cxx11-cu130-x86_64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flashrt_residual_norm_quant_cuda_cf903dd
+ops = torch.ops._flashrt_residual_norm_quant_cuda_cf903dd
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flashrt_residual_norm_quant_cuda_cf903dd::{op_name}"

build/torch211-cxx11-cu130-x86_64-linux/flashrt_residual_norm_quant/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch211-cxx11-cu130-x86_64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "name": "flashrt-residual-norm-quant",
+  "id": "_flashrt_residual_norm_quant_cuda_cf903dd",
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}

build/torch212-cxx11-cu130-x86_64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""FlashRT residual/RMSNorm/static-FP8 quantization kernels."""
+from __future__ import annotations
+import torch
+from ._ops import add_op_namespace_prefix, ops
+def _check_rank2_same_shape(x: torch.Tensor, out: torch.Tensor, out_name: str) -> None:
+    if x.dim() != 2:
+        raise RuntimeError("x must be rank-2")
+    if out.shape != x.shape:
+        raise RuntimeError(f"{out_name} must have the same shape as x")
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_bf16"))
+def _rms_norm_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_quant_fp8_static_bf16"))
+def _rms_norm_quant_fp8_static_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+@torch.library.register_fake(
+    add_op_namespace_prefix("residual_add_rms_norm_quant_fp8_static_bf16")
+)
+def _residual_add_rms_norm_quant_fp8_static_bf16_fake(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    if residual.shape != x.shape:
+        raise RuntimeError("residual and x must have the same shape")
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+def rms_norm_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm with affine weight."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.bfloat16)
+    ops.rms_norm_bf16(x, weight, float(eps), out)
+    return out
+def rms_norm_quant_fp8_static_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm followed by static-scale FP8 E4M3 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.rms_norm_quant_fp8_static_bf16(x, weight, scale, float(eps), out)
+    return out
+def residual_add_rms_norm_quant_fp8_static_bf16(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """In-place ``residual += x`` then RMSNorm and static FP8 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.residual_add_rms_norm_quant_fp8_static_bf16(
+        residual,
+        x,
+        weight,
+        scale,
+        float(eps),
+        out,
+    )
+    return out
+# Public API: only the three user-facing wrappers are exported; the fake
+# registrations and the shape-check helper stay private.
+__all__ = [
+    "residual_add_rms_norm_quant_fp8_static_bf16",
+    "rms_norm_bf16",
+    "rms_norm_quant_fp8_static_bf16",
+]

build/torch212-cxx11-cu130-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c386ef3e62e6314e355fb016afc4bb3536653e99749178e29900ff04278bc5cc
+size 2400424

build/torch212-cxx11-cu130-x86_64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flashrt_residual_norm_quant_cuda_cf903dd
+ops = torch.ops._flashrt_residual_norm_quant_cuda_cf903dd
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flashrt_residual_norm_quant_cuda_cf903dd::{op_name}"

build/torch212-cxx11-cu130-x86_64-linux/flashrt_residual_norm_quant/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch212-cxx11-cu130-x86_64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "name": "flashrt-residual-norm-quant",
+  "id": "_flashrt_residual_norm_quant_cuda_cf903dd",
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}

build/torch212-cxx11-cu132-x86_64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""FlashRT residual/RMSNorm/static-FP8 quantization kernels."""
+from __future__ import annotations
+import torch
+from ._ops import add_op_namespace_prefix, ops
+def _check_rank2_same_shape(x: torch.Tensor, out: torch.Tensor, out_name: str) -> None:
+    if x.dim() != 2:
+        raise RuntimeError("x must be rank-2")
+    if out.shape != x.shape:
+        raise RuntimeError(f"{out_name} must have the same shape as x")
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_bf16"))
+def _rms_norm_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("rms_norm_quant_fp8_static_bf16"))
+def _rms_norm_quant_fp8_static_bf16_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+@torch.library.register_fake(
+    add_op_namespace_prefix("residual_add_rms_norm_quant_fp8_static_bf16")
+)
+def _residual_add_rms_norm_quant_fp8_static_bf16_fake(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float,
+    out: torch.Tensor,
+) -> None:
+    if residual.shape != x.shape:
+        raise RuntimeError("residual and x must have the same shape")
+    _check_rank2_same_shape(x, out, "out")
+    if weight.shape != (x.shape[1],):
+        raise RuntimeError("weight must have shape (x.shape[1],)")
+    if scale.numel() != 1:
+        raise RuntimeError("scale must contain exactly one value")
+    return None
+def rms_norm_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm with affine weight."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.bfloat16)
+    ops.rms_norm_bf16(x, weight, float(eps), out)
+    return out
+def rms_norm_quant_fp8_static_bf16(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """BF16 RMSNorm followed by static-scale FP8 E4M3 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.rms_norm_quant_fp8_static_bf16(x, weight, scale, float(eps), out)
+    return out
+def residual_add_rms_norm_quant_fp8_static_bf16(
+    residual: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    eps: float = 1e-6,
+    out: torch.Tensor | None = None,
+) -> torch.Tensor:
+    """In-place ``residual += x`` then RMSNorm and static FP8 quantization."""
+    if out is None:
+        out = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    ops.residual_add_rms_norm_quant_fp8_static_bf16(
+        residual,
+        x,
+        weight,
+        scale,
+        float(eps),
+        out,
+    )
+    return out
+# Public API: only the three user-facing wrappers are exported; the fake
+# registrations and the shape-check helper stay private.
+__all__ = [
+    "residual_add_rms_norm_quant_fp8_static_bf16",
+    "rms_norm_bf16",
+    "rms_norm_quant_fp8_static_bf16",
+]

build/torch212-cxx11-cu132-x86_64-linux/_flashrt_residual_norm_quant_cuda_cf903dd.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ee5cc9d8958c8f3c04f62671615ec5e5f1969eeaaf5687ea31ccfcca98609d3
+size 2400392

build/torch212-cxx11-cu132-x86_64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _flashrt_residual_norm_quant_cuda_cf903dd
+ops = torch.ops._flashrt_residual_norm_quant_cuda_cf903dd
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_flashrt_residual_norm_quant_cuda_cf903dd::{op_name}"

build/torch212-cxx11-cu132-x86_64-linux/flashrt_residual_norm_quant/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch212-cxx11-cu132-x86_64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "name": "flashrt-residual-norm-quant",
+  "id": "_flashrt_residual_norm_quant_cuda_cf903dd",
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}