liangsu9988 committed 6 days ago

Commit

f3d5b21

verified ·

1 Parent(s): 1bccc89

Uploaded using `kernel-builder`.

Browse files

Files changed (21) hide show

benchmarks/benchmark.py +81 -0
build/torch211-cxx11-cu128-x86_64-linux/__init__.py +186 -0
build/torch211-cxx11-cu128-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so +3 -0
build/torch211-cxx11-cu128-x86_64-linux/_ops.py +9 -0
build/torch211-cxx11-cu128-x86_64-linux/diffusion_step_ops/__init__.py +26 -0
build/torch211-cxx11-cu128-x86_64-linux/metadata.json +23 -0
build/torch211-cxx11-cu130-x86_64-linux/__init__.py +186 -0
build/torch211-cxx11-cu130-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so +3 -0
build/torch211-cxx11-cu130-x86_64-linux/_ops.py +9 -0
build/torch211-cxx11-cu130-x86_64-linux/diffusion_step_ops/__init__.py +26 -0
build/torch211-cxx11-cu130-x86_64-linux/metadata.json +22 -0
build/torch212-cxx11-cu130-x86_64-linux/__init__.py +186 -0
build/torch212-cxx11-cu130-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so +3 -0
build/torch212-cxx11-cu130-x86_64-linux/_ops.py +9 -0
build/torch212-cxx11-cu130-x86_64-linux/diffusion_step_ops/__init__.py +26 -0
build/torch212-cxx11-cu130-x86_64-linux/metadata.json +22 -0
build/torch212-cxx11-cu132-x86_64-linux/__init__.py +186 -0
build/torch212-cxx11-cu132-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so +3 -0
build/torch212-cxx11-cu132-x86_64-linux/_ops.py +9 -0
build/torch212-cxx11-cu132-x86_64-linux/diffusion_step_ops/__init__.py +26 -0
build/torch212-cxx11-cu132-x86_64-linux/metadata.json +22 -0

benchmarks/benchmark.py ADDED Viewed

	@@ -0,0 +1,81 @@

+#!/usr/bin/env python3
+"""Benchmark diffusion-step-ops against PyTorch eager references."""
+from __future__ import annotations
+import argparse
+import sys
+from pathlib import Path
+import torch
+PACKAGE = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(PACKAGE / "tests"))
+from test_diffusion_step_ops import load_installed_ops, load_source_ops  # noqa: E402
+def bench(fn, warmup: int, iters: int) -> float:
+    """Return the mean time of one ``fn()`` call in microseconds, measured with CUDA events.
+    ``fn`` is called ``warmup`` times untimed, then ``iters`` times between two
+    recorded events. Raises ``ValueError`` if ``iters`` is not positive, because
+    the mean would otherwise be a division by zero or a negative number.
+    """
+    if iters <= 0:
+        raise ValueError(f"iters must be positive, got {iters}")
+    for _ in range(warmup):
+        fn()
+    # Drain the warmup work so it is not attributed to the timed region.
+    torch.cuda.synchronize()
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    for _ in range(iters):
+        fn()
+    end.record()
+    # The end event must have completed before elapsed_time() can be read.
+    torch.cuda.synchronize()
+    # elapsed_time() reports milliseconds; convert to microseconds per call.
+    return start.elapsed_time(end) * 1000.0 / iters
+def main() -> int:
+    """Benchmark each op against an eager PyTorch expression and print a Markdown table.
+    Returns 0 on completion; raises ``RuntimeError`` when CUDA is unavailable.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--backend", choices=["source", "installed"], default="source")
+    cli.add_argument("--artifact", default=None)
+    cli.add_argument("--warmup", type=int, default=100)
+    cli.add_argument("--iters", type=int, default=1000)
+    args = cli.parse_args()
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is required")
+    torch.manual_seed(1234)
+    if args.backend == "source":
+        ops = load_source_ops()
+    else:
+        ops = load_installed_ops(args.artifact)
+    def report(label, shape, fused_fn, eager_fn) -> None:
+        # Time the fused op, then the eager expression, and print one table row.
+        fused = bench(fused_fn, args.warmup, args.iters)
+        eager = bench(eager_fn, args.warmup, args.iters)
+        print(f"| {label} | {tuple(shape)} | {fused:.3f} | {eager:.3f} | {eager / fused:.2f}x |")
+    print("| Workload | Shape | FlashRT us | PyTorch eager us | Speedup |")
+    print("|---|---:|---:|---:|---:|")
+    elementwise_shapes = [(1024,), (16384,), (2, 16, 32, 64), (1, 16, 17, 64, 64)]
+    for shape in elementwise_shapes:
+        a = torch.randn(shape, device="cuda", dtype=torch.bfloat16)
+        b = torch.randn(shape, device="cuda", dtype=torch.bfloat16)
+        report(
+            "add_bf16",
+            shape,
+            lambda: ops.add_bf16(a, b),
+            lambda: (a.float() + b.float()).to(torch.bfloat16),
+        )
+        report(
+            "euler_step_bf16",
+            shape,
+            lambda: ops.euler_step_bf16(a, b, -0.125),
+            lambda: (a.float() + b.float() * -0.125).to(torch.bfloat16),
+        )
+        # The fused and eager in-place updates each get their own buffer.
+        residual = torch.randn(shape, device="cuda", dtype=torch.bfloat16)
+        residual_ref = residual.clone()
+        report(
+            "cfg_combine_bf16",
+            shape,
+            lambda: ops.cfg_combine_into_residual_bf16(residual, a, b, 4.5),
+            lambda: residual_ref.add_((b.float() + 4.5 * (a.float() - b.float())).to(torch.bfloat16)),
+        )
+    video_shapes = [(1, 4, 5, 16, 16), (2, 8, 9, 32, 32), (1, 16, 17, 64, 64)]
+    for shape in video_shapes:
+        video = torch.randn(shape, device="cuda", dtype=torch.bfloat16)
+        batch, channels, _, height, width = shape
+        cond = torch.randn((batch, channels, height, width), device="cuda", dtype=torch.bfloat16)
+        video_ref = video.clone()
+        report(
+            "teacher_force_first_frame",
+            shape,
+            lambda: ops.teacher_force_first_frame_bf16(video, cond),
+            lambda: video_ref[:, :, 0].copy_(cond),
+        )
+        report(
+            "decode_postprocess",
+            shape,
+            lambda: ops.motus_decode_postprocess_bf16_to_fp32(video),
+            lambda: ((video[:, :, 1:].float() + 1.0) * 0.5).clamp(0.0, 1.0).contiguous(),
+        )
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

build/torch211-cxx11-cu128-x86_64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""FlashRT diffusion step helper kernels."""
+from __future__ import annotations
+from typing import Optional
+import torch
+from ._ops import add_op_namespace_prefix, ops
+def _check_same_shape(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor | None = None) -> None:
+    """Raise ``RuntimeError`` unless ``b`` (and ``c``, when given) has the same shape as ``a``."""
+    reference = a.shape
+    if reference != b.shape:
+        raise RuntimeError("input tensors must have the same shape")
+    if c is None:
+        return
+    if reference != c.shape:
+        raise RuntimeError("output tensor must have the same shape as inputs")
+@torch.library.register_fake(add_op_namespace_prefix("add_bf16_out"))
+def _add_bf16_out_fake(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor) -> None:
+    """Fake implementation of ``add_bf16_out``: validate shapes only; nothing is returned."""
+    _check_same_shape(a, b, out)
+@torch.library.register_fake(add_op_namespace_prefix("euler_step_bf16_out"))
+def _euler_step_bf16_out_fake(
+    latent: torch.Tensor,
+    velocity: torch.Tensor,
+    dt: float,
+    out: torch.Tensor,
+) -> None:
+    """Fake implementation of ``euler_step_bf16_out``: validate shapes only; ``dt`` is not inspected."""
+    _check_same_shape(latent, velocity, out)
+@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_bf16"))
+def _cfg_combine_into_residual_bf16_fake(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> None:
+    """Fake implementation of ``cfg_combine_into_residual_bf16``: validate shapes only.
+    Raises ``RuntimeError`` if the three tensors do not share one shape.
+    """
+    # ``residual`` is the tensor updated in place, so it takes the "output"
+    # slot; the two velocity inputs are compared with each other first. This
+    # keeps each error message pointing at the right kind of tensor.
+    _check_same_shape(v_cond, v_uncond, residual)
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_fp16"))
+def _cfg_combine_into_residual_fp16_fake(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> None:
+    """Fake implementation of ``cfg_combine_into_residual_fp16``: validate shapes only.
+    Raises ``RuntimeError`` if the three tensors do not share one shape.
+    """
+    # ``residual`` is the tensor updated in place, so it takes the "output"
+    # slot; the two velocity inputs are compared with each other first. This
+    # keeps each error message pointing at the right kind of tensor.
+    _check_same_shape(v_cond, v_uncond, residual)
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("teacher_force_first_frame_bf16"))
+def _teacher_force_first_frame_bf16_fake(
+    video_latent: torch.Tensor,
+    cond_latent: torch.Tensor,
+) -> None:
+    """Fake implementation of ``teacher_force_first_frame_bf16``: validate rank and shapes only."""
+    if video_latent.dim() != 5:
+        raise RuntimeError("video_latent must have shape (B, C, T, H, W)")
+    batch, channels, _frames, height, width = video_latent.shape
+    # The conditioning latent must match the video latent with the T axis removed.
+    if cond_latent.shape != (batch, channels, height, width):
+        raise RuntimeError("cond_latent must have shape (B, C, H, W)")
+@torch.library.register_fake(add_op_namespace_prefix("motus_decode_postprocess_bf16_to_fp32"))
+def _motus_decode_postprocess_bf16_to_fp32_fake(
+    decoded: torch.Tensor,
+    out: torch.Tensor,
+) -> None:
+    """Fake implementation of ``motus_decode_postprocess_bf16_to_fp32``: validate rank and shapes only."""
+    if decoded.dim() != 5:
+        raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
+    batch, channels, frames_in, height, width = decoded.shape
+    if frames_in < 2:
+        raise RuntimeError("decoded T_in must be >= 2")
+    # The output holds one frame fewer than the input.
+    if out.shape != (batch, channels, frames_in - 1, height, width):
+        raise RuntimeError("out must have shape (B, C, T_in - 1, H, W)")
+@torch.library.register_fake(add_op_namespace_prefix("cast_bf16_to_fp32"))
+def _cast_bf16_to_fp32_fake(src: torch.Tensor, dst: torch.Tensor) -> None:
+    """Fake implementation of ``cast_bf16_to_fp32``: require ``src`` and ``dst`` to share a shape."""
+    if src.shape != dst.shape:
+        raise RuntimeError("src and dst must have the same shape")
+def add_bf16(a: torch.Tensor, b: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """Compute ``a + b`` for contiguous BF16 CUDA tensors and return the result tensor.
+    When ``out`` is omitted a new tensor is allocated with ``torch.empty_like(a)``;
+    otherwise the supplied ``out`` is written and returned.
+    """
+    result = torch.empty_like(a) if out is None else out
+    ops.add_bf16_out(a, b, result)
+    return result
+def euler_step_bf16(
+    latent: torch.Tensor,
+    velocity: torch.Tensor,
+    dt: float,
+    *,
+    out: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Compute ``latent + velocity * dt`` for BF16 CUDA tensors and return the result tensor.
+    When ``out`` is omitted a new tensor is allocated with ``torch.empty_like(latent)``.
+    """
+    result = torch.empty_like(latent) if out is None else out
+    ops.euler_step_bf16_out(latent, velocity, float(dt), result)
+    return result
+def cfg_combine_into_residual_bf16(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> torch.Tensor:
+    """Update ``residual`` in place and return it.
+    Documented update: ``residual += v_uncond + beta * (v_cond - v_uncond)``.
+    """
+    guidance_scale = float(beta)
+    ops.cfg_combine_into_residual_bf16(residual, v_cond, v_uncond, guidance_scale)
+    return residual
+def cfg_combine_into_residual_fp16(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> torch.Tensor:
+    """FP16 variant of the classifier-free guidance residual combine; returns ``residual``."""
+    guidance_scale = float(beta)
+    ops.cfg_combine_into_residual_fp16(residual, v_cond, v_uncond, guidance_scale)
+    return residual
+def teacher_force_first_frame_bf16(video_latent: torch.Tensor, cond_latent: torch.Tensor) -> torch.Tensor:
+    """Write ``cond_latent`` into frame 0 of ``video_latent`` and return ``video_latent``.
+    Documented copy: ``cond_latent[:, :, :, :]`` into ``video_latent[:, :, 0, :, :]``.
+    """
+    target = video_latent
+    ops.teacher_force_first_frame_bf16(target, cond_latent)
+    return target
+def motus_decode_postprocess_bf16_to_fp32(
+    decoded: torch.Tensor,
+    *,
+    out: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Drop the first frame and map BF16 decoded latents from [-1, 1] to [0, 1].
+    ``decoded`` has shape (B, C, T_in, H, W). When ``out`` is omitted, an FP32
+    tensor of shape (B, C, T_in - 1, H, W) is allocated on ``decoded``'s device.
+    Raises ``RuntimeError`` if the output has to be allocated here and
+    ``decoded`` is not 5-D or has fewer than two frames.
+    """
+    if out is None:
+        # Validate before indexing ``shape`` so malformed input fails with the
+        # same messages the fake implementation uses, not an IndexError or a
+        # negative-dimension allocation error.
+        if decoded.dim() != 5:
+            raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
+        batch, channels, frames_in, height, width = decoded.shape
+        if frames_in < 2:
+            raise RuntimeError("decoded T_in must be >= 2")
+        out = torch.empty(
+            (batch, channels, frames_in - 1, height, width),
+            device=decoded.device,
+            dtype=torch.float32,
+        )
+    ops.motus_decode_postprocess_bf16_to_fp32(decoded, out)
+    return out
+def cast_bf16_to_fp32(src: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """Cast a BF16 CUDA tensor to FP32, writing into ``out`` or a newly allocated FP32 tensor."""
+    target = torch.empty_like(src, dtype=torch.float32) if out is None else out
+    ops.cast_bf16_to_fp32(src, target)
+    return target
+__all__ = [
+    "add_bf16",
+    "cast_bf16_to_fp32",
+    "cfg_combine_into_residual_bf16",
+    "cfg_combine_into_residual_fp16",
+    "euler_step_bf16",
+    "motus_decode_postprocess_bf16_to_fp32",
+    "teacher_force_first_frame_bf16",
+]

build/torch211-cxx11-cu128-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:59e58699cee217ce4eccccb5528e4cce71fc0c8ea41ba83a9d67136dae7b32fb
+size 790144

build/torch211-cxx11-cu128-x86_64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _diffusion_step_ops_cuda_5596053
+ops = torch.ops._diffusion_step_ops_cuda_5596053
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace, as ``<namespace>::<op_name>``.
+    """
+    namespace = "_diffusion_step_ops_cuda_5596053"
+    return f"{namespace}::{op_name}"

build/torch211-cxx11-cu128-x86_64-linux/diffusion_step_ops/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` as a module and return it.
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of the absolute path. Raises ``ImportError`` when no import spec or
+    loader can be built for the file. Exceptions raised while the file executes
+    propagate, and the partially initialised module is unregistered first.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    # A spec without a loader cannot be executed; report it as an import
+    # failure instead of an AttributeError on ``None``.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register before executing so imports performed while the file runs
+    # (for example relative imports) can find the module.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module behind in ``sys.modules``.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch211-cxx11-cu128-x86_64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "name": "diffusion-step-ops",
+  "id": "_diffusion_step_ops_cuda_5596053",
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "10.1",
+      "12.0+PTX",
+      "7.0",
+      "7.2",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}

build/torch211-cxx11-cu130-x86_64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""FlashRT diffusion step helper kernels."""
+from __future__ import annotations
+from typing import Optional
+import torch
+from ._ops import add_op_namespace_prefix, ops
+def _check_same_shape(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor | None = None) -> None:
+    """Raise ``RuntimeError`` unless ``b`` (and ``c``, when given) has the same shape as ``a``."""
+    reference = a.shape
+    if reference != b.shape:
+        raise RuntimeError("input tensors must have the same shape")
+    if c is None:
+        return
+    if reference != c.shape:
+        raise RuntimeError("output tensor must have the same shape as inputs")
+@torch.library.register_fake(add_op_namespace_prefix("add_bf16_out"))
+def _add_bf16_out_fake(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor) -> None:
+    """Fake implementation of ``add_bf16_out``: validate shapes only; nothing is returned."""
+    _check_same_shape(a, b, out)
+@torch.library.register_fake(add_op_namespace_prefix("euler_step_bf16_out"))
+def _euler_step_bf16_out_fake(
+    latent: torch.Tensor,
+    velocity: torch.Tensor,
+    dt: float,
+    out: torch.Tensor,
+) -> None:
+    """Fake implementation of ``euler_step_bf16_out``: validate shapes only; ``dt`` is not inspected."""
+    _check_same_shape(latent, velocity, out)
+@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_bf16"))
+def _cfg_combine_into_residual_bf16_fake(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> None:
+    """Fake implementation of ``cfg_combine_into_residual_bf16``: validate shapes only.
+    Raises ``RuntimeError`` if the three tensors do not share one shape.
+    """
+    # ``residual`` is the tensor updated in place, so it takes the "output"
+    # slot; the two velocity inputs are compared with each other first. This
+    # keeps each error message pointing at the right kind of tensor.
+    _check_same_shape(v_cond, v_uncond, residual)
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_fp16"))
+def _cfg_combine_into_residual_fp16_fake(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> None:
+    """Fake implementation of ``cfg_combine_into_residual_fp16``: validate shapes only.
+    Raises ``RuntimeError`` if the three tensors do not share one shape.
+    """
+    # ``residual`` is the tensor updated in place, so it takes the "output"
+    # slot; the two velocity inputs are compared with each other first. This
+    # keeps each error message pointing at the right kind of tensor.
+    _check_same_shape(v_cond, v_uncond, residual)
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("teacher_force_first_frame_bf16"))
+def _teacher_force_first_frame_bf16_fake(
+    video_latent: torch.Tensor,
+    cond_latent: torch.Tensor,
+) -> None:
+    """Fake implementation of ``teacher_force_first_frame_bf16``: validate rank and shapes only."""
+    if video_latent.dim() != 5:
+        raise RuntimeError("video_latent must have shape (B, C, T, H, W)")
+    batch, channels, _frames, height, width = video_latent.shape
+    # The conditioning latent must match the video latent with the T axis removed.
+    if cond_latent.shape != (batch, channels, height, width):
+        raise RuntimeError("cond_latent must have shape (B, C, H, W)")
+@torch.library.register_fake(add_op_namespace_prefix("motus_decode_postprocess_bf16_to_fp32"))
+def _motus_decode_postprocess_bf16_to_fp32_fake(
+    decoded: torch.Tensor,
+    out: torch.Tensor,
+) -> None:
+    """Fake implementation of ``motus_decode_postprocess_bf16_to_fp32``: validate rank and shapes only."""
+    if decoded.dim() != 5:
+        raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
+    batch, channels, frames_in, height, width = decoded.shape
+    if frames_in < 2:
+        raise RuntimeError("decoded T_in must be >= 2")
+    # The output holds one frame fewer than the input.
+    if out.shape != (batch, channels, frames_in - 1, height, width):
+        raise RuntimeError("out must have shape (B, C, T_in - 1, H, W)")
+@torch.library.register_fake(add_op_namespace_prefix("cast_bf16_to_fp32"))
+def _cast_bf16_to_fp32_fake(src: torch.Tensor, dst: torch.Tensor) -> None:
+    """Fake implementation of ``cast_bf16_to_fp32``: require ``src`` and ``dst`` to share a shape."""
+    if src.shape != dst.shape:
+        raise RuntimeError("src and dst must have the same shape")
+def add_bf16(a: torch.Tensor, b: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """Compute ``a + b`` for contiguous BF16 CUDA tensors and return the result tensor.
+    When ``out`` is omitted a new tensor is allocated with ``torch.empty_like(a)``;
+    otherwise the supplied ``out`` is written and returned.
+    """
+    result = torch.empty_like(a) if out is None else out
+    ops.add_bf16_out(a, b, result)
+    return result
+def euler_step_bf16(
+    latent: torch.Tensor,
+    velocity: torch.Tensor,
+    dt: float,
+    *,
+    out: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Compute ``latent + velocity * dt`` for BF16 CUDA tensors and return the result tensor.
+    When ``out`` is omitted a new tensor is allocated with ``torch.empty_like(latent)``.
+    """
+    result = torch.empty_like(latent) if out is None else out
+    ops.euler_step_bf16_out(latent, velocity, float(dt), result)
+    return result
+def cfg_combine_into_residual_bf16(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> torch.Tensor:
+    """Update ``residual`` in place and return it.
+    Documented update: ``residual += v_uncond + beta * (v_cond - v_uncond)``.
+    """
+    guidance_scale = float(beta)
+    ops.cfg_combine_into_residual_bf16(residual, v_cond, v_uncond, guidance_scale)
+    return residual
+def cfg_combine_into_residual_fp16(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> torch.Tensor:
+    """FP16 variant of the classifier-free guidance residual combine; returns ``residual``."""
+    guidance_scale = float(beta)
+    ops.cfg_combine_into_residual_fp16(residual, v_cond, v_uncond, guidance_scale)
+    return residual
+def teacher_force_first_frame_bf16(video_latent: torch.Tensor, cond_latent: torch.Tensor) -> torch.Tensor:
+    """Write ``cond_latent`` into frame 0 of ``video_latent`` and return ``video_latent``.
+    Documented copy: ``cond_latent[:, :, :, :]`` into ``video_latent[:, :, 0, :, :]``.
+    """
+    target = video_latent
+    ops.teacher_force_first_frame_bf16(target, cond_latent)
+    return target
+def motus_decode_postprocess_bf16_to_fp32(
+    decoded: torch.Tensor,
+    *,
+    out: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Drop the first frame and map BF16 decoded latents from [-1, 1] to [0, 1].
+    ``decoded`` has shape (B, C, T_in, H, W). When ``out`` is omitted, an FP32
+    tensor of shape (B, C, T_in - 1, H, W) is allocated on ``decoded``'s device.
+    Raises ``RuntimeError`` if the output has to be allocated here and
+    ``decoded`` is not 5-D or has fewer than two frames.
+    """
+    if out is None:
+        # Validate before indexing ``shape`` so malformed input fails with the
+        # same messages the fake implementation uses, not an IndexError or a
+        # negative-dimension allocation error.
+        if decoded.dim() != 5:
+            raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
+        batch, channels, frames_in, height, width = decoded.shape
+        if frames_in < 2:
+            raise RuntimeError("decoded T_in must be >= 2")
+        out = torch.empty(
+            (batch, channels, frames_in - 1, height, width),
+            device=decoded.device,
+            dtype=torch.float32,
+        )
+    ops.motus_decode_postprocess_bf16_to_fp32(decoded, out)
+    return out
+def cast_bf16_to_fp32(src: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """Cast a BF16 CUDA tensor to FP32, writing into ``out`` or a newly allocated FP32 tensor."""
+    target = torch.empty_like(src, dtype=torch.float32) if out is None else out
+    ops.cast_bf16_to_fp32(src, target)
+    return target
+__all__ = [
+    "add_bf16",
+    "cast_bf16_to_fp32",
+    "cfg_combine_into_residual_bf16",
+    "cfg_combine_into_residual_fp16",
+    "euler_step_bf16",
+    "motus_decode_postprocess_bf16_to_fp32",
+    "teacher_force_first_frame_bf16",
+]

build/torch211-cxx11-cu130-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d50a92d138753c191513d2e5b824782cff1867d6bf496652a6dd120cd591d3c6
+size 752272

build/torch211-cxx11-cu130-x86_64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _diffusion_step_ops_cuda_5596053
+ops = torch.ops._diffusion_step_ops_cuda_5596053
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace, as ``<namespace>::<op_name>``.
+    """
+    namespace = "_diffusion_step_ops_cuda_5596053"
+    return f"{namespace}::{op_name}"

build/torch211-cxx11-cu130-x86_64-linux/diffusion_step_ops/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` as a module and return it.
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of the absolute path. Raises ``ImportError`` when no import spec or
+    loader can be built for the file. Exceptions raised while the file executes
+    propagate, and the partially initialised module is unregistered first.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    # A spec without a loader cannot be executed; report it as an import
+    # failure instead of an AttributeError on ``None``.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register before executing so imports performed while the file runs
+    # (for example relative imports) can find the module.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module behind in ``sys.modules``.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch211-cxx11-cu130-x86_64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "name": "diffusion-step-ops",
+  "id": "_diffusion_step_ops_cuda_5596053",
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0",
+      "12.1+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}

build/torch212-cxx11-cu130-x86_64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""FlashRT diffusion step helper kernels."""
+from __future__ import annotations
+from typing import Optional
+import torch
+from ._ops import add_op_namespace_prefix, ops
+def _check_same_shape(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor | None = None) -> None:
+    """Raise ``RuntimeError`` unless ``b`` (and ``c``, when given) has the same shape as ``a``."""
+    reference = a.shape
+    if reference != b.shape:
+        raise RuntimeError("input tensors must have the same shape")
+    if c is None:
+        return
+    if reference != c.shape:
+        raise RuntimeError("output tensor must have the same shape as inputs")
+@torch.library.register_fake(add_op_namespace_prefix("add_bf16_out"))
+def _add_bf16_out_fake(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor) -> None:
+    """Fake implementation of ``add_bf16_out``: validate shapes only; nothing is returned."""
+    _check_same_shape(a, b, out)
+@torch.library.register_fake(add_op_namespace_prefix("euler_step_bf16_out"))
+def _euler_step_bf16_out_fake(
+    latent: torch.Tensor,
+    velocity: torch.Tensor,
+    dt: float,
+    out: torch.Tensor,
+) -> None:
+    """Fake implementation of ``euler_step_bf16_out``: validate shapes only; ``dt`` is not inspected."""
+    _check_same_shape(latent, velocity, out)
+@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_bf16"))
+def _cfg_combine_into_residual_bf16_fake(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> None:
+    """Fake implementation of ``cfg_combine_into_residual_bf16``: validate shapes only.
+    Raises ``RuntimeError`` if the three tensors do not share one shape.
+    """
+    # ``residual`` is the tensor updated in place, so it takes the "output"
+    # slot; the two velocity inputs are compared with each other first. This
+    # keeps each error message pointing at the right kind of tensor.
+    _check_same_shape(v_cond, v_uncond, residual)
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_fp16"))
+def _cfg_combine_into_residual_fp16_fake(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> None:
+    """Fake implementation of ``cfg_combine_into_residual_fp16``: validate shapes only.
+    Raises ``RuntimeError`` if the three tensors do not share one shape.
+    """
+    # ``residual`` is the tensor updated in place, so it takes the "output"
+    # slot; the two velocity inputs are compared with each other first. This
+    # keeps each error message pointing at the right kind of tensor.
+    _check_same_shape(v_cond, v_uncond, residual)
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("teacher_force_first_frame_bf16"))
+def _teacher_force_first_frame_bf16_fake(
+    video_latent: torch.Tensor,
+    cond_latent: torch.Tensor,
+) -> None:
+    """Fake implementation of ``teacher_force_first_frame_bf16``: validate rank and shapes only."""
+    if video_latent.dim() != 5:
+        raise RuntimeError("video_latent must have shape (B, C, T, H, W)")
+    batch, channels, _frames, height, width = video_latent.shape
+    # The conditioning latent must match the video latent with the T axis removed.
+    if cond_latent.shape != (batch, channels, height, width):
+        raise RuntimeError("cond_latent must have shape (B, C, H, W)")
+@torch.library.register_fake(add_op_namespace_prefix("motus_decode_postprocess_bf16_to_fp32"))
+def _motus_decode_postprocess_bf16_to_fp32_fake(
+    decoded: torch.Tensor,
+    out: torch.Tensor,
+) -> None:
+    """Fake implementation of ``motus_decode_postprocess_bf16_to_fp32``: validate rank and shapes only."""
+    if decoded.dim() != 5:
+        raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
+    batch, channels, frames_in, height, width = decoded.shape
+    if frames_in < 2:
+        raise RuntimeError("decoded T_in must be >= 2")
+    # The output holds one frame fewer than the input.
+    if out.shape != (batch, channels, frames_in - 1, height, width):
+        raise RuntimeError("out must have shape (B, C, T_in - 1, H, W)")
+@torch.library.register_fake(add_op_namespace_prefix("cast_bf16_to_fp32"))
+def _cast_bf16_to_fp32_fake(src: torch.Tensor, dst: torch.Tensor) -> None:
+    """Fake implementation of ``cast_bf16_to_fp32``: require ``src`` and ``dst`` to share a shape."""
+    if src.shape != dst.shape:
+        raise RuntimeError("src and dst must have the same shape")
+def add_bf16(a: torch.Tensor, b: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """Compute ``a + b`` for contiguous BF16 CUDA tensors and return the result tensor.
+    When ``out`` is omitted a new tensor is allocated with ``torch.empty_like(a)``;
+    otherwise the supplied ``out`` is written and returned.
+    """
+    result = torch.empty_like(a) if out is None else out
+    ops.add_bf16_out(a, b, result)
+    return result
+def euler_step_bf16(
+    latent: torch.Tensor,
+    velocity: torch.Tensor,
+    dt: float,
+    *,
+    out: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Compute ``latent + velocity * dt`` for BF16 CUDA tensors and return the result tensor.
+    When ``out`` is omitted a new tensor is allocated with ``torch.empty_like(latent)``.
+    """
+    result = torch.empty_like(latent) if out is None else out
+    ops.euler_step_bf16_out(latent, velocity, float(dt), result)
+    return result
+def cfg_combine_into_residual_bf16(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> torch.Tensor:
+    """Update ``residual`` in place and return it.
+    Documented update: ``residual += v_uncond + beta * (v_cond - v_uncond)``.
+    """
+    guidance_scale = float(beta)
+    ops.cfg_combine_into_residual_bf16(residual, v_cond, v_uncond, guidance_scale)
+    return residual
+def cfg_combine_into_residual_fp16(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> torch.Tensor:
+    """FP16 variant of the classifier-free guidance residual combine; returns ``residual``."""
+    guidance_scale = float(beta)
+    ops.cfg_combine_into_residual_fp16(residual, v_cond, v_uncond, guidance_scale)
+    return residual
+def teacher_force_first_frame_bf16(video_latent: torch.Tensor, cond_latent: torch.Tensor) -> torch.Tensor:
+    """Write ``cond_latent`` into frame 0 of ``video_latent`` and return ``video_latent``.
+    Documented copy: ``cond_latent[:, :, :, :]`` into ``video_latent[:, :, 0, :, :]``.
+    """
+    target = video_latent
+    ops.teacher_force_first_frame_bf16(target, cond_latent)
+    return target
+def motus_decode_postprocess_bf16_to_fp32(
+    decoded: torch.Tensor,
+    *,
+    out: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Drop the first frame and map BF16 decoded latents from [-1, 1] to [0, 1].
+    ``decoded`` has shape (B, C, T_in, H, W). When ``out`` is omitted, an FP32
+    tensor of shape (B, C, T_in - 1, H, W) is allocated on ``decoded``'s device.
+    Raises ``RuntimeError`` if the output has to be allocated here and
+    ``decoded`` is not 5-D or has fewer than two frames.
+    """
+    if out is None:
+        # Validate before indexing ``shape`` so malformed input fails with the
+        # same messages the fake implementation uses, not an IndexError or a
+        # negative-dimension allocation error.
+        if decoded.dim() != 5:
+            raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
+        batch, channels, frames_in, height, width = decoded.shape
+        if frames_in < 2:
+            raise RuntimeError("decoded T_in must be >= 2")
+        out = torch.empty(
+            (batch, channels, frames_in - 1, height, width),
+            device=decoded.device,
+            dtype=torch.float32,
+        )
+    ops.motus_decode_postprocess_bf16_to_fp32(decoded, out)
+    return out
+def cast_bf16_to_fp32(src: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """Cast a BF16 CUDA tensor to FP32, writing into ``out`` or a newly allocated FP32 tensor."""
+    target = torch.empty_like(src, dtype=torch.float32) if out is None else out
+    ops.cast_bf16_to_fp32(src, target)
+    return target
+__all__ = [
+    "add_bf16",
+    "cast_bf16_to_fp32",
+    "cfg_combine_into_residual_bf16",
+    "cfg_combine_into_residual_fp16",
+    "euler_step_bf16",
+    "motus_decode_postprocess_bf16_to_fp32",
+    "teacher_force_first_frame_bf16",
+]

build/torch212-cxx11-cu130-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dad2459394957f4fe51171e3fbae0b2b97afb8e17da62e4db8a89a9083e6efd5
+size 762920

build/torch212-cxx11-cu130-x86_64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _diffusion_step_ops_cuda_5596053
+ops = torch.ops._diffusion_step_ops_cuda_5596053
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace, as ``<namespace>::<op_name>``.
+    """
+    namespace = "_diffusion_step_ops_cuda_5596053"
+    return f"{namespace}::{op_name}"

build/torch212-cxx11-cu130-x86_64-linux/diffusion_step_ops/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Execute the Python file at ``file_path`` as a module and return it.
+    The module is registered in ``sys.modules`` under a name derived from the
+    hash of the absolute path. Raises ``ImportError`` when no import spec or
+    loader can be built for the file. Exceptions raised while the file executes
+    propagate, and the partially initialised module is unregistered first.
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    # A spec without a loader cannot be executed; report it as an import
+    # failure instead of an AttributeError on ``None``.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register before executing so imports performed while the file runs
+    # (for example relative imports) can find the module.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module behind in ``sys.modules``.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch212-cxx11-cu130-x86_64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "name": "diffusion-step-ops",
+  "id": "_diffusion_step_ops_cuda_5596053",
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0",
+      "12.1+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}

build/torch212-cxx11-cu132-x86_64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""FlashRT diffusion step helper kernels."""
+from __future__ import annotations
+from typing import Optional
+import torch
+from ._ops import add_op_namespace_prefix, ops
+def _check_same_shape(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor | None = None) -> None:
+    """Raise ``RuntimeError`` unless ``b`` (and ``c``, when given) has the same shape as ``a``."""
+    reference = a.shape
+    if reference != b.shape:
+        raise RuntimeError("input tensors must have the same shape")
+    if c is None:
+        return
+    if reference != c.shape:
+        raise RuntimeError("output tensor must have the same shape as inputs")
+@torch.library.register_fake(add_op_namespace_prefix("add_bf16_out"))
+def _add_bf16_out_fake(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor) -> None:
+    """Fake implementation of ``add_bf16_out``: validate shapes only; nothing is returned."""
+    _check_same_shape(a, b, out)
+@torch.library.register_fake(add_op_namespace_prefix("euler_step_bf16_out"))
+def _euler_step_bf16_out_fake(
+    latent: torch.Tensor,
+    velocity: torch.Tensor,
+    dt: float,
+    out: torch.Tensor,
+) -> None:
+    """Fake implementation of ``euler_step_bf16_out``: validate shapes only; ``dt`` is not inspected."""
+    _check_same_shape(latent, velocity, out)
+@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_bf16"))
+def _cfg_combine_into_residual_bf16_fake(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> None:
+    """Fake implementation of ``cfg_combine_into_residual_bf16``: validate shapes only.
+    Raises ``RuntimeError`` if the three tensors do not share one shape.
+    """
+    # ``residual`` is the tensor updated in place, so it takes the "output"
+    # slot; the two velocity inputs are compared with each other first. This
+    # keeps each error message pointing at the right kind of tensor.
+    _check_same_shape(v_cond, v_uncond, residual)
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_fp16"))
+def _cfg_combine_into_residual_fp16_fake(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> None:
+    """Fake implementation of ``cfg_combine_into_residual_fp16``: validate shapes only.
+    Raises ``RuntimeError`` if the three tensors do not share one shape.
+    """
+    # ``residual`` is the tensor updated in place, so it takes the "output"
+    # slot; the two velocity inputs are compared with each other first. This
+    # keeps each error message pointing at the right kind of tensor.
+    _check_same_shape(v_cond, v_uncond, residual)
+    return None
+@torch.library.register_fake(add_op_namespace_prefix("teacher_force_first_frame_bf16"))
+def _teacher_force_first_frame_bf16_fake(
+    video_latent: torch.Tensor,
+    cond_latent: torch.Tensor,
+) -> None:
+    """Fake implementation of ``teacher_force_first_frame_bf16``: validate rank and shapes only."""
+    if video_latent.dim() != 5:
+        raise RuntimeError("video_latent must have shape (B, C, T, H, W)")
+    batch, channels, _frames, height, width = video_latent.shape
+    # The conditioning latent must match the video latent with the T axis removed.
+    if cond_latent.shape != (batch, channels, height, width):
+        raise RuntimeError("cond_latent must have shape (B, C, H, W)")
+@torch.library.register_fake(add_op_namespace_prefix("motus_decode_postprocess_bf16_to_fp32"))
+def _motus_decode_postprocess_bf16_to_fp32_fake(
+    decoded: torch.Tensor,
+    out: torch.Tensor,
+) -> None:
+    """Fake implementation of ``motus_decode_postprocess_bf16_to_fp32``: validate rank and shapes only."""
+    if decoded.dim() != 5:
+        raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
+    batch, channels, frames_in, height, width = decoded.shape
+    if frames_in < 2:
+        raise RuntimeError("decoded T_in must be >= 2")
+    # The output holds one frame fewer than the input.
+    if out.shape != (batch, channels, frames_in - 1, height, width):
+        raise RuntimeError("out must have shape (B, C, T_in - 1, H, W)")
+@torch.library.register_fake(add_op_namespace_prefix("cast_bf16_to_fp32"))
+def _cast_bf16_to_fp32_fake(src: torch.Tensor, dst: torch.Tensor) -> None:
+    if src.shape != dst.shape:
+        raise RuntimeError("src and dst must have the same shape")
+    return None
+def add_bf16(a: torch.Tensor, b: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """Return ``a + b`` for contiguous BF16 CUDA tensors."""
+    if out is None:
+        out = torch.empty_like(a)
+    ops.add_bf16_out(a, b, out)
+    return out
+def euler_step_bf16(
+    latent: torch.Tensor,
+    velocity: torch.Tensor,
+    dt: float,
+    *,
+    out: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Return ``latent + velocity * dt`` for BF16 CUDA tensors."""
+    if out is None:
+        out = torch.empty_like(latent)
+    ops.euler_step_bf16_out(latent, velocity, float(dt), out)
+    return out
+def cfg_combine_into_residual_bf16(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> torch.Tensor:
+    """In-place ``residual += v_uncond + beta * (v_cond - v_uncond)``."""
+    ops.cfg_combine_into_residual_bf16(residual, v_cond, v_uncond, float(beta))
+    return residual
+def cfg_combine_into_residual_fp16(
+    residual: torch.Tensor,
+    v_cond: torch.Tensor,
+    v_uncond: torch.Tensor,
+    beta: float,
+) -> torch.Tensor:
+    """FP16 variant of classifier-free guidance residual combine."""
+    ops.cfg_combine_into_residual_fp16(residual, v_cond, v_uncond, float(beta))
+    return residual
+def teacher_force_first_frame_bf16(video_latent: torch.Tensor, cond_latent: torch.Tensor) -> torch.Tensor:
+    """Copy ``cond_latent[:, :, :, :]`` into ``video_latent[:, :, 0, :, :]``."""
+    ops.teacher_force_first_frame_bf16(video_latent, cond_latent)
+    return video_latent
+def motus_decode_postprocess_bf16_to_fp32(
+    decoded: torch.Tensor,
+    *,
+    out: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Drop the first frame and map BF16 decoded latents from [-1, 1] to [0, 1]."""
+    if out is None:
+        out = torch.empty(
+            (decoded.shape[0], decoded.shape[1], decoded.shape[2] - 1, decoded.shape[3], decoded.shape[4]),
+            device=decoded.device,
+            dtype=torch.float32,
+        )
+    ops.motus_decode_postprocess_bf16_to_fp32(decoded, out)
+    return out
+def cast_bf16_to_fp32(src: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """Cast a BF16 CUDA tensor to FP32."""
+    if out is None:
+        out = torch.empty_like(src, dtype=torch.float32)
+    ops.cast_bf16_to_fp32(src, out)
+    return out
+# Public API of this module: the user-facing wrapper functions, listed
+# alphabetically. The ``_*_fake`` registrations are intentionally not exported.
+__all__ = [
+    "add_bf16",
+    "cast_bf16_to_fp32",
+    "cfg_combine_into_residual_bf16",
+    "cfg_combine_into_residual_fp16",
+    "euler_step_bf16",
+    "motus_decode_postprocess_bf16_to_fp32",
+    "teacher_force_first_frame_bf16",
+]

build/torch212-cxx11-cu132-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c7471e5d670e1d71d2a45e34b081e1c73a6ea76b796c3a837e1f9767d2e4197
+size 730152

build/torch212-cxx11-cu132-x86_64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _diffusion_step_ops_cuda_5596053
+# Handle to the ``_diffusion_step_ops_cuda_5596053`` namespace under ``torch.ops``;
+# the ops in it are presumably registered by importing the extension module above.
+ops = torch.ops._diffusion_step_ops_cuda_5596053
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_diffusion_step_ops_cuda_5596053::{op_name}"

build/torch212-cxx11-cu132-x86_64-linux/diffusion_step_ops/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch212-cxx11-cu132-x86_64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "name": "diffusion-step-ops",
+  "id": "_diffusion_step_ops_cuda_5596053",
+  "version": 1,
+  "license": "Apache-2.0",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "11.0",
+      "12.0",
+      "12.1+PTX",
+      "7.5",
+      "8.0",
+      "8.6",
+      "8.7",
+      "8.9",
+      "9.0"
+    ]
+  }
+}