liangsu9988 committed on
Commit
f3d5b21
·
verified ·
1 Parent(s): 1bccc89

Uploaded using `kernel-builder`.

Browse files
benchmarks/benchmark.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Benchmark diffusion-step-ops against PyTorch eager references."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ import torch
11
+
12
+
13
+ PACKAGE = Path(__file__).resolve().parents[1]
14
+ sys.path.insert(0, str(PACKAGE / "tests"))
15
+ from test_diffusion_step_ops import load_installed_ops, load_source_ops # noqa: E402
16
+
17
+
18
def bench(fn, warmup: int, iters: int) -> float:
    """Return the mean CUDA time of one ``fn()`` call, in microseconds.

    ``fn`` is called ``warmup`` times untimed, then ``iters`` times between
    two CUDA events recorded on the current stream.

    Args:
        fn: Zero-argument callable that launches the work to be timed.
        warmup: Number of untimed calls made before measuring.
        iters: Number of timed calls; must be at least 1.

    Returns:
        Elapsed time between the two events divided by ``iters``, converted
        from the milliseconds reported by ``elapsed_time`` to microseconds.

    Raises:
        ValueError: If ``iters`` is not positive (the average would otherwise
            divide by zero or come out negative).
    """
    if iters <= 0:
        raise ValueError(f"iters must be a positive integer, got {iters}")

    for _ in range(warmup):
        fn()
    # Drain the warm-up work so it is not attributed to the timed region.
    torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    # Both events must have completed before elapsed_time can be queried.
    torch.cuda.synchronize()
    return start.elapsed_time(end) * 1000.0 / iters
30
+
31
+
32
def main() -> int:
    """Benchmark each op against a PyTorch eager expression and print a Markdown table.

    Command-line flags select the backend (``--backend``, ``--artifact``) and
    the warm-up / timed iteration counts (``--warmup``, ``--iters``).

    Returns:
        ``0`` once every row has been printed.

    Raises:
        RuntimeError: If ``torch.cuda.is_available()`` is false.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--backend", choices=["source", "installed"], default="source")
    parser.add_argument("--artifact", default=None)
    parser.add_argument("--warmup", type=int, default=100)
    parser.add_argument("--iters", type=int, default=1000)
    args = parser.parse_args()

    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required")
    torch.manual_seed(1234)
    if args.backend == "source":
        ops = load_source_ops()
    else:
        ops = load_installed_ops(args.artifact)

    def report(label, shape, fused_fn, eager_fn):
        # Time the fused op first, then the eager reference, then emit one row.
        fused = bench(fused_fn, args.warmup, args.iters)
        eager = bench(eager_fn, args.warmup, args.iters)
        print(f"| {label} | {tuple(shape)} | {fused:.3f} | {eager:.3f} | {eager / fused:.2f}x |")

    bf16_cuda = {"device": "cuda", "dtype": torch.bfloat16}

    print("| Workload | Shape | FlashRT us | PyTorch eager us | Speedup |")
    print("|---|---:|---:|---:|---:|")

    # Element-wise ops over same-shaped operands.
    for shape in [(1024,), (16384,), (2, 16, 32, 64), (1, 16, 17, 64, 64)]:
        a = torch.randn(shape, **bf16_cuda)
        b = torch.randn(shape, **bf16_cuda)

        report(
            "add_bf16",
            shape,
            lambda: ops.add_bf16(a, b),
            lambda: (a.float() + b.float()).to(torch.bfloat16),
        )
        report(
            "euler_step_bf16",
            shape,
            lambda: ops.euler_step_bf16(a, b, -0.125),
            lambda: (a.float() + b.float() * -0.125).to(torch.bfloat16),
        )

        # The in-place op and its eager reference each update their own buffer.
        residual = torch.randn(shape, **bf16_cuda)
        residual_ref = residual.clone()
        report(
            "cfg_combine_bf16",
            shape,
            lambda: ops.cfg_combine_into_residual_bf16(residual, a, b, 4.5),
            lambda: residual_ref.add_((b.float() + 4.5 * (a.float() - b.float())).to(torch.bfloat16)),
        )

    # Ops over 5-D video tensors.
    for shape in [(1, 4, 5, 16, 16), (2, 8, 9, 32, 32), (1, 16, 17, 64, 64)]:
        batch, channels, _frames, height, width = shape
        video = torch.randn(shape, **bf16_cuda)
        cond = torch.randn((batch, channels, height, width), **bf16_cuda)
        video_ref = video.clone()

        report(
            "teacher_force_first_frame",
            shape,
            lambda: ops.teacher_force_first_frame_bf16(video, cond),
            lambda: video_ref[:, :, 0].copy_(cond),
        )
        report(
            "decode_postprocess",
            shape,
            lambda: ops.motus_decode_postprocess_bf16_to_fp32(video),
            lambda: ((video[:, :, 1:].float() + 1.0) * 0.5).clamp(0.0, 1.0).contiguous(),
        )

    return 0
78
+
79
+
80
+ if __name__ == "__main__":
81
+ raise SystemExit(main())
build/torch211-cxx11-cu128-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FlashRT diffusion step helper kernels."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+
9
+ from ._ops import add_op_namespace_prefix, ops
10
+
11
+
12
def _check_same_shape(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor | None = None) -> None:
    """Raise ``RuntimeError`` unless ``b`` (and ``c``, when given) has the shape of ``a``."""
    reference = a.shape
    if b.shape != reference:
        raise RuntimeError("input tensors must have the same shape")
    if c is None:
        return
    if c.shape != reference:
        raise RuntimeError("output tensor must have the same shape as inputs")
17
+
18
+
19
@torch.library.register_fake(add_op_namespace_prefix("add_bf16_out"))
def _add_bf16_out_fake(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor) -> None:
    """Fake implementation of ``add_bf16_out``: checks shapes only and returns nothing."""
    _check_same_shape(a, b, out)
23
+
24
+
25
@torch.library.register_fake(add_op_namespace_prefix("euler_step_bf16_out"))
def _euler_step_bf16_out_fake(
    latent: torch.Tensor,
    velocity: torch.Tensor,
    dt: float,
    out: torch.Tensor,
) -> None:
    """Fake implementation of ``euler_step_bf16_out``.

    Only verifies that ``latent``, ``velocity`` and ``out`` share a shape;
    ``dt`` is accepted but not inspected.
    """
    _check_same_shape(latent, velocity, out)
34
+
35
+
36
@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_bf16"))
def _cfg_combine_into_residual_bf16_fake(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> None:
    """Fake implementation of ``cfg_combine_into_residual_bf16``.

    Only verifies that the three tensors share a shape; ``beta`` is not
    inspected.
    """
    # v_uncond travels in the helper's third slot, so a mismatch on it is
    # reported with the helper's "output tensor" message.
    _check_same_shape(residual, v_cond, v_uncond)
45
+
46
+
47
@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_fp16"))
def _cfg_combine_into_residual_fp16_fake(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> None:
    """Fake implementation of ``cfg_combine_into_residual_fp16``.

    Only verifies that the three tensors share a shape; ``beta`` is not
    inspected.
    """
    # v_uncond travels in the helper's third slot, so a mismatch on it is
    # reported with the helper's "output tensor" message.
    _check_same_shape(residual, v_cond, v_uncond)
56
+
57
+
58
@torch.library.register_fake(add_op_namespace_prefix("teacher_force_first_frame_bf16"))
def _teacher_force_first_frame_bf16_fake(
    video_latent: torch.Tensor,
    cond_latent: torch.Tensor,
) -> None:
    """Fake implementation of ``teacher_force_first_frame_bf16``.

    Requires a 5-D ``video_latent`` and a ``cond_latent`` whose shape equals
    the video's dimensions 0, 1, 3 and 4.

    Raises:
        RuntimeError: If either shape requirement is violated.
    """
    if video_latent.dim() != 5:
        raise RuntimeError("video_latent must have shape (B, C, T, H, W)")
    batch, channels, _frames, height, width = video_latent.shape
    if cond_latent.shape != (batch, channels, height, width):
        raise RuntimeError("cond_latent must have shape (B, C, H, W)")
73
+
74
+
75
@torch.library.register_fake(add_op_namespace_prefix("motus_decode_postprocess_bf16_to_fp32"))
def _motus_decode_postprocess_bf16_to_fp32_fake(
    decoded: torch.Tensor,
    out: torch.Tensor,
) -> None:
    """Fake implementation of ``motus_decode_postprocess_bf16_to_fp32``.

    Requires a 5-D ``decoded`` with at least two entries along dimension 2
    and an ``out`` of the same shape except one entry shorter on that
    dimension.

    Raises:
        RuntimeError: If any of those requirements is violated.
    """
    if decoded.dim() != 5:
        raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
    batch, channels, frames_in, height, width = decoded.shape
    if frames_in < 2:
        raise RuntimeError("decoded T_in must be >= 2")
    if out.shape != (batch, channels, frames_in - 1, height, width):
        raise RuntimeError("out must have shape (B, C, T_in - 1, H, W)")
88
+
89
+
90
@torch.library.register_fake(add_op_namespace_prefix("cast_bf16_to_fp32"))
def _cast_bf16_to_fp32_fake(src: torch.Tensor, dst: torch.Tensor) -> None:
    """Fake implementation of ``cast_bf16_to_fp32``: ``src`` and ``dst`` must share a shape."""
    shapes_match = src.shape == dst.shape
    if not shapes_match:
        raise RuntimeError("src and dst must have the same shape")
95
+
96
+
97
def add_bf16(a: torch.Tensor, b: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Compute ``a + b`` for contiguous BF16 CUDA tensors.

    The sum is written into ``out`` when supplied; otherwise a tensor is
    allocated with ``torch.empty_like(a)``. The tensor holding the result is
    returned.
    """
    result = torch.empty_like(a) if out is None else out
    ops.add_bf16_out(a, b, result)
    return result
104
+
105
+
106
def euler_step_bf16(
    latent: torch.Tensor,
    velocity: torch.Tensor,
    dt: float,
    *,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Compute ``latent + velocity * dt`` for BF16 CUDA tensors.

    The result is written into ``out`` when supplied; otherwise a tensor is
    allocated with ``torch.empty_like(latent)``. The tensor holding the
    result is returned.
    """
    target = torch.empty_like(latent) if out is None else out
    step = float(dt)  # the op is always handed a Python float
    ops.euler_step_bf16_out(latent, velocity, step, target)
    return target
119
+
120
+
121
def cfg_combine_into_residual_bf16(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> torch.Tensor:
    """Apply ``residual += v_uncond + beta * (v_cond - v_uncond)`` in place.

    Returns:
        The same ``residual`` tensor that was passed in.
    """
    scale = float(beta)  # the op is always handed a Python float
    ops.cfg_combine_into_residual_bf16(residual, v_cond, v_uncond, scale)
    return residual
131
+
132
+
133
def cfg_combine_into_residual_fp16(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> torch.Tensor:
    """FP16 counterpart of the classifier-free guidance residual combine.

    Returns:
        The same ``residual`` tensor that was passed in.
    """
    scale = float(beta)  # the op is always handed a Python float
    ops.cfg_combine_into_residual_fp16(residual, v_cond, v_uncond, scale)
    return residual
143
+
144
+
145
def teacher_force_first_frame_bf16(video_latent: torch.Tensor, cond_latent: torch.Tensor) -> torch.Tensor:
    """Write ``cond_latent`` over ``video_latent[:, :, 0, :, :]`` and return ``video_latent``."""
    kernel = ops.teacher_force_first_frame_bf16
    kernel(video_latent, cond_latent)
    return video_latent
150
+
151
+
152
def motus_decode_postprocess_bf16_to_fp32(
    decoded: torch.Tensor,
    *,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Drop the first frame and map BF16 decoded latents from [-1, 1] to [0, 1].

    Args:
        decoded: Input tensor, indexed here as ``(B, C, T_in, H, W)``.
        out: Optional destination. When omitted, a float32 tensor of shape
            ``(B, C, T_in - 1, H, W)`` is allocated on ``decoded.device``.

    Returns:
        The tensor the op wrote into (``out`` when it was supplied).

    Raises:
        RuntimeError: When ``out`` is omitted and ``decoded`` is not 5-D or
            has ``T_in < 2``. These are the conditions and messages used by
            the fake implementation registered for the op; checking them
            before allocating avoids an ``IndexError`` or negative-dimension
            error from the shape arithmetic.
    """
    if out is None:
        if decoded.dim() != 5:
            raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
        batch, channels, frames_in, height, width = decoded.shape
        if frames_in < 2:
            raise RuntimeError("decoded T_in must be >= 2")
        out = torch.empty(
            (batch, channels, frames_in - 1, height, width),
            device=decoded.device,
            dtype=torch.float32,
        )
    ops.motus_decode_postprocess_bf16_to_fp32(decoded, out)
    return out
167
+
168
+
169
def cast_bf16_to_fp32(src: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Cast a BF16 CUDA tensor to FP32.

    The result is written into ``out`` when supplied; otherwise a float32
    tensor is allocated with ``torch.empty_like(src, dtype=torch.float32)``.
    The tensor holding the result is returned.
    """
    dst = torch.empty_like(src, dtype=torch.float32) if out is None else out
    ops.cast_bf16_to_fp32(src, dst)
    return dst
176
+
177
+
178
+ __all__ = [
179
+ "add_bf16",
180
+ "cast_bf16_to_fp32",
181
+ "cfg_combine_into_residual_bf16",
182
+ "cfg_combine_into_residual_fp16",
183
+ "euler_step_bf16",
184
+ "motus_decode_postprocess_bf16_to_fp32",
185
+ "teacher_force_first_frame_bf16",
186
+ ]
build/torch211-cxx11-cu128-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59e58699cee217ce4eccccb5528e4cce71fc0c8ea41ba83a9d67136dae7b32fb
3
+ size 790144
build/torch211-cxx11-cu128-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _diffusion_step_ops_cuda_5596053
3
+ ops = torch.ops._diffusion_step_ops_cuda_5596053
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified as ``<namespace>::<op_name>`` for this build.
    """
    return "_diffusion_step_ops_cuda_5596053::{}".format(op_name)
build/torch211-cxx11-cu128-x86_64-linux/diffusion_step_ops/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Import the Python file at ``file_path`` under a path-derived module name.

    Args:
        file_path: Location of the source file to execute as a module.

    Returns:
        The executed module, which is also registered in ``sys.modules``.

    Raises:
        ImportError: If no import spec or loader can be built for the file.
        Exception: Whatever the module's own top-level code raises; in that
            case the module is removed from ``sys.modules`` again.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so the module is discoverable by name while
    # its own top-level code runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu128-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "diffusion-step-ops",
3
+ "id": "_diffusion_step_ops_cuda_5596053",
4
+ "version": 1,
5
+ "license": "Apache-2.0",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda",
9
+ "archs": [
10
+ "10.0",
11
+ "10.1",
12
+ "12.0+PTX",
13
+ "7.0",
14
+ "7.2",
15
+ "7.5",
16
+ "8.0",
17
+ "8.6",
18
+ "8.7",
19
+ "8.9",
20
+ "9.0"
21
+ ]
22
+ }
23
+ }
build/torch211-cxx11-cu130-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FlashRT diffusion step helper kernels."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+
9
+ from ._ops import add_op_namespace_prefix, ops
10
+
11
+
12
def _check_same_shape(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor | None = None) -> None:
    """Raise ``RuntimeError`` unless ``b`` (and ``c``, when given) has the shape of ``a``."""
    reference = a.shape
    if b.shape != reference:
        raise RuntimeError("input tensors must have the same shape")
    if c is None:
        return
    if c.shape != reference:
        raise RuntimeError("output tensor must have the same shape as inputs")
17
+
18
+
19
@torch.library.register_fake(add_op_namespace_prefix("add_bf16_out"))
def _add_bf16_out_fake(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor) -> None:
    """Fake implementation of ``add_bf16_out``: checks shapes only and returns nothing."""
    _check_same_shape(a, b, out)
23
+
24
+
25
@torch.library.register_fake(add_op_namespace_prefix("euler_step_bf16_out"))
def _euler_step_bf16_out_fake(
    latent: torch.Tensor,
    velocity: torch.Tensor,
    dt: float,
    out: torch.Tensor,
) -> None:
    """Fake implementation of ``euler_step_bf16_out``.

    Only verifies that ``latent``, ``velocity`` and ``out`` share a shape;
    ``dt`` is accepted but not inspected.
    """
    _check_same_shape(latent, velocity, out)
34
+
35
+
36
@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_bf16"))
def _cfg_combine_into_residual_bf16_fake(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> None:
    """Fake implementation of ``cfg_combine_into_residual_bf16``.

    Only verifies that the three tensors share a shape; ``beta`` is not
    inspected.
    """
    # v_uncond travels in the helper's third slot, so a mismatch on it is
    # reported with the helper's "output tensor" message.
    _check_same_shape(residual, v_cond, v_uncond)
45
+
46
+
47
@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_fp16"))
def _cfg_combine_into_residual_fp16_fake(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> None:
    """Fake implementation of ``cfg_combine_into_residual_fp16``.

    Only verifies that the three tensors share a shape; ``beta`` is not
    inspected.
    """
    # v_uncond travels in the helper's third slot, so a mismatch on it is
    # reported with the helper's "output tensor" message.
    _check_same_shape(residual, v_cond, v_uncond)
56
+
57
+
58
@torch.library.register_fake(add_op_namespace_prefix("teacher_force_first_frame_bf16"))
def _teacher_force_first_frame_bf16_fake(
    video_latent: torch.Tensor,
    cond_latent: torch.Tensor,
) -> None:
    """Fake implementation of ``teacher_force_first_frame_bf16``.

    Requires a 5-D ``video_latent`` and a ``cond_latent`` whose shape equals
    the video's dimensions 0, 1, 3 and 4.

    Raises:
        RuntimeError: If either shape requirement is violated.
    """
    if video_latent.dim() != 5:
        raise RuntimeError("video_latent must have shape (B, C, T, H, W)")
    batch, channels, _frames, height, width = video_latent.shape
    if cond_latent.shape != (batch, channels, height, width):
        raise RuntimeError("cond_latent must have shape (B, C, H, W)")
73
+
74
+
75
@torch.library.register_fake(add_op_namespace_prefix("motus_decode_postprocess_bf16_to_fp32"))
def _motus_decode_postprocess_bf16_to_fp32_fake(
    decoded: torch.Tensor,
    out: torch.Tensor,
) -> None:
    """Fake implementation of ``motus_decode_postprocess_bf16_to_fp32``.

    Requires a 5-D ``decoded`` with at least two entries along dimension 2
    and an ``out`` of the same shape except one entry shorter on that
    dimension.

    Raises:
        RuntimeError: If any of those requirements is violated.
    """
    if decoded.dim() != 5:
        raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
    batch, channels, frames_in, height, width = decoded.shape
    if frames_in < 2:
        raise RuntimeError("decoded T_in must be >= 2")
    if out.shape != (batch, channels, frames_in - 1, height, width):
        raise RuntimeError("out must have shape (B, C, T_in - 1, H, W)")
88
+
89
+
90
@torch.library.register_fake(add_op_namespace_prefix("cast_bf16_to_fp32"))
def _cast_bf16_to_fp32_fake(src: torch.Tensor, dst: torch.Tensor) -> None:
    """Fake implementation of ``cast_bf16_to_fp32``: ``src`` and ``dst`` must share a shape."""
    shapes_match = src.shape == dst.shape
    if not shapes_match:
        raise RuntimeError("src and dst must have the same shape")
95
+
96
+
97
def add_bf16(a: torch.Tensor, b: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Compute ``a + b`` for contiguous BF16 CUDA tensors.

    The sum is written into ``out`` when supplied; otherwise a tensor is
    allocated with ``torch.empty_like(a)``. The tensor holding the result is
    returned.
    """
    result = torch.empty_like(a) if out is None else out
    ops.add_bf16_out(a, b, result)
    return result
104
+
105
+
106
def euler_step_bf16(
    latent: torch.Tensor,
    velocity: torch.Tensor,
    dt: float,
    *,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Compute ``latent + velocity * dt`` for BF16 CUDA tensors.

    The result is written into ``out`` when supplied; otherwise a tensor is
    allocated with ``torch.empty_like(latent)``. The tensor holding the
    result is returned.
    """
    target = torch.empty_like(latent) if out is None else out
    step = float(dt)  # the op is always handed a Python float
    ops.euler_step_bf16_out(latent, velocity, step, target)
    return target
119
+
120
+
121
def cfg_combine_into_residual_bf16(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> torch.Tensor:
    """Apply ``residual += v_uncond + beta * (v_cond - v_uncond)`` in place.

    Returns:
        The same ``residual`` tensor that was passed in.
    """
    scale = float(beta)  # the op is always handed a Python float
    ops.cfg_combine_into_residual_bf16(residual, v_cond, v_uncond, scale)
    return residual
131
+
132
+
133
def cfg_combine_into_residual_fp16(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> torch.Tensor:
    """FP16 counterpart of the classifier-free guidance residual combine.

    Returns:
        The same ``residual`` tensor that was passed in.
    """
    scale = float(beta)  # the op is always handed a Python float
    ops.cfg_combine_into_residual_fp16(residual, v_cond, v_uncond, scale)
    return residual
143
+
144
+
145
def teacher_force_first_frame_bf16(video_latent: torch.Tensor, cond_latent: torch.Tensor) -> torch.Tensor:
    """Write ``cond_latent`` over ``video_latent[:, :, 0, :, :]`` and return ``video_latent``."""
    kernel = ops.teacher_force_first_frame_bf16
    kernel(video_latent, cond_latent)
    return video_latent
150
+
151
+
152
def motus_decode_postprocess_bf16_to_fp32(
    decoded: torch.Tensor,
    *,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Drop the first frame and map BF16 decoded latents from [-1, 1] to [0, 1].

    Args:
        decoded: Input tensor, indexed here as ``(B, C, T_in, H, W)``.
        out: Optional destination. When omitted, a float32 tensor of shape
            ``(B, C, T_in - 1, H, W)`` is allocated on ``decoded.device``.

    Returns:
        The tensor the op wrote into (``out`` when it was supplied).

    Raises:
        RuntimeError: When ``out`` is omitted and ``decoded`` is not 5-D or
            has ``T_in < 2``. These are the conditions and messages used by
            the fake implementation registered for the op; checking them
            before allocating avoids an ``IndexError`` or negative-dimension
            error from the shape arithmetic.
    """
    if out is None:
        if decoded.dim() != 5:
            raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
        batch, channels, frames_in, height, width = decoded.shape
        if frames_in < 2:
            raise RuntimeError("decoded T_in must be >= 2")
        out = torch.empty(
            (batch, channels, frames_in - 1, height, width),
            device=decoded.device,
            dtype=torch.float32,
        )
    ops.motus_decode_postprocess_bf16_to_fp32(decoded, out)
    return out
167
+
168
+
169
def cast_bf16_to_fp32(src: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Cast a BF16 CUDA tensor to FP32.

    The result is written into ``out`` when supplied; otherwise a float32
    tensor is allocated with ``torch.empty_like(src, dtype=torch.float32)``.
    The tensor holding the result is returned.
    """
    dst = torch.empty_like(src, dtype=torch.float32) if out is None else out
    ops.cast_bf16_to_fp32(src, dst)
    return dst
176
+
177
+
178
+ __all__ = [
179
+ "add_bf16",
180
+ "cast_bf16_to_fp32",
181
+ "cfg_combine_into_residual_bf16",
182
+ "cfg_combine_into_residual_fp16",
183
+ "euler_step_bf16",
184
+ "motus_decode_postprocess_bf16_to_fp32",
185
+ "teacher_force_first_frame_bf16",
186
+ ]
build/torch211-cxx11-cu130-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d50a92d138753c191513d2e5b824782cff1867d6bf496652a6dd120cd591d3c6
3
+ size 752272
build/torch211-cxx11-cu130-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _diffusion_step_ops_cuda_5596053
3
+ ops = torch.ops._diffusion_step_ops_cuda_5596053
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified as ``<namespace>::<op_name>`` for this build.
    """
    return "_diffusion_step_ops_cuda_5596053::{}".format(op_name)
build/torch211-cxx11-cu130-x86_64-linux/diffusion_step_ops/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Import the Python file at ``file_path`` under a path-derived module name.

    Args:
        file_path: Location of the source file to execute as a module.

    Returns:
        The executed module, which is also registered in ``sys.modules``.

    Raises:
        ImportError: If no import spec or loader can be built for the file.
        Exception: Whatever the module's own top-level code raises; in that
            case the module is removed from ``sys.modules`` again.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so the module is discoverable by name while
    # its own top-level code runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu130-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "diffusion-step-ops",
3
+ "id": "_diffusion_step_ops_cuda_5596053",
4
+ "version": 1,
5
+ "license": "Apache-2.0",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda",
9
+ "archs": [
10
+ "10.0",
11
+ "11.0",
12
+ "12.0",
13
+ "12.1+PTX",
14
+ "7.5",
15
+ "8.0",
16
+ "8.6",
17
+ "8.7",
18
+ "8.9",
19
+ "9.0"
20
+ ]
21
+ }
22
+ }
build/torch212-cxx11-cu130-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FlashRT diffusion step helper kernels."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+
9
+ from ._ops import add_op_namespace_prefix, ops
10
+
11
+
12
def _check_same_shape(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor | None = None) -> None:
    """Raise ``RuntimeError`` unless ``b`` (and ``c``, when given) has the shape of ``a``."""
    reference = a.shape
    if b.shape != reference:
        raise RuntimeError("input tensors must have the same shape")
    if c is None:
        return
    if c.shape != reference:
        raise RuntimeError("output tensor must have the same shape as inputs")
17
+
18
+
19
@torch.library.register_fake(add_op_namespace_prefix("add_bf16_out"))
def _add_bf16_out_fake(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor) -> None:
    """Fake implementation of ``add_bf16_out``: checks shapes only and returns nothing."""
    _check_same_shape(a, b, out)
23
+
24
+
25
@torch.library.register_fake(add_op_namespace_prefix("euler_step_bf16_out"))
def _euler_step_bf16_out_fake(
    latent: torch.Tensor,
    velocity: torch.Tensor,
    dt: float,
    out: torch.Tensor,
) -> None:
    """Fake implementation of ``euler_step_bf16_out``.

    Only verifies that ``latent``, ``velocity`` and ``out`` share a shape;
    ``dt`` is accepted but not inspected.
    """
    _check_same_shape(latent, velocity, out)
34
+
35
+
36
@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_bf16"))
def _cfg_combine_into_residual_bf16_fake(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> None:
    """Fake implementation of ``cfg_combine_into_residual_bf16``.

    Only verifies that the three tensors share a shape; ``beta`` is not
    inspected.
    """
    # v_uncond travels in the helper's third slot, so a mismatch on it is
    # reported with the helper's "output tensor" message.
    _check_same_shape(residual, v_cond, v_uncond)
45
+
46
+
47
@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_fp16"))
def _cfg_combine_into_residual_fp16_fake(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> None:
    """Fake implementation of ``cfg_combine_into_residual_fp16``.

    Only verifies that the three tensors share a shape; ``beta`` is not
    inspected.
    """
    # v_uncond travels in the helper's third slot, so a mismatch on it is
    # reported with the helper's "output tensor" message.
    _check_same_shape(residual, v_cond, v_uncond)
56
+
57
+
58
@torch.library.register_fake(add_op_namespace_prefix("teacher_force_first_frame_bf16"))
def _teacher_force_first_frame_bf16_fake(
    video_latent: torch.Tensor,
    cond_latent: torch.Tensor,
) -> None:
    """Fake implementation of ``teacher_force_first_frame_bf16``.

    Requires a 5-D ``video_latent`` and a ``cond_latent`` whose shape equals
    the video's dimensions 0, 1, 3 and 4.

    Raises:
        RuntimeError: If either shape requirement is violated.
    """
    if video_latent.dim() != 5:
        raise RuntimeError("video_latent must have shape (B, C, T, H, W)")
    batch, channels, _frames, height, width = video_latent.shape
    if cond_latent.shape != (batch, channels, height, width):
        raise RuntimeError("cond_latent must have shape (B, C, H, W)")
73
+
74
+
75
@torch.library.register_fake(add_op_namespace_prefix("motus_decode_postprocess_bf16_to_fp32"))
def _motus_decode_postprocess_bf16_to_fp32_fake(
    decoded: torch.Tensor,
    out: torch.Tensor,
) -> None:
    """Fake implementation of ``motus_decode_postprocess_bf16_to_fp32``.

    Requires a 5-D ``decoded`` with at least two entries along dimension 2
    and an ``out`` of the same shape except one entry shorter on that
    dimension.

    Raises:
        RuntimeError: If any of those requirements is violated.
    """
    if decoded.dim() != 5:
        raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
    batch, channels, frames_in, height, width = decoded.shape
    if frames_in < 2:
        raise RuntimeError("decoded T_in must be >= 2")
    if out.shape != (batch, channels, frames_in - 1, height, width):
        raise RuntimeError("out must have shape (B, C, T_in - 1, H, W)")
88
+
89
+
90
@torch.library.register_fake(add_op_namespace_prefix("cast_bf16_to_fp32"))
def _cast_bf16_to_fp32_fake(src: torch.Tensor, dst: torch.Tensor) -> None:
    """Fake implementation of ``cast_bf16_to_fp32``: ``src`` and ``dst`` must share a shape."""
    shapes_match = src.shape == dst.shape
    if not shapes_match:
        raise RuntimeError("src and dst must have the same shape")
95
+
96
+
97
def add_bf16(a: torch.Tensor, b: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Compute ``a + b`` for contiguous BF16 CUDA tensors.

    The sum is written into ``out`` when supplied; otherwise a tensor is
    allocated with ``torch.empty_like(a)``. The tensor holding the result is
    returned.
    """
    result = torch.empty_like(a) if out is None else out
    ops.add_bf16_out(a, b, result)
    return result
104
+
105
+
106
def euler_step_bf16(
    latent: torch.Tensor,
    velocity: torch.Tensor,
    dt: float,
    *,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Compute ``latent + velocity * dt`` for BF16 CUDA tensors.

    The result is written into ``out`` when supplied; otherwise a tensor is
    allocated with ``torch.empty_like(latent)``. The tensor holding the
    result is returned.
    """
    target = torch.empty_like(latent) if out is None else out
    step = float(dt)  # the op is always handed a Python float
    ops.euler_step_bf16_out(latent, velocity, step, target)
    return target
119
+
120
+
121
def cfg_combine_into_residual_bf16(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> torch.Tensor:
    """Apply ``residual += v_uncond + beta * (v_cond - v_uncond)`` in place.

    Returns:
        The same ``residual`` tensor that was passed in.
    """
    scale = float(beta)  # the op is always handed a Python float
    ops.cfg_combine_into_residual_bf16(residual, v_cond, v_uncond, scale)
    return residual
131
+
132
+
133
def cfg_combine_into_residual_fp16(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> torch.Tensor:
    """FP16 counterpart of the classifier-free guidance residual combine.

    Returns:
        The same ``residual`` tensor that was passed in.
    """
    scale = float(beta)  # the op is always handed a Python float
    ops.cfg_combine_into_residual_fp16(residual, v_cond, v_uncond, scale)
    return residual
143
+
144
+
145
def teacher_force_first_frame_bf16(video_latent: torch.Tensor, cond_latent: torch.Tensor) -> torch.Tensor:
    """Write ``cond_latent`` over ``video_latent[:, :, 0, :, :]`` and return ``video_latent``."""
    kernel = ops.teacher_force_first_frame_bf16
    kernel(video_latent, cond_latent)
    return video_latent
150
+
151
+
152
def motus_decode_postprocess_bf16_to_fp32(
    decoded: torch.Tensor,
    *,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Drop the first frame and map BF16 decoded latents from [-1, 1] to [0, 1].

    Args:
        decoded: Input tensor, indexed here as ``(B, C, T_in, H, W)``.
        out: Optional destination. When omitted, a float32 tensor of shape
            ``(B, C, T_in - 1, H, W)`` is allocated on ``decoded.device``.

    Returns:
        The tensor the op wrote into (``out`` when it was supplied).

    Raises:
        RuntimeError: When ``out`` is omitted and ``decoded`` is not 5-D or
            has ``T_in < 2``. These are the conditions and messages used by
            the fake implementation registered for the op; checking them
            before allocating avoids an ``IndexError`` or negative-dimension
            error from the shape arithmetic.
    """
    if out is None:
        if decoded.dim() != 5:
            raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
        batch, channels, frames_in, height, width = decoded.shape
        if frames_in < 2:
            raise RuntimeError("decoded T_in must be >= 2")
        out = torch.empty(
            (batch, channels, frames_in - 1, height, width),
            device=decoded.device,
            dtype=torch.float32,
        )
    ops.motus_decode_postprocess_bf16_to_fp32(decoded, out)
    return out
167
+
168
+
169
def cast_bf16_to_fp32(src: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Cast a BF16 CUDA tensor to FP32.

    The result is written into ``out`` when supplied; otherwise a float32
    tensor is allocated with ``torch.empty_like(src, dtype=torch.float32)``.
    The tensor holding the result is returned.
    """
    dst = torch.empty_like(src, dtype=torch.float32) if out is None else out
    ops.cast_bf16_to_fp32(src, dst)
    return dst
176
+
177
+
178
+ __all__ = [
179
+ "add_bf16",
180
+ "cast_bf16_to_fp32",
181
+ "cfg_combine_into_residual_bf16",
182
+ "cfg_combine_into_residual_fp16",
183
+ "euler_step_bf16",
184
+ "motus_decode_postprocess_bf16_to_fp32",
185
+ "teacher_force_first_frame_bf16",
186
+ ]
build/torch212-cxx11-cu130-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dad2459394957f4fe51171e3fbae0b2b97afb8e17da62e4db8a89a9083e6efd5
3
+ size 762920
build/torch212-cxx11-cu130-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _diffusion_step_ops_cuda_5596053
3
+ ops = torch.ops._diffusion_step_ops_cuda_5596053
4
+
5
def add_op_namespace_prefix(op_name: str):
    """
    Return ``op_name`` qualified as ``<namespace>::<op_name>`` for this build.
    """
    return "_diffusion_step_ops_cuda_5596053::{}".format(op_name)
build/torch212-cxx11-cu130-x86_64-linux/diffusion_step_ops/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Import the Python file at ``file_path`` under a path-derived module name.

    Args:
        file_path: Location of the source file to execute as a module.

    Returns:
        The executed module, which is also registered in ``sys.modules``.

    Raises:
        ImportError: If no import spec or loader can be built for the file.
        Exception: Whatever the module's own top-level code raises; in that
            case the module is removed from ``sys.modules`` again.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so the module is discoverable by name while
    # its own top-level code runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch212-cxx11-cu130-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "diffusion-step-ops",
3
+ "id": "_diffusion_step_ops_cuda_5596053",
4
+ "version": 1,
5
+ "license": "Apache-2.0",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda",
9
+ "archs": [
10
+ "10.0",
11
+ "11.0",
12
+ "12.0",
13
+ "12.1+PTX",
14
+ "7.5",
15
+ "8.0",
16
+ "8.6",
17
+ "8.7",
18
+ "8.9",
19
+ "9.0"
20
+ ]
21
+ }
22
+ }
build/torch212-cxx11-cu132-x86_64-linux/__init__.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FlashRT diffusion step helper kernels."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+
9
+ from ._ops import add_op_namespace_prefix, ops
10
+
11
+
12
+ def _check_same_shape(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor | None = None) -> None:
13
+ if a.shape != b.shape:
14
+ raise RuntimeError("input tensors must have the same shape")
15
+ if c is not None and a.shape != c.shape:
16
+ raise RuntimeError("output tensor must have the same shape as inputs")
17
+
18
+
19
@torch.library.register_fake(add_op_namespace_prefix("add_bf16_out"))
def _add_bf16_out_fake(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor) -> None:
    """Fake implementation of ``add_bf16_out``.

    Only checks that ``a``, ``b`` and ``out`` share one shape; it produces no
    value.
    """
    _check_same_shape(a, b, out)
23
+
24
+
25
@torch.library.register_fake(add_op_namespace_prefix("euler_step_bf16_out"))
def _euler_step_bf16_out_fake(
    latent: torch.Tensor,
    velocity: torch.Tensor,
    dt: float,
    out: torch.Tensor,
) -> None:
    """Fake implementation of ``euler_step_bf16_out``.

    Only checks that ``latent``, ``velocity`` and ``out`` share one shape.
    ``dt`` is accepted to mirror the op signature and is not inspected.
    """
    _check_same_shape(latent, velocity, out)
34
+
35
+
36
@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_bf16"))
def _cfg_combine_into_residual_bf16_fake(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> None:
    """Fake implementation of ``cfg_combine_into_residual_bf16``.

    Only checks that ``residual``, ``v_cond`` and ``v_uncond`` share one
    shape. ``beta`` is not inspected.
    """
    _check_same_shape(residual, v_cond, v_uncond)
45
+
46
+
47
@torch.library.register_fake(add_op_namespace_prefix("cfg_combine_into_residual_fp16"))
def _cfg_combine_into_residual_fp16_fake(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> None:
    """Fake implementation of ``cfg_combine_into_residual_fp16``.

    Only checks that ``residual``, ``v_cond`` and ``v_uncond`` share one
    shape. ``beta`` is not inspected.
    """
    _check_same_shape(residual, v_cond, v_uncond)
56
+
57
+
58
@torch.library.register_fake(add_op_namespace_prefix("teacher_force_first_frame_bf16"))
def _teacher_force_first_frame_bf16_fake(
    video_latent: torch.Tensor,
    cond_latent: torch.Tensor,
) -> None:
    """Fake implementation of ``teacher_force_first_frame_bf16``.

    Requires ``video_latent`` to be 5-D, ``(B, C, T, H, W)``, and
    ``cond_latent`` to have the matching ``(B, C, H, W)`` shape. Raises
    ``RuntimeError`` otherwise and produces no value.
    """
    if video_latent.dim() != 5:
        raise RuntimeError("video_latent must have shape (B, C, T, H, W)")
    batch, channels, _, height, width = video_latent.shape
    if cond_latent.shape != (batch, channels, height, width):
        raise RuntimeError("cond_latent must have shape (B, C, H, W)")
73
+
74
+
75
@torch.library.register_fake(add_op_namespace_prefix("motus_decode_postprocess_bf16_to_fp32"))
def _motus_decode_postprocess_bf16_to_fp32_fake(
    decoded: torch.Tensor,
    out: torch.Tensor,
) -> None:
    """Fake implementation of ``motus_decode_postprocess_bf16_to_fp32``.

    Requires ``decoded`` to be 5-D, ``(B, C, T_in, H, W)``, with at least two
    frames, and ``out`` to have shape ``(B, C, T_in - 1, H, W)``. Raises
    ``RuntimeError`` otherwise and produces no value.
    """
    if decoded.dim() != 5:
        raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
    batch, channels, frames_in, height, width = decoded.shape
    if frames_in < 2:
        raise RuntimeError("decoded T_in must be >= 2")
    if out.shape != (batch, channels, frames_in - 1, height, width):
        raise RuntimeError("out must have shape (B, C, T_in - 1, H, W)")
88
+
89
+
90
@torch.library.register_fake(add_op_namespace_prefix("cast_bf16_to_fp32"))
def _cast_bf16_to_fp32_fake(src: torch.Tensor, dst: torch.Tensor) -> None:
    """Fake implementation of ``cast_bf16_to_fp32``.

    Only checks that ``src`` and ``dst`` have the same shape; it produces no
    value.
    """
    source_shape, target_shape = src.shape, dst.shape
    if source_shape != target_shape:
        raise RuntimeError("src and dst must have the same shape")
95
+
96
+
97
def add_bf16(a: torch.Tensor, b: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Compute ``a + b`` for contiguous BF16 CUDA tensors.

    The sum is written into ``out`` when provided, otherwise into a new
    tensor created with ``torch.empty_like(a)``. The destination tensor is
    returned.
    """
    result = torch.empty_like(a) if out is None else out
    ops.add_bf16_out(a, b, result)
    return result
104
+
105
+
106
def euler_step_bf16(
    latent: torch.Tensor,
    velocity: torch.Tensor,
    dt: float,
    *,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Compute ``latent + velocity * dt`` for BF16 CUDA tensors.

    ``dt`` is converted with ``float()`` before being passed to the kernel.
    The result is written into ``out`` when provided, otherwise into a new
    tensor created with ``torch.empty_like(latent)``. The destination tensor
    is returned.
    """
    result = torch.empty_like(latent) if out is None else out
    step = float(dt)
    ops.euler_step_bf16_out(latent, velocity, step, result)
    return result
119
+
120
+
121
def cfg_combine_into_residual_bf16(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> torch.Tensor:
    """Apply ``residual += v_uncond + beta * (v_cond - v_uncond)`` in place.

    ``beta`` is converted with ``float()`` before the kernel call. The same
    ``residual`` tensor that was passed in is returned.
    """
    guidance = float(beta)
    ops.cfg_combine_into_residual_bf16(residual, v_cond, v_uncond, guidance)
    return residual
131
+
132
+
133
def cfg_combine_into_residual_fp16(
    residual: torch.Tensor,
    v_cond: torch.Tensor,
    v_uncond: torch.Tensor,
    beta: float,
) -> torch.Tensor:
    """FP16 variant of the classifier-free guidance residual combine.

    ``beta`` is converted with ``float()`` before the kernel call. The same
    ``residual`` tensor that was passed in is returned.
    """
    guidance = float(beta)
    ops.cfg_combine_into_residual_fp16(residual, v_cond, v_uncond, guidance)
    return residual
143
+
144
+
145
def teacher_force_first_frame_bf16(video_latent: torch.Tensor, cond_latent: torch.Tensor) -> torch.Tensor:
    """Overwrite ``video_latent[:, :, 0, :, :]`` with ``cond_latent[:, :, :, :]``.

    The update is delegated to the ``teacher_force_first_frame_bf16`` kernel,
    and the same ``video_latent`` tensor that was passed in is returned.
    """
    kernel = ops.teacher_force_first_frame_bf16
    kernel(video_latent, cond_latent)
    return video_latent
150
+
151
+
152
def motus_decode_postprocess_bf16_to_fp32(
    decoded: torch.Tensor,
    *,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Drop the first frame and map BF16 decoded latents from [-1, 1] to [0, 1].

    Args:
        decoded: Tensor of shape ``(B, C, T_in, H, W)`` with ``T_in >= 2``.
        out: Optional destination of shape ``(B, C, T_in - 1, H, W)``. When
            omitted, a float32 tensor of that shape is allocated on
            ``decoded``'s device.

    Returns:
        The destination tensor (``out`` when it was provided).

    Raises:
        RuntimeError: If ``decoded`` is not 5-D or has fewer than two frames.
    """
    # Validate before indexing the shape: otherwise a tensor of the wrong rank
    # fails with a bare IndexError, and T_in == 0 fails inside torch.empty with
    # a negative-dimension error. The messages match the fake implementation
    # registered for this op.
    if decoded.dim() != 5:
        raise RuntimeError("decoded must have shape (B, C, T_in, H, W)")
    batch, channels, frames_in, height, width = decoded.shape
    if frames_in < 2:
        raise RuntimeError("decoded T_in must be >= 2")
    if out is None:
        out = torch.empty(
            (batch, channels, frames_in - 1, height, width),
            device=decoded.device,
            dtype=torch.float32,
        )
    ops.motus_decode_postprocess_bf16_to_fp32(decoded, out)
    return out
167
+
168
+
169
def cast_bf16_to_fp32(src: torch.Tensor, *, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Convert a BF16 CUDA tensor to FP32.

    The converted values are written into ``out`` when provided, otherwise
    into a new float32 tensor created with ``torch.empty_like``. The
    destination tensor is returned.
    """
    dst = out if out is not None else torch.empty_like(src, dtype=torch.float32)
    ops.cast_bf16_to_fp32(src, dst)
    return dst
176
+
177
+
178
# Public API of this module: the user-facing wrapper functions defined above.
# The ``_*_fake`` registrations and ``_check_same_shape`` are not exported.
__all__ = [
    "add_bf16",
    "cast_bf16_to_fp32",
    "cfg_combine_into_residual_bf16",
    "cfg_combine_into_residual_fp16",
    "euler_step_bf16",
    "motus_decode_postprocess_bf16_to_fp32",
    "teacher_force_first_frame_bf16",
]
build/torch212-cxx11-cu132-x86_64-linux/_diffusion_step_ops_cuda_5596053.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c7471e5d670e1d71d2a45e34b081e1c73a6ea76b796c3a837e1f9767d2e4197
3
+ size 730152
build/torch212-cxx11-cu132-x86_64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _diffusion_step_ops_cuda_5596053
3
# Handle to the ``torch.ops`` namespace used for this build's kernels. It is
# the same namespace string that ``add_op_namespace_prefix`` prepends below;
# presumably the ops are registered when the compiled extension imported
# above is loaded -- TODO confirm.
ops = torch.ops._diffusion_step_ops_cuda_5596053
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Qualify ``op_name`` with this build's op namespace.

    The result has the form ``"<namespace>::<op_name>"``, where the namespace
    is the one bound to ``ops`` in this module.
    """
    return "{}::{}".format("_diffusion_step_ops_cuda_5596053", op_name)
build/torch212-cxx11-cu132-x86_64-linux/diffusion_step_ops/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
+ def _import_from_path(file_path: Path) -> ModuleType:
9
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
10
+ # it would also be used for other imports. So, we make a module name that
11
+ # depends on the path for it to be unique using the hex-encoded hash of
12
+ # the path.
13
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
+ module_name = path_hash
15
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
16
+ if spec is None:
17
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
+ module = importlib.util.module_from_spec(spec)
19
+ if module is None:
20
+ raise ImportError(f"Cannot load module {module_name} from spec")
21
+ sys.modules[module_name] = module
22
+ spec.loader.exec_module(module) # type: ignore
23
+ return module
24
+
25
+
26
# Re-export every name defined by the ``__init__.py`` one directory above this
# package, so importing this package exposes the same namespace.
globals().update(
    vars(
        _import_from_path(Path(__file__).parent.parent.joinpath("__init__.py"))
    )
)
build/torch212-cxx11-cu132-x86_64-linux/metadata.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "diffusion-step-ops",
3
+ "id": "_diffusion_step_ops_cuda_5596053",
4
+ "version": 1,
5
+ "license": "Apache-2.0",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda",
9
+ "archs": [
10
+ "10.0",
11
+ "11.0",
12
+ "12.0",
13
+ "12.1+PTX",
14
+ "7.5",
15
+ "8.0",
16
+ "8.6",
17
+ "8.7",
18
+ "8.9",
19
+ "9.0"
20
+ ]
21
+ }
22
+ }