yitongl commited on
Commit
1d0c0cc
·
verified ·
1 Parent(s): 383bb79

Add standalone inference helper for sfp4 checkpoint-700

Browse files
Files changed (34) hide show
  1. standalone_inference/README.md +74 -0
  2. standalone_inference/__pycache__/install_overlay.cpython-313.pyc +0 -0
  3. standalone_inference/__pycache__/run_inference.cpython-313.pyc +0 -0
  4. standalone_inference/install_overlay.py +89 -0
  5. standalone_inference/manifest.sha256 +31 -0
  6. standalone_inference/overlay_files/fastvideo-kernel/python/fastvideo_kernel/block_sparse_attn_ours_p.py +270 -0
  7. standalone_inference/overlay_files/fastvideo-kernel/python/fastvideo_kernel/triton_kernels/block_sparse_attn_triton_ours_p.py +1155 -0
  8. standalone_inference/overlay_files/fastvideo-kernel/python/fastvideo_kernel/triton_kernels/nvfp4_utils.py +250 -0
  9. standalone_inference/overlay_files/fastvideo-kernel/python/fastvideo_kernel/triton_kernels/quant_utils.py +80 -0
  10. standalone_inference/overlay_files/fastvideo/api/compat.py +503 -0
  11. standalone_inference/overlay_files/fastvideo/attention/backends/sparse_fp4_ours_p_attn.py +192 -0
  12. standalone_inference/overlay_files/fastvideo/attention/backends/video_sparse_attn.py +262 -0
  13. standalone_inference/overlay_files/fastvideo/configs/models/dits/base.py +79 -0
  14. standalone_inference/overlay_files/fastvideo/configs/pipelines/wan.py +203 -0
  15. standalone_inference/overlay_files/fastvideo/configs/sample/base.py +292 -0
  16. standalone_inference/overlay_files/fastvideo/configs/sample/wan.py +154 -0
  17. standalone_inference/overlay_files/fastvideo/configs/wan_1.3B_t2v_pipeline.json +40 -0
  18. standalone_inference/overlay_files/fastvideo/entrypoints/cli/generate.py +115 -0
  19. standalone_inference/overlay_files/fastvideo/entrypoints/video_generator.py +797 -0
  20. standalone_inference/overlay_files/fastvideo/fastvideo_args.py +1188 -0
  21. standalone_inference/overlay_files/fastvideo/forward_context.py +100 -0
  22. standalone_inference/overlay_files/fastvideo/pipelines/basic/wan/__init__.py +0 -0
  23. standalone_inference/overlay_files/fastvideo/pipelines/basic/wan/wan_pipeline.py +60 -0
  24. standalone_inference/overlay_files/fastvideo/pipelines/composed_pipeline_base.py +474 -0
  25. standalone_inference/overlay_files/fastvideo/pipelines/stages/denoising.py +1184 -0
  26. standalone_inference/overlay_files/fastvideo/platforms/cuda.py +440 -0
  27. standalone_inference/overlay_files/fastvideo/platforms/interface.py +255 -0
  28. standalone_inference/overlay_files/fastvideo/train/models/wan/wan.py +680 -0
  29. standalone_inference/overlay_files/fastvideo/training/training_pipeline.py +1044 -0
  30. standalone_inference/overlay_files/fastvideo/training/wan_training_pipeline.py +74 -0
  31. standalone_inference/requirements.txt +5 -0
  32. standalone_inference/run.sh +22 -0
  33. standalone_inference/run_inference.py +123 -0
  34. standalone_inference/training_attention_settings.json +62 -0
standalone_inference/README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Standalone Inference Helper
2
+
3
+ This folder contains a portable inference helper for:
4
+
5
+ `sfp4_v4_sparse09_hpo_on_ours_p_init2050_1n_interactive/checkpoint-700`
6
+
7
+ It is not a full vendored copy of Wan or FastVideo. It contains the sparse FP4
8
+ backend overlay and a runner that can be applied to a FastVideo checkout or
9
+ installation so the uploaded checkpoint can be used for normal inference.
10
+
11
+ ## Contents
12
+
13
+ - `run_inference.py`: downloads/loads `transformer/diffusion_pytorch_model.safetensors` from `yitongl/sparse_quant_exp` and runs `VideoGenerator`.
14
+ - `run.sh`: convenience wrapper that installs the overlay into `FASTVIDEO_ROOT` and then runs `run_inference.py`.
15
+ - `install_overlay.py`: copies the bundled sparse FP4 backend files into a FastVideo checkout/install.
16
+ - `overlay_files/`: exact runtime source files needed by `SPARSE_FP4_OURS_P_ATTN`.
17
+ - `training_attention_settings.json`: structured settings for the uploaded checkpoint.
18
+
19
+ ## Expected Environment
20
+
21
+ - A working FastVideo Python environment.
22
+ - FastVideo dependencies installed, including PyTorch, Triton, safetensors, and
23
+ Hugging Face Hub.
24
+ - Access to the base model `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`.
25
+ - A CUDA GPU supported by the custom Triton kernels.
26
+
27
+ ## Usage
28
+
29
+ From a machine with this HF repo downloaded:
30
+
31
+ ```bash
32
+ export FASTVIDEO_ROOT=/path/to/FastVideo
33
+ bash standalone_inference/run.sh \
34
+ --output-path outputs/sfp4_checkpoint_700 \
35
+ --seed 1000
36
+ ```
37
+
38
+ The script sets:
39
+
40
+ ```bash
41
+ FASTVIDEO_ATTENTION_BACKEND=SPARSE_FP4_OURS_P_ATTN
42
+ FASTVIDEO_SPARSE_FP4_USE_HIGH_PREC_O=1
43
+ ```
44
+
45
+ and downloads the uploaded checkpoint-700 transformer weights unless `--weights`
46
+ is provided.
47
+
48
+ To use a local safetensors file:
49
+
50
+ ```bash
51
+ export FASTVIDEO_ROOT=/path/to/FastVideo
52
+ bash standalone_inference/run.sh \
53
+ --weights /path/to/diffusion_pytorch_model.safetensors \
54
+ --prompt "your prompt"
55
+ ```
56
+
57
+ ## Attention Semantics
58
+
59
+ - Self-attention uses `SPARSE_FP4_OURS_P_ATTN`.
60
+ - Q/K/V use FP4 fake quantization with STE.
61
+ - VSA tile size is `4 x 4 x 4 = 64` tokens.
62
+ - Selected sparse tiles use group-local P quantization in the Triton kernel.
63
+ - Dropped tiles use tile mean compensation.
64
+ - Cross-attention falls back to dense SDPA and is not sparse/FP4.
65
+
66
+ ## Checkpoint
67
+
68
+ The current HF `main` transformer file is checkpoint-700:
69
+
70
+ `transformer/diffusion_pytorch_model.safetensors`
71
+
72
+ Local SHA256 used when preparing this helper:
73
+
74
+ `4595ca81ea7085c15ccf14b738aa9c0fdf2d2786641f49b55e0bc0e99bf042d2`
standalone_inference/__pycache__/install_overlay.cpython-313.pyc ADDED
Binary file (4.48 kB). View file
 
standalone_inference/__pycache__/run_inference.cpython-313.pyc ADDED
Binary file (6.22 kB). View file
 
standalone_inference/install_overlay.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Install the sparse FP4 checkpoint-700 inference overlay into FastVideo.
3
+
4
+ The checkpoint depends on local FastVideo attention backend changes that are
5
+ not part of a vanilla install. This helper copies the bundled overlay files
6
+ into a FastVideo source checkout or site-packages installation.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import importlib.util
13
+ import shutil
14
+ import sys
15
+ from pathlib import Path
16
+
17
+
18
+ def _find_fastvideo_root() -> Path:
19
+ spec = importlib.util.find_spec("fastvideo")
20
+ if spec is None or spec.origin is None:
21
+ raise RuntimeError(
22
+ "Could not import fastvideo. Pass --fastvideo-root explicitly or "
23
+ "activate a FastVideo environment first.")
24
+ return Path(spec.origin).resolve().parents[1]
25
+
26
+
27
+ def _iter_overlay_files(overlay_root: Path):
28
+ for path in sorted(overlay_root.rglob("*")):
29
+ if path.is_file() and "__pycache__" not in path.parts:
30
+ yield path
31
+
32
+
33
def main() -> int:
    """CLI entry point: copy the bundled overlay into a FastVideo tree.

    Parses --fastvideo-root/--backup/--dry-run, validates the bundle and
    the target, then copies every overlay file into place.

    Returns:
        0 on success.

    Raises:
        RuntimeError: if the overlay bundle is missing or the target does
            not look like a FastVideo root.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--fastvideo-root",
        type=Path,
        default=None,
        help="FastVideo repository/install root. Defaults to import location.",
    )
    parser.add_argument(
        "--backup",
        action="store_true",
        help="Write .sfp4_backup copies before overwriting existing files.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print files that would be copied without modifying anything.",
    )
    opts = parser.parse_args()

    here = Path(__file__).resolve().parent
    overlay_root = here / "overlay_files"
    if not overlay_root.is_dir():
        raise RuntimeError(f"Missing overlay directory: {overlay_root}")

    if opts.fastvideo_root:
        target_root = opts.fastvideo_root.resolve()
    else:
        target_root = _find_fastvideo_root()
    # Sanity check: a FastVideo root must contain the fastvideo/ package dir.
    if not (target_root / "fastvideo").exists():
        raise RuntimeError(
            f"{target_root} does not look like a FastVideo root: missing fastvideo/")

    copied = 0
    for src in _iter_overlay_files(overlay_root):
        rel = src.relative_to(overlay_root)
        dst = target_root / rel
        print(f"{rel}")
        if opts.dry_run:
            continue
        dst.parent.mkdir(parents=True, exist_ok=True)
        if opts.backup and dst.exists():
            # Keep the first backup only; never overwrite an earlier one.
            backup_path = dst.with_suffix(dst.suffix + ".sfp4_backup")
            if not backup_path.exists():
                shutil.copy2(dst, backup_path)
        shutil.copy2(src, dst)
        copied += 1

    if opts.dry_run:
        print(f"Dry run complete for target root: {target_root}")
    else:
        print(f"Installed {copied} files into {target_root}")
        print(
            "Use PYTHONPATH='<FastVideo>/fastvideo-kernel/python:"
            "<FastVideo>/fastvideo-kernel:$PYTHONPATH' when running inference.")
    return 0
86
+
87
+
88
if __name__ == "__main__":
    # Propagate main()'s integer status as the process exit code.
    sys.exit(main())
standalone_inference/manifest.sha256 ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fb13abe775d8acd0aa59ce47ebad40178e4f2604fd191b6b02c1e34dd1e95cc4 ./README.md
2
+ eb151afbefca213bbf1595e94b40547e1e431e850e6fc4cd187e506eb8e25b2d ./install_overlay.py
3
+ 9d1d8dc58aab529270fe31eb1735d6a1382c0c6d36fccca122a8dbffa1b714fd ./overlay_files/fastvideo-kernel/python/fastvideo_kernel/block_sparse_attn_ours_p.py
4
+ 211c7f0445fbe9488250f01fa83457c6620e83bd6f3877db791fd155de93c08b ./overlay_files/fastvideo-kernel/python/fastvideo_kernel/triton_kernels/block_sparse_attn_triton_ours_p.py
5
+ 3f3a407a88612ea17ad65e1b6b9cf6b7b02df56956d8301c4b13bffa92095016 ./overlay_files/fastvideo-kernel/python/fastvideo_kernel/triton_kernels/nvfp4_utils.py
6
+ 56f17c602dede53c7c3677058f81274681530f1b83c086d9d1d44c6b51feefbb ./overlay_files/fastvideo-kernel/python/fastvideo_kernel/triton_kernels/quant_utils.py
7
+ 58f4ac013e6755336212a7a6c9948b19dab0dafc00f4a3298591598df270cb39 ./overlay_files/fastvideo/api/compat.py
8
+ 2b821b0e2e7bdb3581be6312ebbece42380a6ee28a7a982f0cf2dc71fab849c8 ./overlay_files/fastvideo/attention/backends/sparse_fp4_ours_p_attn.py
9
+ a97adcc52d7558c49f418c09395fd1665e988ad290d2276b95f21dfca0f8eb7d ./overlay_files/fastvideo/attention/backends/video_sparse_attn.py
10
+ 79ef6f38ec0f5bfe16b2b98327ad2ccd15f3c863dd87fd03affc5dbdaa0a8224 ./overlay_files/fastvideo/configs/models/dits/base.py
11
+ 4bda44746a3626551ea9a9380d890f036087092fb99fce2d302642cce14a97ed ./overlay_files/fastvideo/configs/pipelines/wan.py
12
+ 5926e29a594db13b116922f131db50631bf8adbf90fe5cec00a5e2f446bfb4ca ./overlay_files/fastvideo/configs/sample/base.py
13
+ d99adcf607d982b38bbb5a70be60bf87f35d0e9f6f50752f3bceb68b34ce46c2 ./overlay_files/fastvideo/configs/sample/wan.py
14
+ 49775ce42fd9643c78d8fad4ab8248c1755c7f1524ad771cbd1863d76c513c38 ./overlay_files/fastvideo/configs/wan_1.3B_t2v_pipeline.json
15
+ ae2d8309472b09927da3e450dea52d9715dcabe5d6722fc2917130ae8d85adb4 ./overlay_files/fastvideo/entrypoints/cli/generate.py
16
+ d0466769626e7fd497376c544904d56ba62847745eb52527896d96b99d76ba03 ./overlay_files/fastvideo/entrypoints/video_generator.py
17
+ 73afe6b2ebe0f8cfe0a8ec762a7126161621ad97a64ebad628995f4a164b8b0e ./overlay_files/fastvideo/fastvideo_args.py
18
+ ddcab6f4fd33c9813840571b6bf83bbbcea164b564166951ed4301297db6cef0 ./overlay_files/fastvideo/forward_context.py
19
+ e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./overlay_files/fastvideo/pipelines/basic/wan/__init__.py
20
+ deac1e22530a6a41c501629f5e8fce47a7af4e008f321cc8a4d734c5120ef4fe ./overlay_files/fastvideo/pipelines/basic/wan/wan_pipeline.py
21
+ 8908223b3ff99cdb3206148a68a730c2a13d554a2fb1316db6f2f9672efac9e8 ./overlay_files/fastvideo/pipelines/composed_pipeline_base.py
22
+ 6cfd128e782b7787a27ddd28a5e2d50cb4b0e2e9425d51d9780f14c91e8206f0 ./overlay_files/fastvideo/pipelines/stages/denoising.py
23
+ 489388dbdd9e5e3ad24db3012bd9b108794509a9729891d7dd315a102abba828 ./overlay_files/fastvideo/platforms/cuda.py
24
+ c046b1914041b59254bcdfe577aed20d6f007a72632ea1fe1ae92fa678eca760 ./overlay_files/fastvideo/platforms/interface.py
25
+ 2456d39ca28019e12bb7ab007774e86348f0582a017bf0e6c91e2a01d654a1a0 ./overlay_files/fastvideo/train/models/wan/wan.py
26
+ bc46e84b732567de6c0325223405daecd1226c623e303be33c7be9b5b7fdec08 ./overlay_files/fastvideo/training/training_pipeline.py
27
+ 1d3898fa37e21029df6c37e05dc34ed7805a211c2f87de6642db890e5a8c6f2e ./overlay_files/fastvideo/training/wan_training_pipeline.py
28
+ 1b2addfcb414ab65e20034394ee21a8af9ada58220a680b67d3b4233a0952268 ./requirements.txt
29
+ 5087bb4ffe5721c41a12d92d8dfe439cd86aa1a5d3b3d259e30ad62711d95081 ./run.sh
30
+ b826c8b059a000af6054ec099c36742d01e6a329ee77bc5936ae7562e9428409 ./run_inference.py
31
+ 8ddeea65247d9fa31a4a8a2a5ce5abe068a911ff4d67871453555e1355af8ecf ./training_attention_settings.json
standalone_inference/overlay_files/fastvideo-kernel/python/fastvideo_kernel/block_sparse_attn_ours_p.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ import torch
6
+
7
+
8
+ def _use_high_prec_output_for_backward() -> bool:
9
+ value = os.environ.get("FASTVIDEO_SPARSE_FP4_USE_HIGH_PREC_O", "1")
10
+ return value.lower() not in ("0", "false", "no", "off")
11
+
12
+
13
+ def _map_to_index(block_map: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
14
+ if block_map.dim() == 3:
15
+ block_map = block_map.unsqueeze(0)
16
+ if block_map.dim() != 4:
17
+ raise ValueError(
18
+ f"block_map must be [B,H,Q,KV] or [H,Q,KV], got {tuple(block_map.shape)}"
19
+ )
20
+ if block_map.dtype != torch.bool:
21
+ block_map = block_map.to(torch.bool)
22
+ if not block_map.is_cuda:
23
+ raise RuntimeError("block_map must be a CUDA tensor.")
24
+
25
+ try:
26
+ from fastvideo_kernel.triton_kernels.index import map_to_index as triton_map_to_index
27
+ except Exception as e:
28
+ raise ImportError("Triton map_to_index is required for ours-P Sparse FP4.") from e
29
+ return triton_map_to_index(block_map)
30
+
31
+
32
@torch.library.custom_op(
    "fastvideo_kernel::block_sparse_attn_ours_p_triton",
    mutates_args=(),
    device_types="cuda",
)
def block_sparse_attn_ours_p_triton(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    block_map: torch.Tensor,
    variable_block_sizes: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Block-sparse ours-P attention forward, registered as a CUDA custom op.

    Args:
        q, k, v: attention inputs; made contiguous before the kernel call.
        block_map: boolean tile-selection map, converted to (index, count)
            lists by ``_map_to_index``.
        variable_block_sizes: per-KV-tile valid token counts.

    Returns:
        The ``(o, M, high_prec_o)`` triple from
        ``triton_block_sparse_attn_forward``: quantized-path output, per-row
        softmax statistics, and a higher-precision output copy.
    """
    q = q.contiguous()
    k = k.contiguous()
    v = v.contiguous()
    block_map = block_map.to(torch.bool)
    q2k_idx, q2k_num = _map_to_index(block_map)

    # Imported lazily so registering the op does not require Triton at
    # module import time.
    from fastvideo_kernel.triton_kernels.block_sparse_attn_triton_ours_p import (
        triton_block_sparse_attn_forward,
    )

    return triton_block_sparse_attn_forward(
        q, k, v, q2k_idx, q2k_num, variable_block_sizes, is_qat=True
    )
57
+
58
+
59
@torch.library.register_fake("fastvideo_kernel::block_sparse_attn_ours_p_triton")
def _block_sparse_attn_ours_p_triton_fake(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    block_map: torch.Tensor,
    variable_block_sizes: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Fake (meta) implementation: shape/dtype propagation only.

    Both outputs mirror ``q``; ``M`` holds one float32 value per query row,
    i.e. the leading three dims of ``q``.
    """
    o = torch.empty_like(q)
    high_prec_o = torch.empty_like(q)
    # M is always float32 regardless of the input dtype.
    M = torch.empty((q.shape[0], q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
    return o, M, high_prec_o
71
+
72
+
73
@torch.library.custom_op(
    "fastvideo_kernel::block_sparse_attn_ours_p_backward_triton",
    mutates_args=(),
    device_types="cuda",
)
def block_sparse_attn_ours_p_backward_triton(
    grad_output: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    o: torch.Tensor,
    M: torch.Tensor,
    block_map: torch.Tensor,
    variable_block_sizes: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Block-sparse ours-P attention backward, registered as a CUDA custom op.

    Args:
        grad_output: gradient w.r.t. the forward output; made contiguous.
        q, k, v: forward inputs.
        o: forward output selected for backward (quantized or high-precision,
            depending on the caller's setup-context choice).
        M: per-row softmax statistics from the forward pass.
        block_map: boolean tile-selection map; both the q->k and (transposed)
            k->q index lists are derived from it here.
        variable_block_sizes: per-KV-tile valid token counts.

    Returns:
        ``(dq, dk, dv)``.
    """
    grad_output = grad_output.contiguous()
    block_map = block_map.to(torch.bool)
    q2k_idx, q2k_num = _map_to_index(block_map)
    # The transposed map drives the dK/dV side of the backward kernel.
    k2q_idx, k2q_num = _map_to_index(block_map.transpose(-1, -2).contiguous())

    from fastvideo_kernel.triton_kernels.block_sparse_attn_triton_ours_p import (
        triton_block_sparse_attn_backward,
    )

    return triton_block_sparse_attn_backward(
        grad_output,
        q,
        k,
        v,
        o,
        M,
        q2k_idx,
        q2k_num,
        k2q_idx,
        k2q_num,
        variable_block_sizes,
        is_qat=True,
    )
111
+
112
+
113
@torch.library.register_fake(
    "fastvideo_kernel::block_sparse_attn_ours_p_backward_triton"
)
def _block_sparse_attn_ours_p_backward_triton_fake(
    grad_output: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    o: torch.Tensor,
    M: torch.Tensor,
    block_map: torch.Tensor,
    variable_block_sizes: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Fake (meta) implementation: gradients mirror q/k/v shapes and dtypes."""
    return torch.empty_like(q), torch.empty_like(k), torch.empty_like(v)
127
+
128
+
129
def _backward_triton(ctx, grad_o, grad_M, grad_high_prec_o):
    """Autograd backward hook for the registered forward op.

    Only ``grad_o`` is propagated; ``grad_M`` and ``grad_high_prec_o`` are
    ignored. The trailing two ``None``s correspond to ``block_map`` and
    ``variable_block_sizes``, which receive no gradient.
    """
    q, k, v, o_for_bwd, M, block_map, variable_block_sizes = ctx.saved_tensors
    dq, dk, dv = block_sparse_attn_ours_p_backward_triton(
        grad_o, q, k, v, o_for_bwd, M, block_map, variable_block_sizes
    )
    return dq, dk, dv, None, None
135
+
136
+
137
def _setup_context_triton(ctx, inputs, output):
    """Save forward tensors for backward.

    Depending on FASTVIDEO_SPARSE_FP4_USE_HIGH_PREC_O, backward sees either
    the quantized output ``o`` or the high-precision output.
    """
    q, k, v, block_map, variable_block_sizes = inputs
    o, M, high_prec_o = output
    o_for_bwd = high_prec_o if _use_high_prec_output_for_backward() else o
    ctx.save_for_backward(q, k, v, o_for_bwd, M, block_map, variable_block_sizes)
142
+
143
+
144
# Wire autograd onto the custom op: _setup_context_triton picks which output
# tensor backward sees; _backward_triton produces dq/dk/dv.
block_sparse_attn_ours_p_triton.register_autograd(
    _backward_triton, setup_context=_setup_context_triton
)
147
+
148
+
149
class _BlockSparseAttnOursPTileComp(torch.autograd.Function):
    """Block-sparse ours-P attention with mean compensation for dropped tiles.

    Unlike the plain custom-op path, this variant also takes per-tile mean
    vectors (q_mean/k_mean/v_mean) so that KV tiles excluded by ``block_map``
    still contribute an approximate term, in both forward and backward.
    """

    @staticmethod
    def forward(ctx, q, k, v, q_mean, k_mean, v_mean, block_map, variable_block_sizes):
        """Run the forward kernel with both selected and dropped tile lists.

        Returns ``(o, M)``; the high-precision output is kept only for
        backward (when enabled via the environment flag).
        """
        # Contiguity is required because the Triton kernels index with raw
        # strides.
        q = q.contiguous()
        k = k.contiguous()
        v = v.contiguous()
        q_mean = q_mean.contiguous()
        k_mean = k_mean.contiguous()
        v_mean = v_mean.contiguous()
        block_map = block_map.to(torch.bool)
        # Dropped tiles are exactly the complement of the selected map.
        dropped_block_map = torch.logical_not(block_map)

        q2k_idx, q2k_num = _map_to_index(block_map)
        dropped_q2k_idx, dropped_q2k_num = _map_to_index(dropped_block_map)

        from fastvideo_kernel.triton_kernels.block_sparse_attn_triton_ours_p import (
            triton_block_sparse_attn_forward,
        )

        o, M, high_prec_o = triton_block_sparse_attn_forward(
            q,
            k,
            v,
            q2k_idx,
            q2k_num,
            variable_block_sizes,
            is_qat=True,
            q_mean=q_mean,
            k_mean=k_mean,
            v_mean=v_mean,
            dropped_q2k_index=dropped_q2k_idx,
            dropped_q2k_num=dropped_q2k_num,
        )
        o_for_bwd = high_prec_o if _use_high_prec_output_for_backward() else o
        # NOTE: backward() unpacks these in exactly this order — keep in sync.
        ctx.save_for_backward(
            q,
            k,
            v,
            q_mean,
            k_mean,
            v_mean,
            o_for_bwd,
            M,
            block_map,
            dropped_block_map,
            variable_block_sizes,
        )
        return o, M

    @staticmethod
    def backward(ctx, grad_o, grad_M):
        """Propagate ``grad_o`` to q/k/v; ``grad_M`` is ignored.

        The mean tensors and the maps get no gradient (the final five
        ``None``s) — presumably the means are computed outside the autograd
        graph; verify against the caller if that ever changes.
        """
        (
            q,
            k,
            v,
            q_mean,
            k_mean,
            v_mean,
            o_for_bwd,
            M,
            block_map,
            dropped_block_map,
            variable_block_sizes,
        ) = ctx.saved_tensors

        # Index lists are recomputed here rather than saved, trading a small
        # amount of work for less saved-tensor memory.
        q2k_idx, q2k_num = _map_to_index(block_map)
        k2q_idx, k2q_num = _map_to_index(block_map.transpose(-1, -2).contiguous())
        dropped_q2k_idx, dropped_q2k_num = _map_to_index(dropped_block_map)
        dropped_k2q_idx, dropped_k2q_num = _map_to_index(
            dropped_block_map.transpose(-1, -2).contiguous()
        )

        from fastvideo_kernel.triton_kernels.block_sparse_attn_triton_ours_p import (
            triton_block_sparse_attn_backward,
        )

        dq, dk, dv = triton_block_sparse_attn_backward(
            grad_o.contiguous(),
            q,
            k,
            v,
            o_for_bwd,
            M,
            q2k_idx,
            q2k_num,
            k2q_idx,
            k2q_num,
            variable_block_sizes,
            is_qat=True,
            q_mean=q_mean,
            k_mean=k_mean,
            v_mean=v_mean,
            dropped_q2k_index=dropped_q2k_idx,
            dropped_q2k_num=dropped_q2k_num,
            dropped_k2q_index=dropped_k2q_idx,
            dropped_k2q_num=dropped_k2q_num,
        )
        return dq, dk, dv, None, None, None, None, None
248
+
249
+
250
def block_sparse_attn_ours_p(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    block_map: torch.Tensor,
    variable_block_sizes: torch.Tensor,
    q_mean: torch.Tensor | None = None,
    k_mean: torch.Tensor | None = None,
    v_mean: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Dispatch block-sparse ours-P attention.

    When all three mean tensors are given, the tile-compensation autograd
    path is used; when none are given, the plain registered custom op runs.
    Supplying only some of the means is an error.

    Returns:
        ``(o, M)`` — attention output and per-row softmax statistics.

    Raises:
        ValueError: if the mean tensors are only partially provided.
    """
    means = (q_mean, k_mean, v_mean)
    if any(m is not None for m in means):
        if not all(m is not None for m in means):
            raise ValueError("q_mean, k_mean, and v_mean must be provided together")
        return _BlockSparseAttnOursPTileComp.apply(
            q, k, v, q_mean, k_mean, v_mean, block_map, variable_block_sizes
        )

    out, stats, _ = block_sparse_attn_ours_p_triton(
        q, k, v, block_map, variable_block_sizes
    )
    return out, stats
standalone_inference/overlay_files/fastvideo-kernel/python/fastvideo_kernel/triton_kernels/block_sparse_attn_triton_ours_p.py ADDED
@@ -0,0 +1,1155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fused Attention
3
+ ===============
4
+
5
+ This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao
6
+ (https://tridao.me/publications/flash2/flash2.pdf)
7
+
8
+ Credits: OpenAI kernel team
9
+ """
10
+
11
+ import torch
12
+ import triton
13
+ import triton.language as tl
14
+ from .quant_utils import fake_quantize
15
+
16
+ # ──────────────────────────── SPARSE ADDITION BEGIN ───────────────────────────
17
+ import math # small utility needed by the sparse wrapper
18
+ # ──────────────────────────── SPARSE ADDITION END ─────────────────────────────
19
+
20
+ # We don't run auto-tuning every time to keep the tutorial fast. Keeping
21
+ # the code below and commenting out the equivalent parameters is convenient for
22
+ # re-tuning.
23
# Fixed autotune search space: 64x64 tiles with a handful of stage/warp
# variants. Deliberately small; widen these lists when re-tuning.
configs = [
    triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN}, num_stages=s, num_warps=w) \
    for BM in [64]\
    for BN in [64]\
    for s in [3, 4, 7]\
    for w in [4, 8]\
]
30
+
31
+
32
+ # ──────────────────────────── SPARSE ADDITION BEGIN ───────────────────────────
33
@triton.autotune(configs, key=["N_CTX_Q", "HEAD_DIM"])
@triton.jit
def _attn_fwd_sparse(
    Q,
    K,
    V,
    QMean,
    KMean,
    VMean,
    sm_scale,  # softmax scale applied to QK^T (multiplied by 1/ln2 below)
    q2k_index,
    q2k_num,
    max_kv_blks,  # row stride of q2k_index
    dropped_q2k_index,
    dropped_q2k_num,
    max_dropped_kv_blks,  # row stride of dropped_q2k_index
    variable_block_sizes,
    M,
    Out,  # quantized-path output
    HighPrecOut,  # high-precision output (same layout as Out)
    stride_qz,
    stride_qh,
    stride_qm,
    stride_qk,
    stride_kz,
    stride_kh,
    stride_kn,
    stride_kk,
    stride_vz,
    stride_vh,
    stride_vk,
    stride_vn,
    stride_oz,
    stride_oh,
    stride_om,
    stride_on,
    Z,
    H,
    N_CTX_Q,
    N_CTX_KV,
    HEAD_DIM: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    STAGE: tl.constexpr,
    IS_QAT: tl.constexpr = False,
    USE_TILE_COMP: tl.constexpr = False):
    """
    64x64 block-sparse forward kernel for the independent "ours P quant" path.

    P quantization is group-local: each selected KV tile quantizes
    exp2(logit - tile_row_max), then applies exp2(tile_row_max - online_max)
    after the FP4 PV GEMM. This intentionally differs from the QAT-style
    backend, which quantizes exp2(logit - online_max) directly.

    Each program handles one BLOCK_M query tile of one (batch, head) pair
    and streams its selected KV tiles with an online-softmax accumulator;
    when USE_TILE_COMP is on, dropped tiles contribute a mean-based term.
    """

    # ----- program-id mapping -----
    q_blk = tl.program_id(0)  # Q-tile index
    off_hz = tl.program_id(1)  # fused (batch, head)
    b = off_hz // H
    h = off_hz % H
    q_tiles = N_CTX_Q // BLOCK_M
    # Flat row index into the [B, H, q_tiles, *] metadata tensors.
    meta_base = ((b * H + h) * q_tiles + q_blk)

    kv_blocks = tl.load(q2k_num + meta_base)  # int32
    kv_ptr = q2k_index + meta_base * max_kv_blks  # ptr to list
    dropped_kv_blocks = tl.load(dropped_q2k_num + meta_base)
    dropped_kv_ptr = dropped_q2k_index + meta_base * max_dropped_kv_blks

    # ----- base pointers -----
    # int64 offsets avoid 32-bit overflow for large batch*head extents.
    q_off = (b.to(tl.int64) * stride_qz + h.to(tl.int64) * stride_qh)
    k_off = (b.to(tl.int64) * stride_kz + h.to(tl.int64) * stride_kh)
    v_off = (b.to(tl.int64) * stride_vz + h.to(tl.int64) * stride_vh)
    o_off = (b.to(tl.int64) * stride_oz + h.to(tl.int64) * stride_oh)

    Q_ptr = tl.make_block_ptr(base=Q + q_off,
                              shape=(N_CTX_Q, HEAD_DIM),
                              strides=(stride_qm, stride_qk),
                              offsets=(q_blk * BLOCK_M, 0),
                              block_shape=(BLOCK_M, HEAD_DIM),
                              order=(1, 0))

    # K is addressed transposed (HEAD_DIM x N_CTX_KV) for the QK^T dot.
    K_base = tl.make_block_ptr(base=K + k_off,
                               shape=(HEAD_DIM, N_CTX_KV),
                               strides=(stride_kk, stride_kn),
                               offsets=(0, 0),
                               block_shape=(HEAD_DIM, BLOCK_N),
                               order=(0, 1))

    v_order: tl.constexpr = (0, 1) if V.dtype.element_ty == tl.float8e5 else (1, 0)
    V_base = tl.make_block_ptr(base=V + v_off,
                               shape=(N_CTX_KV, HEAD_DIM),
                               strides=(stride_vk, stride_vn),
                               offsets=(0, 0),
                               block_shape=(BLOCK_N, HEAD_DIM),
                               order=v_order)

    O_ptr = tl.make_block_ptr(base=Out + o_off,
                              shape=(N_CTX_Q, HEAD_DIM),
                              strides=(stride_om, stride_on),
                              offsets=(q_blk * BLOCK_M, 0),
                              block_shape=(BLOCK_M, HEAD_DIM),
                              order=(1, 0))
    HPO_ptr = tl.make_block_ptr(base=HighPrecOut + o_off,
                                shape=(N_CTX_Q, HEAD_DIM),
                                strides=(stride_om, stride_on),
                                offsets=(q_blk * BLOCK_M, 0),
                                block_shape=(BLOCK_M, HEAD_DIM),
                                order=(1, 0))

    # ----- accumulators -----
    offs_m = q_blk * BLOCK_M + tl.arange(0, BLOCK_M)
    m_i = tl.full([BLOCK_M], -float("inf"), tl.float32)  # online row max
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0  # online row sum
    acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)
    high_prec_acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)
    qk_scale = sm_scale * 1.44269504  # 1/ln2: work in base-2 exponentials
    q = tl.load(Q_ptr)
    offs_d = tl.arange(0, HEAD_DIM)

    # ----- sparse loop over valid K/V tiles -----
    for i in range(0, kv_blocks):
        kv_idx = tl.load(kv_ptr + i).to(tl.int32)
        block_size = tl.load(variable_block_sizes + kv_idx)
        K_ptr = tl.advance(K_base, (0, kv_idx * BLOCK_N))
        V_ptr = tl.advance(V_base, (kv_idx * BLOCK_N, 0))

        k = tl.load(K_ptr)
        mask = tl.arange(0, BLOCK_N) < block_size
        qk = tl.dot(q, k) * qk_scale
        # mask out invalid columns
        qk = tl.where(mask[None, :], qk, -float("inf"))
        group_m = tl.max(qk, 1)  # per-row max within this tile only
        m_ij = tl.maximum(m_i, group_m)

        # Group-local P: normalize by the tile's own row max before
        # quantization; the exp2(group_m - m_ij) correction is applied
        # after the PV dot (p_comp).
        p_local = tl.math.exp2(qk - group_m[:, None])
        p_local = tl.where(mask[None, :], p_local, 0.0)
        p_comp = tl.math.exp2(group_m - m_ij)
        p_valid = mask[None, :] & (
            tl.full(shape=p_local.shape, value=1.0,
                    dtype=p_local.dtype) == 1.0
        )
        p_quant, high_prec_p = fake_quantize(
            src_tensor=p_local, valid_src_mask=p_valid,
            BLOCK_SIZE_OUT_DIM=BLOCK_M, BLOCK_SIZE_QUANT_DIM=BLOCK_N,
            dst_dtype=tl.bfloat16, use_global_sf=False,
        )
        # Row sum uses the high-precision probabilities.
        l_ij = tl.sum(high_prec_p, 1) * p_comp

        alpha = tl.math.exp2(m_i - m_ij)  # rescale previous accumulators
        l_i = l_i * alpha + l_ij
        acc = acc * alpha[:, None]
        high_prec_acc = high_prec_acc * alpha[:, None]

        v = tl.load(V_ptr)
        acc = acc + tl.dot(
            p_quant.to(tl.bfloat16),
            v.to(tl.bfloat16),
        ) * p_comp[:, None]
        high_prec_acc = high_prec_acc + tl.dot(
            high_prec_p.to(tl.bfloat16),
            v.to(tl.bfloat16),
        ) * p_comp[:, None]
        m_i = m_ij

    if USE_TILE_COMP:
        # Dropped tiles: approximate each with its mean vectors.
        q_mean_base = (off_hz * q_tiles + q_blk).to(tl.int64) * HEAD_DIM
        q_mean = tl.load(QMean + q_mean_base + offs_d).to(tl.float32)
        kv_tiles = N_CTX_KV // BLOCK_N

        for i in range(0, dropped_kv_blocks):
            kv_idx = tl.load(dropped_kv_ptr + i).to(tl.int32)
            block_size = tl.load(variable_block_sizes + kv_idx).to(tl.float32)
            kv_mean_base = (off_hz * kv_tiles + kv_idx).to(tl.int64) * HEAD_DIM
            k_mean = tl.load(KMean + kv_mean_base + offs_d).to(tl.float32)
            v_mean = tl.load(VMean + kv_mean_base + offs_d).to(tl.float32)

            # One scalar logit per dropped tile, weighted by its token count.
            score = tl.sum(q_mean * k_mean, axis=0) * qk_scale
            m_ij = tl.maximum(m_i, score)
            alpha = tl.math.exp2(m_i - m_ij)
            beta = tl.math.exp2(score - m_ij)

            l_i = l_i * alpha + block_size * beta
            comp = (block_size * beta)[:, None] * v_mean[None, :]
            acc = acc * alpha[:, None] + comp
            high_prec_acc = high_prec_acc * alpha[:, None] + comp
            m_i = m_ij

    # ----- epilogue -----
    # Fold the normalizer into M (log-sum-exp form) and normalize outputs.
    m_i += tl.math.log2(l_i)
    acc = acc / l_i[:, None]
    high_prec_acc = high_prec_acc / l_i[:, None]
    tl.store(M + off_hz * N_CTX_Q + offs_m, m_i)
    tl.store(O_ptr, acc.to(Out.type.element_ty))
    tl.store(HPO_ptr, high_prec_acc.to(HighPrecOut.type.element_ty))
+ tl.store(HPO_ptr, high_prec_acc.to(HighPrecOut.type.element_ty))
228
+
229
+
230
+ # ──────────────────────────── SPARSE ADDITION END ─────────────────────────────
231
+
232
+
233
@triton.jit
def _attn_bwd_preprocess(
        O,
        DO,  #
        Delta,  #
        Z,
        H,
        N_CTX,  #
        BLOCK_M: tl.constexpr,
        HEAD_DIM: tl.constexpr  #
):
    """Compute Delta[row] = sum_d(O[row, d] * dO[row, d]) for one Q tile.

    Delta is the per-row dot product of the forward output and its gradient,
    consumed later by the dK/dV and dQ kernels (as the `D` argument).
    Grid: (N_CTX // BLOCK_M, B * H). O/DO/Delta are assumed contiguous with
    layout (batch*head, token, dim) flattened as used in the index math below.
    NOTE: Z and H are accepted for signature compatibility but unused here.
    """
    off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    off_hz = tl.program_id(1)
    off_n = tl.arange(0, HEAD_DIM)
    # Load one (BLOCK_M, HEAD_DIM) tile of O and dO for this (batch, head).
    o = tl.load(O + off_hz * HEAD_DIM * N_CTX + off_m[:, None] * HEAD_DIM +
                off_n[None, :])
    do = tl.load(DO + off_hz * HEAD_DIM * N_CTX + off_m[:, None] * HEAD_DIM +
                 off_n[None, :]).to(tl.float32)
    # Row-wise reduction over the head dimension.
    delta = tl.sum(o * do, axis=1)
    # Write one Delta value per query row.
    tl.store(Delta + off_hz * N_CTX + off_m, delta)
255
+
256
+
257
# The main inner-loop logic for computing dK and dV.
@triton.jit
def _attn_bwd_dkdv(
        dk,
        dv,  #
        Q,
        k,
        v,
        QMean,
        KMean,
        VMean,
        sm_scale,  #
        DO,  #
        M,
        D,  #
        k2q_index,
        k2q_num,
        max_q_blks,
        dropped_k2q_index,
        dropped_k2q_num,
        max_dropped_q_blks,
        variable_block_sizes,
        # shared by Q/K/V/DO.
        stride_tok,
        stride_d,  #
        H,
        N_CTX_KV,
        BLOCK_M1: tl.constexpr,  #
        BLOCK_N1: tl.constexpr,  #
        HEAD_DIM: tl.constexpr,  #
        # Filled in by the wrapper.
        start_n,
        start_m,
        num_steps,
        IS_QAT: tl.constexpr = False,
        USE_TILE_COMP: tl.constexpr = False):
    """Accumulate dK/dV for one KV tile over its list of contributing Q tiles.

    The sparsity metadata (k2q_index/k2q_num) lists, per (batch, head, kv
    tile), which 64-token Q tiles attended to this KV tile; each is processed
    as two BLOCK_M1-row halves. When USE_TILE_COMP is set, a second loop adds
    a mean-based correction for Q tiles that were dropped from the sparse set.
    `M` holds the forward per-row softmax statistics (base-2 log-sum-exp) and
    `D` the per-row O·dO dot products from the preprocess kernel.
    NOTE(review): IS_QAT and num_steps are accepted but unused in this body.
    Returns the updated (dk, dv) accumulators.
    """
    offs_m = start_m + tl.arange(0, BLOCK_M1)
    offs_n = start_n + tl.arange(0, BLOCK_N1)
    offs_k = tl.arange(0, HEAD_DIM)
    # Q is loaded transposed (HEAD_DIM, BLOCK_M1) so qkT = k @ qT directly.
    qT_ptrs = Q + offs_m[None, :] * stride_tok + offs_k[:, None] * stride_d
    do_ptrs = DO + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d
    # BLOCK_N1 must be a multiple of BLOCK_M1, otherwise the code wouldn't work.
    tl.static_assert(BLOCK_N1 % BLOCK_M1 == 0)
    step_m = BLOCK_M1
    kv_blk = tl.program_id(0)  # KV-tile index (was mislabeled "Q-tile index")
    off_hz = tl.program_id(2)  # fused (batch, head)
    b = off_hz // H
    h = off_hz % H
    kv_tiles = N_CTX_KV // BLOCK_N1
    meta_base = ((b * H + h) * kv_tiles + kv_blk)

    q_blocks = tl.load(k2q_num + meta_base)  # int32 count of kept Q tiles
    q_ptr = k2q_index + meta_base * max_q_blks  # ptr to list
    dropped_q_blocks = tl.load(dropped_k2q_num + meta_base)
    dropped_q_ptr = dropped_k2q_index + meta_base * max_dropped_q_blks
    # Number of valid tokens inside this 64-token KV tile.
    block_size = tl.load(variable_block_sizes + kv_blk)
    block_size_f = block_size.to(tl.float32)

    # Each listed 64-token Q tile is visited as two BLOCK_M1-row halves.
    for blk_idx in range(q_blocks * 2):
        block_sparse_offset = (tl.load(q_ptr + blk_idx // 2).to(tl.int32) * 2 +
                               blk_idx % 2) * step_m
        qT = tl.load(qT_ptrs + block_sparse_offset * stride_tok)
        # Load m before computing qk to reduce pipeline stall.
        offs_m = start_m + block_sparse_offset + tl.arange(0, BLOCK_M1)
        m = tl.load(M + offs_m)
        qkT = tl.dot(k.to(tl.bfloat16), qT)
        # 1.44269504 = 1/ln(2): rescale so exp2 implements exp(sm_scale * qk).
        qkT = qkT * sm_scale * 1.44269504
        # Mask out padding rows beyond this KV tile's valid token count.
        mask = tl.arange(0, BLOCK_N1) < block_size
        qkT = tl.where(mask[:, None], qkT, -float("inf"))
        group_m = tl.max(qkT, 0)
        pT = tl.math.exp2(qkT - m[None, :])
        pT = tl.where(mask[:, None], pT, 0.0)

        do = tl.load(do_ptrs + block_sparse_offset * stride_tok)
        # Compute dV with group-local P quantization:
        # quantize exp2(logit - tile_col_max), then multiply dO by
        # exp2(tile_col_max - final_lse) to recover the final softmax scale.
        p_local_T = tl.math.exp2(qkT - group_m[None, :])
        p_local_T = tl.where(mask[:, None], p_local_T, 0.0)
        p_comp = tl.math.exp2(group_m - m)
        p_for_quant = tl.trans(p_local_T)
        p_valid = mask[None, :] & (
            tl.full(
                shape=p_for_quant.shape,
                value=1.0,
                dtype=p_for_quant.dtype,
            ) == 1.0
        )
        # fake_quantize is defined elsewhere in this module; presumably it
        # mirrors the forward pass's quantized-P path — confirm against its
        # definition.
        p_quant, _ = fake_quantize(
            src_tensor=p_for_quant, valid_src_mask=p_valid,
            BLOCK_SIZE_OUT_DIM=BLOCK_M1, BLOCK_SIZE_QUANT_DIM=BLOCK_N1,
            dst_dtype=p_for_quant.dtype, use_global_sf=False,
        )
        dv += tl.dot(
            tl.trans(p_quant.to(tl.bfloat16)),
            (do * p_comp[:, None]).to(tl.bfloat16),
        )
        # D (= delta) is pre-divided by ds_scale.
        Di = tl.load(D + offs_m)
        # Compute dP and dS.
        dpT = tl.dot(v, tl.trans(do)).to(tl.float32)
        dsT = pT * (dpT - Di[None, :])
        dsT = dsT.to(tl.bfloat16)
        dk += tl.dot(dsT, tl.trans(qT))
        # Increment pointers.

    if USE_TILE_COMP:
        # Mean-based compensation for Q tiles dropped from the sparse set:
        # approximate each dropped Q tile by its per-tile mean vector.
        k_mean = tl.load(KMean + kv_blk * HEAD_DIM + offs_k).to(tl.float32)
        v_mean = tl.load(VMean + kv_blk * HEAD_DIM + offs_k).to(tl.float32)
        qk_scale = sm_scale * 1.44269504

        for blk_idx in range(dropped_q_blocks * 2):
            q_blk_idx = tl.load(dropped_q_ptr + blk_idx // 2).to(tl.int32)
            half = (blk_idx % 2).to(tl.int32)
            block_sparse_offset = (q_blk_idx * 2 + half) * step_m
            offs_m = start_m + block_sparse_offset + tl.arange(0, BLOCK_M1)
            q_mean = tl.load(QMean + q_blk_idx * HEAD_DIM +
                             offs_k).to(tl.float32)
            m = tl.load(M + offs_m)
            do = tl.load(do_ptrs + block_sparse_offset * stride_tok)
            Di = tl.load(D + offs_m)
            # Loaded but unused below; kept for parity with the dQ path.
            q_block_size = tl.load(variable_block_sizes +
                                   q_blk_idx).to(tl.float32)

            # Single scalar logit per row from the tile means.
            score = tl.sum(q_mean * k_mean, axis=0) * qk_scale
            p = tl.math.exp2(score - m)
            dp = tl.sum(do.to(tl.float32) * v_mean[None, :], axis=1)
            # block_size_f weights the correction by the KV tile's valid count.
            ds = block_size_f * p * (dp - Di)

            dk_mean = tl.sum(ds[:, None] * q_mean[None, :],
                             axis=0) / block_size_f
            dv_mean = tl.sum(p[:, None] * do.to(tl.float32), axis=0)
            dk += dk_mean[None, :]
            dv += dv_mean[None, :]
    return dk, dv
392
+
393
+
394
# the main inner-loop logic for computing dQ
@triton.jit
def _attn_bwd_dq(
        dq,
        q,
        K,
        V,  #
        QMean,
        KMean,
        VMean,
        do,
        m,
        m_vec,
        D,
        # shared by Q/K/V/DO.
        q2k_index,
        q2k_num,
        max_kv_blks,
        dropped_q2k_index,
        dropped_q2k_num,
        max_dropped_kv_blks,
        variable_block_sizes,
        stride_tok,
        stride_d,  #
        H,
        N_CTX,  #
        BLOCK_M2: tl.constexpr,  #
        BLOCK_N2: tl.constexpr,  #
        HEAD_DIM: tl.constexpr,
        # Filled in by the wrapper.
        start_m,
        start_n,
        num_steps,
        sm_scale=1.0,
        IS_QAT: tl.constexpr = False,
        USE_TILE_COMP: tl.constexpr = False):
    """Accumulate dQ for one Q tile over its list of kept KV tiles.

    q2k_index/q2k_num list, per (batch, head, q tile), the 64-token KV tiles
    this Q tile attends to; each is processed as two BLOCK_N2-column halves.
    `m`/`m_vec` are the forward softmax statistics ((BLOCK_M2, 1) and
    (BLOCK_M2,) views of the same values) and `D` (= delta) the per-row O·dO
    products. When USE_TILE_COMP is set, a mean-based correction is added for
    dropped KV tiles. NOTE(review): IS_QAT and num_steps are unused here.
    Returns the updated dq accumulator.
    """
    offs_m = start_m + tl.arange(0, BLOCK_M2)
    offs_n = start_n + tl.arange(0, BLOCK_N2)
    offs_k = tl.arange(0, HEAD_DIM)
    # K and V are loaded transposed: (HEAD_DIM, BLOCK_N2) tiles.
    kT_ptrs = K + offs_n[None, :] * stride_tok + offs_k[:, None] * stride_d
    vT_ptrs = V + offs_n[None, :] * stride_tok + offs_k[:, None] * stride_d
    # D (= delta) is pre-divided by ds_scale.
    Di = tl.load(D + offs_m)
    # BLOCK_M2 must be a multiple of BLOCK_N2, otherwise the code wouldn't work.
    tl.static_assert(BLOCK_M2 % BLOCK_N2 == 0)
    step_n = BLOCK_N2

    q_blk = tl.program_id(0)  # Q-tile index
    off_hz = tl.program_id(2)  # fused (batch, head)
    b = off_hz // H
    h = off_hz % H
    q_tiles = N_CTX // BLOCK_M2
    meta_base = ((b * H + h) * q_tiles + q_blk)

    kv_blocks = tl.load(q2k_num + meta_base)  # int32 count of kept KV tiles
    kv_ptr = q2k_index + meta_base * max_kv_blks  # ptr to list
    dropped_kv_blocks = tl.load(dropped_q2k_num + meta_base)
    dropped_kv_ptr = dropped_q2k_index + meta_base * max_dropped_kv_blks

    # Each listed 64-token KV tile is visited as two BLOCK_N2-column halves.
    for blk_idx in range(kv_blocks * 2):
        kv_idx = tl.load(kv_ptr + blk_idx // 2).to(tl.int32)
        # variable_block_sizes is defined per KV block (tile). Mask must therefore
        # use kv_idx (not q_blk). Also, because we split each 64-token block into
        # two 32-token halves, the mask must account for the half-block offset.
        block_size = tl.load(variable_block_sizes + kv_idx).to(tl.int32)
        half = (blk_idx % 2).to(tl.int32)
        block_sparse_offset = (kv_idx * 2 + half) * step_n * stride_tok
        kT = tl.load(kT_ptrs + block_sparse_offset)
        vT = tl.load(vT_ptrs + block_sparse_offset)
        qk = tl.dot(q, kT)
        # 1.44269504 = 1/ln(2): exp2 then implements exp(sm_scale * qk).
        qk = qk * sm_scale * 1.44269504
        p = tl.math.exp2(qk - m)
        offs_in_block = half * step_n + tl.arange(0, BLOCK_N2)
        mask = offs_in_block < block_size
        p = tl.where(mask[None, :], p, 0.0)
        # Compute dP and dS.
        dp = tl.dot(do, vT).to(tl.float32)
        ds = p * (dp - Di[:, None])
        ds = ds.to(tl.bfloat16)
        # Compute dQ. sm_scale is applied once by the caller at write-back
        # (dq *= sm_scale), so it is intentionally absent here.
        dq += tl.dot(ds, tl.trans(kT))
        # Increment pointers.

    if USE_TILE_COMP:
        # Mean-based correction for KV tiles dropped from the sparse set.
        q_mean = tl.load(QMean + q_blk * HEAD_DIM + offs_k).to(tl.float32)
        q_block_size = tl.load(variable_block_sizes + q_blk).to(tl.float32)
        qk_scale = sm_scale * 1.44269504
        dq_mean = tl.zeros([HEAD_DIM], dtype=tl.float32)

        for blk_idx in range(dropped_kv_blocks):
            kv_idx = tl.load(dropped_kv_ptr + blk_idx).to(tl.int32)
            block_size = tl.load(variable_block_sizes + kv_idx).to(tl.float32)
            k_mean = tl.load(KMean + kv_idx * HEAD_DIM +
                             offs_k).to(tl.float32)
            v_mean = tl.load(VMean + kv_idx * HEAD_DIM +
                             offs_k).to(tl.float32)

            # One scalar logit per row from the tile means, weighted by the
            # dropped tile's valid token count.
            score = tl.sum(q_mean * k_mean, axis=0) * qk_scale
            p = tl.math.exp2(score - m_vec)
            dp = tl.sum(do.to(tl.float32) * v_mean[None, :], axis=1)
            ds = block_size * p * (dp - Di)
            dq_mean = dq_mean + tl.sum(ds, axis=0) * k_mean

        # Distribute the mean correction evenly over this Q tile's rows.
        dq += dq_mean[None, :] / q_block_size
    return dq
500
+
501
+
502
@triton.jit
def _attn_bwd(
        Q,
        K,
        V,
        sm_scale,  #
        DO,  #
        DQ,
        DK,
        DV,  #
        M,
        D,
        q2k_index,
        q2k_num,
        max_kv_blks,
        k2q_index,
        k2q_num,
        max_q_blks,
        variable_block_sizes,
        # shared by Q/K/V/DO.
        stride_z,
        stride_h,
        stride_tok,
        stride_d,  #
        H,
        N_CTX,  #
        BLOCK_M1: tl.constexpr,  #
        BLOCK_N1: tl.constexpr,  #
        BLOCK_M2: tl.constexpr,  #
        BLOCK_N2: tl.constexpr,  #
        HEAD_DIM: tl.constexpr,
        IS_QAT: tl.constexpr = False):
    """Fused backward kernel: one program computes dK/dV for KV tile `pid`
    and dQ for Q tile `pid` (square attention: N_CTX_Q == N_CTX_KV == N_CTX).

    Requires Q/K/V/DO/DQ/DK/DV to share z/h/token/dim strides. Tile-mean
    compensation is disabled (USE_TILE_COMP=False); Q/K/V are passed as
    placeholders for the unused QMean/KMean/VMean arguments, and the kept-tile
    index lists double as the (unused) dropped-tile lists.
    """
    LN2 = 0.6931471824645996  # = ln(2); NOTE(review): currently unused.

    bhid = tl.program_id(2)
    off_chz = (bhid * N_CTX).to(tl.int64)
    adj = (stride_h * (bhid % H) + stride_z * (bhid // H)).to(tl.int64)
    pid = tl.program_id(0)

    # offset pointers for batch/head
    Q += adj
    K += adj
    V += adj
    DO += adj
    DQ += adj
    DK += adj
    DV += adj
    M += off_chz
    D += off_chz

    # load scales
    offs_k = tl.arange(0, HEAD_DIM)

    start_n = pid * BLOCK_N1
    start_m = 0

    offs_n = start_n + tl.arange(0, BLOCK_N1)

    dv = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32)
    dk = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32)

    # load K and V: they stay in SRAM throughout the inner loop.
    k = tl.load(K + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d)
    v = tl.load(V + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d)

    num_steps = N_CTX // BLOCK_M1

    dk, dv = _attn_bwd_dkdv(  #
        dk,
        dv,  #
        Q,
        k,
        v,
        Q,  # QMean placeholder (unused: USE_TILE_COMP=False)
        K,  # KMean placeholder
        V,  # VMean placeholder
        sm_scale,  #
        DO,  #
        M,
        D,  #
        k2q_index,
        k2q_num,
        max_q_blks,
        k2q_index,  # dropped_k2q_index placeholder
        k2q_num,  # dropped_k2q_num placeholder
        max_q_blks,
        variable_block_sizes,
        stride_tok,
        stride_d,  #
        H,
        N_CTX,  #
        BLOCK_M1,
        BLOCK_N1,
        HEAD_DIM,  #
        start_n,
        start_m,
        num_steps,  #
        IS_QAT=IS_QAT,
        USE_TILE_COMP=False,
    )

    dv_ptrs = DV + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d
    tl.store(dv_ptrs, dv)

    # Write back dK.
    dk *= sm_scale
    dk_ptrs = DK + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d
    tl.store(dk_ptrs, dk)

    # THIS BLOCK DOES DQ:
    start_m = pid * BLOCK_M2
    end_n = 0

    offs_m = start_m + tl.arange(0, BLOCK_M2)

    q = tl.load(Q + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d)
    dq = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
    do = tl.load(DO + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d)

    # Forward softmax statistics for this Q tile (row vector + column view).
    m_vec = tl.load(M + offs_m)
    m = m_vec[:, None]

    num_steps = N_CTX // BLOCK_N2
    dq = _attn_bwd_dq(
        dq,
        q,
        K,
        V,  #
        Q,  # QMean placeholder (unused: USE_TILE_COMP=False)
        K,  # KMean placeholder
        V,  # VMean placeholder
        do,
        m,
        m_vec,
        D,  #
        q2k_index,
        q2k_num,
        max_kv_blks,
        q2k_index,  # dropped_q2k_index placeholder
        q2k_num,  # dropped_q2k_num placeholder
        max_kv_blks,
        variable_block_sizes,
        stride_tok,
        stride_d,  #
        H,
        N_CTX,  #
        BLOCK_M2,
        BLOCK_N2,
        HEAD_DIM,  #
        start_m,
        end_n,
        num_steps,  #
        sm_scale=sm_scale,
        IS_QAT=IS_QAT,
        USE_TILE_COMP=False,
    )
    # Write back dQ.
    dq_ptrs = DQ + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d
    dq *= sm_scale
    tl.store(dq_ptrs, dq)
662
+
663
+
664
@triton.jit
def _attn_bwd_dkdv_kernel(
        Q,
        K,
        V,
        QMean,
        KMean,
        VMean,
        sm_scale,  #
        DO,  #
        DK,
        DV,  #
        M,
        D,
        k2q_index,
        k2q_num,
        max_q_blks,
        dropped_k2q_index,
        dropped_k2q_num,
        max_dropped_q_blks,
        variable_block_sizes,
        # shared token/dim strides (assumed contiguous along token and dim)
        stride_tok,
        stride_d,  #
        # batch/head strides (may differ between Q and KV)
        stride_qz,
        stride_qh,
        stride_kz,
        stride_kh,
        stride_vz,
        stride_vh,
        stride_doz,
        stride_doh,
        stride_dkz,
        stride_dkh,
        stride_dvz,
        stride_dvh,
        H,
        N_CTX_Q,
        N_CTX_KV,
        BLOCK_M1: tl.constexpr,  #
        BLOCK_N1: tl.constexpr,  #
        HEAD_DIM: tl.constexpr,
        IS_QAT: tl.constexpr = False,
        USE_TILE_COMP: tl.constexpr = False):
    """
    Backward kernel that computes dK and dV for each KV block (64 tokens).
    Grid:
        pid0: kv_blk in [0, N_CTX_KV/BLOCK_N1)
        pid2: fused (batch, head) in [0, B*H)

    Offsets each tensor to its (batch, head) slice, loads this KV tile once,
    then delegates the accumulation to _attn_bwd_dkdv. dK is scaled by
    sm_scale at write-back (dV is not — P already carries the softmax scale).
    """
    bhid = tl.program_id(2)
    b = bhid // H
    h = bhid % H
    kv_blk = tl.program_id(0)

    # Per-tensor (batch, head) base offsets; Q and KV may use different
    # batch/head strides.
    q_adj = (b.to(tl.int64) * stride_qz + h.to(tl.int64) * stride_qh)
    kv_adj_k = (b.to(tl.int64) * stride_kz + h.to(tl.int64) * stride_kh)
    kv_adj_v = (b.to(tl.int64) * stride_vz + h.to(tl.int64) * stride_vh)
    do_adj = (b.to(tl.int64) * stride_doz + h.to(tl.int64) * stride_doh)
    dk_adj = (b.to(tl.int64) * stride_dkz + h.to(tl.int64) * stride_dkh)
    dv_adj = (b.to(tl.int64) * stride_dvz + h.to(tl.int64) * stride_dvh)

    Q = Q + q_adj
    K = K + kv_adj_k
    V = V + kv_adj_v
    DO = DO + do_adj
    DK = DK + dk_adj
    DV = DV + dv_adj

    # Mean tensors are laid out as (B*H, tiles, HEAD_DIM) contiguously.
    # NOTE(review): q_tiles counts 2*BLOCK_M1-token tiles (BLOCK_M1 is a
    # half-tile), hence the extra // 2.
    q_tiles = N_CTX_Q // BLOCK_M1 // 2
    kv_tiles = N_CTX_KV // BLOCK_N1
    mean_q_adj = (bhid * q_tiles * HEAD_DIM).to(tl.int64)
    mean_kv_adj = (bhid * kv_tiles * HEAD_DIM).to(tl.int64)
    QMean = QMean + mean_q_adj
    KMean = KMean + mean_kv_adj
    VMean = VMean + mean_kv_adj

    # M and D (delta) are always sized by Q length.
    M = M + (bhid * N_CTX_Q).to(tl.int64)
    D = D + (bhid * N_CTX_Q).to(tl.int64)

    offs_k = tl.arange(0, HEAD_DIM)
    start_n = kv_blk * BLOCK_N1
    offs_n = start_n + tl.arange(0, BLOCK_N1)

    # load K and V: they stay in SRAM throughout the inner loop.
    k = tl.load(K + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d)
    v = tl.load(V + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d)

    dv_acc = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32)
    dk_acc = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32)

    num_steps = N_CTX_Q // BLOCK_M1
    dk_acc, dv_acc = _attn_bwd_dkdv(
        dk_acc,
        dv_acc,
        Q,
        k,
        v,
        QMean,
        KMean,
        VMean,
        sm_scale,
        DO,
        M,
        D,
        k2q_index,
        k2q_num,
        max_q_blks,
        dropped_k2q_index,
        dropped_k2q_num,
        max_dropped_q_blks,
        variable_block_sizes,
        stride_tok,
        stride_d,
        H,
        N_CTX_KV,
        BLOCK_M1=BLOCK_M1,
        BLOCK_N1=BLOCK_N1,
        HEAD_DIM=HEAD_DIM,
        start_n=start_n,
        start_m=0,
        num_steps=num_steps,
        IS_QAT=IS_QAT,
        USE_TILE_COMP=USE_TILE_COMP,
    )

    dv_ptrs = DV + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d
    tl.store(dv_ptrs, dv_acc)

    # Apply the softmax temperature once at write-back.
    dk_acc *= sm_scale
    dk_ptrs = DK + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d
    tl.store(dk_ptrs, dk_acc)
798
+
799
+
800
@triton.jit
def _attn_bwd_dq_kernel(
        Q,
        K,
        V,
        QMean,
        KMean,
        VMean,
        DO,  #
        DQ,
        M,
        D,
        q2k_index,
        q2k_num,
        max_kv_blks,
        dropped_q2k_index,
        dropped_q2k_num,
        max_dropped_kv_blks,
        variable_block_sizes,
        # shared token/dim strides (assumed contiguous along token and dim)
        stride_tok,
        stride_d,  #
        # batch/head strides (may differ between Q and KV)
        stride_qz,
        stride_qh,
        stride_kz,
        stride_kh,
        stride_vz,
        stride_vh,
        stride_doz,
        stride_doh,
        stride_dqz,
        stride_dqh,
        H,
        N_CTX_Q,
        sm_scale,
        BLOCK_M2: tl.constexpr,  #
        BLOCK_N2: tl.constexpr,  #
        HEAD_DIM: tl.constexpr,
        IS_QAT: tl.constexpr = False,
        USE_TILE_COMP: tl.constexpr = False):
    """
    Backward kernel that computes dQ for each Q block (64 tokens).
    Grid:
        pid0: q_blk in [0, N_CTX_Q/BLOCK_M2)
        pid2: fused (batch, head) in [0, B*H)

    Offsets tensors to their (batch, head) slice, loads this Q tile, dO and
    the forward softmax stats, and delegates accumulation to _attn_bwd_dq.
    dQ is scaled by sm_scale at write-back.
    """
    LN2 = 0.6931471824645996  # = ln(2); NOTE(review): currently unused.
    bhid = tl.program_id(2)
    b = bhid // H
    h = bhid % H
    q_blk = tl.program_id(0)

    q_adj = (b.to(tl.int64) * stride_qz + h.to(tl.int64) * stride_qh)
    kv_adj_k = (b.to(tl.int64) * stride_kz + h.to(tl.int64) * stride_kh)
    kv_adj_v = (b.to(tl.int64) * stride_vz + h.to(tl.int64) * stride_vh)
    do_adj = (b.to(tl.int64) * stride_doz + h.to(tl.int64) * stride_doh)
    dq_adj = (b.to(tl.int64) * stride_dqz + h.to(tl.int64) * stride_dqh)

    Q = Q + q_adj
    K = K + kv_adj_k
    V = V + kv_adj_v
    DO = DO + do_adj
    DQ = DQ + dq_adj

    q_tiles = N_CTX_Q // BLOCK_M2
    # NOTE(review): assumes 64-token KV tiles and N_CTX_KV == N_CTX_Q here —
    # confirm for cross-attention use.
    kv_tiles = N_CTX_Q // 64
    mean_q_adj = (bhid * q_tiles * HEAD_DIM).to(tl.int64)
    mean_kv_adj = (bhid * kv_tiles * HEAD_DIM).to(tl.int64)
    QMean = QMean + mean_q_adj
    KMean = KMean + mean_kv_adj
    VMean = VMean + mean_kv_adj

    # M and D (delta) are sized by Q length.
    M = M + (bhid * N_CTX_Q).to(tl.int64)
    D = D + (bhid * N_CTX_Q).to(tl.int64)

    offs_k = tl.arange(0, HEAD_DIM)
    start_m = q_blk * BLOCK_M2
    offs_m = start_m + tl.arange(0, BLOCK_M2)

    q = tl.load(Q + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d)
    do = tl.load(DO + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d)
    m_vec = tl.load(M + offs_m)
    m = m_vec[:, None]

    dq_acc = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
    num_steps = 0  # unused in _attn_bwd_dq
    dq_acc = _attn_bwd_dq(
        dq_acc,
        q,
        K,
        V,
        QMean,
        KMean,
        VMean,
        do,
        m,
        m_vec,
        D,
        q2k_index,
        q2k_num,
        max_kv_blks,
        dropped_q2k_index,
        dropped_q2k_num,
        max_dropped_kv_blks,
        variable_block_sizes,
        stride_tok,
        stride_d,
        H,
        N_CTX_Q,
        BLOCK_M2=BLOCK_M2,
        BLOCK_N2=BLOCK_N2,
        HEAD_DIM=HEAD_DIM,
        start_m=start_m,
        start_n=0,
        num_steps=num_steps,
        sm_scale=sm_scale,
        IS_QAT=IS_QAT,
        USE_TILE_COMP=USE_TILE_COMP,
    )

    dq_ptrs = DQ + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d
    dq_acc *= sm_scale
    tl.store(dq_ptrs, dq_acc)
924
+
925
+
926
+ # ──────────────────────────── SPARSE ADDITION BEGIN ───────────────────────────
927
def triton_block_sparse_attn_forward(q, k, v, q2k_index, q2k_num,
                                     variable_block_sizes, is_qat=False,
                                     q_mean=None, k_mean=None, v_mean=None,
                                     dropped_q2k_index=None,
                                     dropped_q2k_num=None):
    """Launch the block-sparse attention forward kernel.

    Args:
        q, k, v: (B, H, T, D) tensors; Tq and Tkv must be multiples of 64.
        q2k_index / q2k_num: per-(batch, head, q-tile) list of kept KV tiles
            and the number of valid entries in that list.
        variable_block_sizes: valid token count of each 64-token KV tile
            (Tkv // 64 entries).
        is_qat: forwarded to the kernel as IS_QAT.
        q_mean / k_mean / v_mean: optional per-tile mean vectors. When given,
            the kernel compensates for dropped KV tiles (USE_TILE_COMP) and
            dropped_q2k_index / dropped_q2k_num must also be provided.

    Returns:
        (o, M, high_prec_o): attention output, per-row base-2 log-sum-exp
        statistics (B, H, Tq), and the high-precision (non-quantized-P)
        output used as a reference path.
    """
    B, H, Tq, D = q.shape
    Tkv = k.shape[2]
    sm_scale = 1.0 / math.sqrt(D)
    max_kv_blks = q2k_index.shape[-1]
    use_tile_comp = q_mean is not None
    if use_tile_comp:
        assert k_mean is not None and v_mean is not None
        assert dropped_q2k_index is not None and dropped_q2k_num is not None
        q_mean = q_mean.contiguous()
        k_mean = k_mean.contiguous()
        v_mean = v_mean.contiguous()
        max_dropped_kv_blks = dropped_q2k_index.shape[-1]
    else:
        # No tile compensation: pass valid dummy pointers so the kernel
        # signature stays uniform; USE_TILE_COMP=False keeps them unused.
        q_mean = q
        k_mean = k
        v_mean = v
        dropped_q2k_index = q2k_index
        dropped_q2k_num = q2k_num
        max_dropped_kv_blks = max_kv_blks
    assert Tq % 64 == 0, f"q length must be a multiple of 64, but got {Tq}"
    assert Tkv % 64 == 0, f"kv length must be a multiple of 64, but got {Tkv}"
    # Fixed: the failure message previously printed q2k_num.shape[-2] even
    # though the check inspects shape[-1], yielding a misleading diagnostic.
    assert q2k_num.shape[
        -1] == Tq // 64, f"shape mismatch, Tq // 64 = {Tq // 64}, q2k_num.shape[-1] = {q2k_num.shape[-1]}"
    assert variable_block_sizes.numel() == Tkv // 64, (
        f"shape mismatch, variable_block_sizes must have length {Tkv // 64}, "
        f"got {variable_block_sizes.numel()}"
    )
    o = torch.empty_like(q)
    high_prec_o = torch.empty_like(q)
    M = torch.empty((B, H, Tq), dtype=torch.float32, device=q.device)

    # One program per 64-token Q tile per (batch, head).
    grid = lambda _: (triton.cdiv(Tq, 64), B * H, 1)
    _attn_fwd_sparse[grid](q,
                           k,
                           v,
                           q_mean,
                           k_mean,
                           v_mean,
                           sm_scale,
                           q2k_index,
                           q2k_num,
                           max_kv_blks,
                           dropped_q2k_index,
                           dropped_q2k_num,
                           max_dropped_kv_blks,
                           variable_block_sizes,
                           M,
                           o,
                           high_prec_o,
                           q.stride(0),
                           q.stride(1),
                           q.stride(2),
                           q.stride(3),
                           k.stride(0),
                           k.stride(1),
                           k.stride(2),
                           k.stride(3),
                           v.stride(0),
                           v.stride(1),
                           v.stride(2),
                           v.stride(3),
                           o.stride(0),
                           o.stride(1),
                           o.stride(2),
                           o.stride(3),
                           B,
                           H,
                           Tq,
                           Tkv,
                           HEAD_DIM=D,
                           STAGE=3,
                           IS_QAT=is_qat,
                           USE_TILE_COMP=use_tile_comp)

    return o, M, high_prec_o
1007
+
1008
+
1009
def triton_block_sparse_attn_backward(do, q, k, v, o, M, q2k_index, q2k_num,
                                      k2q_index, k2q_num, variable_block_sizes,
                                      is_qat=False, q_mean=None, k_mean=None,
                                      v_mean=None, dropped_q2k_index=None,
                                      dropped_q2k_num=None,
                                      dropped_k2q_index=None,
                                      dropped_k2q_num=None):
    """Launch the block-sparse attention backward pass.

    Runs three kernels: the delta preprocess (O·dO per row), the dK/dV kernel
    (one program per KV tile) and the dQ kernel (one program per Q tile).

    Args:
        do: gradient of the output; must be contiguous.
        q, k, v, o, M: forward inputs/outputs ((B, H, T, D) tensors; M holds
            the forward per-row base-2 log-sum-exp statistics).
        q2k_index / q2k_num, k2q_index / k2q_num: kept-tile lists in both
            directions (Q→KV and KV→Q).
        variable_block_sizes: valid token count of each 64-token KV tile.
        is_qat: forwarded to the kernels as IS_QAT.
        q_mean / k_mean / v_mean and dropped_* lists: optional tile-mean
            compensation inputs; all must be provided together.

    Returns:
        (dq, dk, dv) with the same shapes as (q, k, v).

    Cleanups vs. previous revision: removed the unused local RCP_LN2 and the
    redundant re-unpack of batch/head counts (B, H already hold them).
    """
    assert do.is_contiguous()

    B, H, Tq, D = q.shape
    Tkv = k.shape[2]
    sm_scale = 1.0 / math.sqrt(D)
    dq = torch.empty_like(q)
    dk = torch.empty_like(k)
    dv = torch.empty_like(v)
    BLOCK_M1, BLOCK_N1, BLOCK_M2, BLOCK_N2 = 32, 64, 64, 32
    # Ours-P mode keeps K unscaled and applies sm_scale inside the bwd kernels.
    arg_k = k
    PRE_BLOCK = 64
    assert Tq % PRE_BLOCK == 0
    pre_grid = (Tq // PRE_BLOCK, B * H)
    delta = torch.empty_like(M)
    _attn_bwd_preprocess[pre_grid](
        o,
        do,  #
        delta,  #
        B,
        H,
        Tq,  #
        BLOCK_M=PRE_BLOCK,
        HEAD_DIM=D  #
    )

    max_q_blks = k2q_index.shape[-1]
    max_kv_blks = q2k_index.shape[-1]
    use_tile_comp = q_mean is not None
    if use_tile_comp:
        assert k_mean is not None and v_mean is not None
        assert dropped_q2k_index is not None and dropped_q2k_num is not None
        assert dropped_k2q_index is not None and dropped_k2q_num is not None
        q_mean = q_mean.contiguous()
        k_mean = k_mean.contiguous()
        v_mean = v_mean.contiguous()
        max_dropped_kv_blks = dropped_q2k_index.shape[-1]
        max_dropped_q_blks = dropped_k2q_index.shape[-1]
    else:
        # No tile compensation: pass valid dummy pointers; the kernels leave
        # them untouched when USE_TILE_COMP is False.
        q_mean = q
        k_mean = k
        v_mean = v
        dropped_q2k_index = q2k_index
        dropped_q2k_num = q2k_num
        dropped_k2q_index = k2q_index
        dropped_k2q_num = k2q_num
        max_dropped_kv_blks = max_kv_blks
        max_dropped_q_blks = max_q_blks

    # dK/dV kernel: grid over KV blocks
    grid_kv = (Tkv // BLOCK_N1, 1, B * H)
    _attn_bwd_dkdv_kernel[grid_kv](
        q,
        arg_k,
        v,
        q_mean,
        k_mean,
        v_mean,
        sm_scale,
        do,
        dk,
        dv,
        M,
        delta,
        k2q_index,
        k2q_num,
        max_q_blks,
        dropped_k2q_index,
        dropped_k2q_num,
        max_dropped_q_blks,
        variable_block_sizes,
        q.stride(2),
        q.stride(3),
        q.stride(0),
        q.stride(1),
        arg_k.stride(0),
        arg_k.stride(1),
        v.stride(0),
        v.stride(1),
        do.stride(0),
        do.stride(1),
        dk.stride(0),
        dk.stride(1),
        dv.stride(0),
        dv.stride(1),
        H,
        Tq,
        Tkv,
        BLOCK_M1=BLOCK_M1,
        BLOCK_N1=BLOCK_N1,
        HEAD_DIM=D,
        IS_QAT=is_qat,
        USE_TILE_COMP=use_tile_comp,
    )

    # dQ kernel: grid over Q blocks
    grid_q = (Tq // BLOCK_M2, 1, B * H)
    _attn_bwd_dq_kernel[grid_q](
        q,
        arg_k,
        v,
        q_mean,
        k_mean,
        v_mean,
        do,
        dq,
        M,
        delta,
        q2k_index,
        q2k_num,
        max_kv_blks,
        dropped_q2k_index,
        dropped_q2k_num,
        max_dropped_kv_blks,
        variable_block_sizes,
        q.stride(2),
        q.stride(3),
        q.stride(0),
        q.stride(1),
        arg_k.stride(0),
        arg_k.stride(1),
        v.stride(0),
        v.stride(1),
        do.stride(0),
        do.stride(1),
        dq.stride(0),
        dq.stride(1),
        H,
        Tq,
        sm_scale,
        BLOCK_M2=BLOCK_M2,
        BLOCK_N2=BLOCK_N2,
        HEAD_DIM=D,
        IS_QAT=is_qat,
        USE_TILE_COMP=use_tile_comp,
    )

    return dq, dk, dv
standalone_inference/overlay_files/fastvideo-kernel/python/fastvideo_kernel/triton_kernels/nvfp4_utils.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Adapted from https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/numerics_details/mxfp_details/_upcast_from_mxfp.py
3
+ # and https://github.com/triton-lang/triton/blob/main/python/triton_kernels/triton_kernels/numerics_details/mxfp_details/_downcast_to_mxfp.py
4
+
5
+ import triton
6
+ import triton.language as tl
7
+ try:
8
+ from triton.language.target_info import cuda_capability_geq
9
+ _HAS_CAPABILITY_CHECK = True
10
+ except ImportError:
11
+ cuda_capability_geq = None
12
+ _HAS_CAPABILITY_CHECK = False
13
+
14
+ MXFP_BLOCK_SIZE = tl.constexpr(16)
15
+
16
@triton.jit
def _compute_quant_and_scale(
    src_tensor,
    valid_src_mask,
    mx_tensor_dtype: tl.constexpr = tl.uint8,
    use_global_sf=True,
    two_level_quant_P=False,
    IS_BLACKWELL: tl.constexpr = False,
):
    """Quantize a (OUT_DIM, QUANT_DIM) tile to NVFP4/FP8 with per-16 scales.

    Each group of MXFP_BLOCK_SIZE (=16) values along the last axis gets one
    e4m3 decode scale. An optional outer scale is applied first: globally over
    the tile (use_global_sf) or per-row (two_level_quant_P). For the fp4
    target (mx_tensor_dtype == tl.uint8) two e2m1 values are packed per byte,
    either via a Blackwell PTX cvt instruction or a manual RTNE bit-level
    conversion.

    Returns:
        (out_tensor, dequant_scale, s_dec): packed/casted values, the e4m3
        per-group decode scales, and the outer decode scale (1.0 when no
        outer scale is used).
    """
    BLOCK_SIZE_OUT_DIM: tl.constexpr = src_tensor.shape[0]
    BLOCK_SIZE_QUANT_DIM: tl.constexpr = src_tensor.shape[1]
    BLOCK_SIZE_QUANT_MX_SCALE: tl.constexpr = src_tensor.shape[1] // MXFP_BLOCK_SIZE
    is_fp4: tl.constexpr = mx_tensor_dtype == tl.uint8

    is_fp8e4: tl.constexpr = mx_tensor_dtype == tl.float8e4nv
    is_fp8e5: tl.constexpr = mx_tensor_dtype == tl.float8e5
    tl.static_assert(
        is_fp4 or (is_fp8e4 or is_fp8e5),
        "mx_tensor_dtype must be uint8, float8e4nv, or float8e5",
    )

    # Explicit cast to fp32 since most ops are not supported on bfloat16. We avoid needless conversions to and from bf16
    f32_tensor = src_tensor.to(tl.float32)
    abs_tensor = tl.abs(f32_tensor)
    abs_tensor = tl.where(valid_src_mask, abs_tensor, -1.0)  # Don't consider padding tensors in scale computation

    if two_level_quant_P:
        # row max from SageAttn3 paper
        global_max_val = tl.max(f32_tensor, axis=1, keep_dims=True)  # (BLOCK_SIZE_OUT_DIM, 1)
        global_max_val = tl.maximum(global_max_val, 1e-8)
        # 6 * 448: max e2m1 magnitude (6) times max e4m3 magnitude (448).
        s_enc = ((6 * 448) / global_max_val).reshape([BLOCK_SIZE_OUT_DIM, 1, 1])
        s_dec = (1 / s_enc)

    abs_tensor = tl.reshape(abs_tensor, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE])

    if use_global_sf and not two_level_quant_P:
        global_max_val = tl.max(abs_tensor)
        # Avoid division by zero: if all values are padding (max is 0), use a default scale
        global_max_val = tl.maximum(global_max_val, 1e-8)
        s_enc = (6 * 448) / global_max_val
        s_dec = (1 / s_enc)
    elif not two_level_quant_P and not use_global_sf:
        # No outer scale: encode/decode factors are identity.
        s_dec = 1.0
        s_enc = 1.0

    max_val = tl.max(abs_tensor, axis=2, keep_dims=True)  # (BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, 1) # per block maxima
    # Per-group decode scale maps the group max onto the e2m1 max value 6.
    s_dec_b = max_val / 6  # (BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, 1)
    s_dec_b_e4m3 = (s_dec_b * s_enc).to(tl.float8e4nv)  # (BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, 1)
    # Invert the *rounded* e4m3 scale so quantization matches dequantization.
    s_enc_b = 1 / (s_dec_b_e4m3.to(tl.float32) * s_dec)  # (BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, 1)

    f32_tensor = tl.reshape(f32_tensor, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE])
    quant_tensor = f32_tensor * s_enc_b

    # Reshape the tensors after scaling
    quant_tensor = quant_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM])
    # Set the invalid portions of the tensor to 0. This will ensure that any padding tensors are 0 in the mx format.
    quant_tensor = tl.where(valid_src_mask, quant_tensor, 0.0)
    dequant_scale = s_dec_b_e4m3.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE])

    if is_fp4 and IS_BLACKWELL:
        # Convert scaled values to two f32 lanes and use PTX cvt to e2m1x2 with two f32 operands.
        pairs = tl.reshape(quant_tensor, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM // 2, 2])
        lo_f, hi_f = tl.split(pairs)
        lo_f32 = lo_f.to(tl.float32)
        hi_f32 = hi_f.to(tl.float32)

        # Inline PTX: cvt.rn.satfinite.e2m1x2.f32 takes two f32 sources and produces one .b8 packed e2m1x2.
        out_tensor = tl.inline_asm_elementwise(
            """
            {
                .reg .b8 r;
                cvt.rn.satfinite.e2m1x2.f32 r, $1, $2;
                mov.b32 $0, {r, r, r, r};
            }
            """,
            constraints="=r,f,f",
            args=[hi_f32, lo_f32],
            dtype=tl.uint8,
            is_pure=True,
            pack=1,
        )
    elif is_fp4:
        # Manual fp32 -> e2m1 conversion with round-to-nearest-even, operating
        # directly on the IEEE-754 bit pattern.
        quant_tensor = quant_tensor.to(tl.uint32, bitcast=True)
        signs = quant_tensor & 0x80000000
        exponents = (quant_tensor >> 23) & 0xFF
        mantissas_orig = (quant_tensor & 0x7FFFFF)

        # For RTNE: 0.25 < x < 0.75 maps to 0.5 (denormal); exactly 0.25 maps to 0.0
        E8_BIAS = 127
        E2_BIAS = 1
        # Move implicit bit 1 at the beginning to mantissa for denormals
        is_subnormal = exponents < E8_BIAS
        adjusted_exponents = tl.core.sub(E8_BIAS, exponents + 1, sanitize_overflow=False)
        mantissas_pre = (0x400000 | (mantissas_orig >> 1))
        mantissas = tl.where(is_subnormal, mantissas_pre >> adjusted_exponents, mantissas_orig)

        # For normal numbers, we change the bias from 127 to 1, and for subnormals, we keep exponent as 0.
        exponents = tl.maximum(exponents, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS)

        # Combine sign, exponent, and mantissa, while saturating
        # Round to nearest, ties to even (RTNE): use guard/sticky and LSB to decide increment
        m2bits = mantissas >> 21
        lsb_keep = (m2bits >> 1) & 0x1
        guard = m2bits & 0x1
        IS_SRC_FP32: tl.constexpr = src_tensor.dtype == tl.float32
        if IS_SRC_FP32:
            # fp32 sources carry extra mantissa bits; fold the bits dropped by
            # the subnormal shift into the sticky flag so rounding stays RTNE.
            bit0_dropped = (mantissas_orig & 0x1) != 0
            mask = (1 << tl.minimum(adjusted_exponents, 31)) - 1
            dropped_post = (mantissas_pre & mask) != 0
            sticky = is_subnormal & (bit0_dropped | dropped_post)
            sticky |= ((mantissas & 0x1FFFFF) != 0).to(tl.uint32)
        else:
            sticky = ((mantissas & 0x1FFFFF) != 0).to(tl.uint32)
        round_inc = guard & (sticky | lsb_keep)
        e2m1_tmp = tl.minimum((((exponents << 2) | m2bits) + round_inc) >> 1, 0x7)
        e2m1_value = ((signs >> 28) | e2m1_tmp).to(tl.uint8)

        # Pack two e2m1 nibbles per byte: even index -> low nibble.
        e2m1_value = tl.reshape(e2m1_value, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM // 2, 2])
        evens, odds = tl.split(e2m1_value)
        out_tensor = evens | (odds << 4)
    else:
        # FP8 targets: a plain dtype cast suffices.
        out_tensor = quant_tensor.to(mx_tensor_dtype)

    return out_tensor, dequant_scale, s_dec
+
141
@triton.jit
def _compute_dequant(
    mx_tensor,
    scale,
    s_dec,
    BLOCK_SIZE_OUT_DIM: tl.constexpr,
    BLOCK_SIZE_QUANT_DIM: tl.constexpr,
    dst_dtype: tl.constexpr,
    IS_BLACKWELL: tl.constexpr = False,
):
    """Dequantize an MX/NVFP4-quantized tile back to ``dst_dtype``.

    Args:
        mx_tensor: quantized values; ``tl.uint8`` means two packed fp4 e2m1
            values per byte, otherwise a float8 or the destination dtype.
        scale: per-group decode scale, must be ``tl.float8e4nv``.
        s_dec: additional global (tensor-level) decode scale factor.
        BLOCK_SIZE_OUT_DIM / BLOCK_SIZE_QUANT_DIM: tile shape; the quant dim
            must be a multiple of MXFP_BLOCK_SIZE (one scale per group).
        dst_dtype: one of float16 / bfloat16 / float32.
        IS_BLACKWELL: use the hardware e2m1x2 -> f16x2 PTX conversion instead
            of the software bit-manipulation path.

    Returns:
        The dequantized [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM] tile in
        ``dst_dtype``, clamped to the destination type's finite range.
    """
    tl.static_assert(BLOCK_SIZE_QUANT_DIM % MXFP_BLOCK_SIZE == 0, f"Block size along quantization block must be a multiple of {MXFP_BLOCK_SIZE=}")
    # uint8 signifies two fp4 e2m1 values packed into a single byte
    mx_tensor_dtype: tl.constexpr = mx_tensor.dtype
    _is_f16: tl.constexpr = dst_dtype == tl.float16
    _is_bf16: tl.constexpr = dst_dtype == tl.bfloat16
    _is_f32: tl.constexpr = dst_dtype == tl.float32
    tl.static_assert(_is_f16 or (_is_bf16 or _is_f32))
    _is_u8: tl.constexpr = mx_tensor_dtype == tl.uint8
    _is_e4: tl.constexpr = mx_tensor_dtype == tl.float8e4nv
    _is_e5: tl.constexpr = mx_tensor_dtype == tl.float8e5
    _is_dst: tl.constexpr = mx_tensor_dtype == dst_dtype
    tl.static_assert(
        _is_u8 or ((_is_e4 or _is_e5) or _is_dst),
        "mx_tensor_ptr must be uint8 or float8 or dst_dtype")
    tl.static_assert(scale.dtype == tl.float8e4nv, "scale must be float8e4nv")

    # uint8 payload means packed fp4 (two e2m1 values per byte).
    is_fp4: tl.constexpr = mx_tensor_dtype == tl.uint8
    BLOCK_SIZE_QUANT_MX_SCALE: tl.constexpr = BLOCK_SIZE_QUANT_DIM // MXFP_BLOCK_SIZE

    # Upcast the scale to the destination type.
    if dst_dtype == tl.bfloat16:
        dst_scale = scale.to(tl.bfloat16)
    else:
        dst_scale = scale.to(tl.float32)
        # float8e4nv has no direct cast to float16 here; go through float32.
        if dst_dtype == tl.float16:
            dst_scale = dst_scale.to(tl.float16)

    # Now upcast the tensor. For a float32 destination we widen via bfloat16
    # first (fp4 fits losslessly in bf16), then cast up at the end.
    intermediate_dtype: tl.constexpr = tl.bfloat16 if dst_dtype == tl.float32 else dst_dtype
    if IS_BLACKWELL:
        # Hardware path: only the packed-fp4 encoding is supported here.
        assert is_fp4
        # cvt.rn.f16x2.e2m1x2 expands one packed byte into two float16 lanes.
        packed_u32 = tl.inline_asm_elementwise(
            asm="""
            {
                .reg .b8 in_8;
                .reg .f16x2 out;
                cvt.u8.u32 in_8, $1;
                cvt.rn.f16x2.e2m1x2 out, in_8;
                mov.b32 $0, out;
            }
            """,
            constraints="=r,r",
            args=[mx_tensor],  # tl.uint8 passed in as a 32-bit reg with value in low 8 bits
            dtype=tl.uint32,
            is_pure=True,
            pack=1,
        )
        # Split the f16x2 result into its low/high 16-bit halves and bitcast.
        lo_u16 = (packed_u32 & 0xFFFF).to(tl.uint16)
        hi_u16 = (packed_u32 >> 16).to(tl.uint16)
        lo_f16 = lo_u16.to(tl.float16, bitcast=True)
        hi_f16 = hi_u16.to(tl.float16, bitcast=True)

        if intermediate_dtype == tl.float16:
            x0, x1 = lo_f16, hi_f16
        else:
            x0 = lo_f16.to(intermediate_dtype)
            x1 = hi_f16.to(intermediate_dtype)

        # Restore the original element order (even/odd lanes were packed).
        dst_tensor = tl.interleave(x0, x1)

    else:
        # Software path: decode the two e2m1 nibbles of each byte with
        # integer bit manipulation. Only the packed-fp4 encoding is handled.
        assert is_fp4
        dst_bias: tl.constexpr = 127 if intermediate_dtype == tl.bfloat16 else 15  # exponent bias
        dst_0p5: tl.constexpr = 16128 if intermediate_dtype == tl.bfloat16 else 0x3800
        dst_m_bits: tl.constexpr = 7 if intermediate_dtype == tl.bfloat16 else 10  # mantissa bits
        # e2m1: low nibble = element 0, high nibble = element 1.
        em0 = mx_tensor & 0x07
        em1 = mx_tensor & 0x70
        # Place exponent+mantissa and sign bits into the destination layout.
        x0 = (em0.to(tl.uint16) << (dst_m_bits - 1)) | ((mx_tensor & 0x08).to(tl.uint16) << 12)
        x1 = (em1.to(tl.uint16) << (dst_m_bits - 5)) | ((mx_tensor & 0x80).to(tl.uint16) << 8)
        # Three cases:
        # 1) x is normal and non-zero: Correct bias
        x0 = tl.where((em0 & 0x06) != 0, x0 + ((dst_bias - 1) << dst_m_bits), x0)
        x1 = tl.where((em1 & 0x60) != 0, x1 + ((dst_bias - 1) << dst_m_bits), x1)
        # 2) x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
        x0 = tl.where(em0 == 0x01, dst_0p5 | (x0 & 0x8000), x0)
        x1 = tl.where(em1 == 0x10, dst_0p5 | (x1 & 0x8000), x1)
        # 3) x is zero, do nothing
        dst_tensor = tl.interleave(x0, x1).to(intermediate_dtype, bitcast=True)

    dst_tensor = dst_tensor.to(dst_dtype)

    # Reshape for proper broadcasting: one scale entry per MXFP_BLOCK_SIZE-wide
    # group along the quantization dimension.
    dst_tensor = dst_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE])
    dst_scale = dst_scale.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, 1])
    # NOTE(review): `scale` is reshaped but never used after this point in
    # this function — looks dead; confirm before removing.
    scale = scale.reshape(dst_scale.shape)

    out_tensor = dst_tensor * dst_scale * s_dec  # NVFP4 has the additional global scale factor
    # Clamp to the destination type's largest finite value so the scale
    # multiply cannot produce inf in the output.
    if dst_dtype == tl.float32:
        max_fin = 3.4028234663852886e+38
    elif dst_dtype == tl.bfloat16:
        max_fin = 3.3895313892515355e+38
    else:
        tl.static_assert(dst_dtype == tl.float16)
        max_fin = 65504
    out_tensor = tl.clamp(out_tensor, min=-max_fin, max=max_fin)
    out_tensor = out_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM])
    out_tensor = out_tensor.to(dst_dtype)
    return out_tensor
standalone_inference/overlay_files/fastvideo-kernel/python/fastvideo_kernel/triton_kernels/quant_utils.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import triton
2
+ import triton.language as tl
3
+
4
+ from .nvfp4_utils import _compute_quant_and_scale, _compute_dequant
5
+
6
@triton.jit
def fake_quantize(src_tensor, valid_src_mask, BLOCK_SIZE_OUT_DIM: tl.constexpr,
                  BLOCK_SIZE_QUANT_DIM: tl.constexpr,
                  dst_dtype: tl.constexpr,
                  mx_tensor_dtype: tl.constexpr = tl.uint8,
                  use_global_sf: tl.constexpr = True,
                  two_level_quant_P: tl.constexpr = False):
    """Quantize a tile to the MX/NVFP4 format and immediately dequantize it.

    Returns a pair: the round-tripped (quantization-noise-injected) tile, and
    the original tile cast to the round-tripped tile's dtype.
    """
    original = src_tensor
    packed, block_scale, global_dec = _compute_quant_and_scale(
        src_tensor=src_tensor,
        valid_src_mask=valid_src_mask,
        mx_tensor_dtype=mx_tensor_dtype,
        use_global_sf=use_global_sf,
        two_level_quant_P=two_level_quant_P)
    roundtripped = _compute_dequant(
        mx_tensor=packed,
        scale=block_scale,
        s_dec=global_dec,
        BLOCK_SIZE_OUT_DIM=BLOCK_SIZE_OUT_DIM,
        BLOCK_SIZE_QUANT_DIM=BLOCK_SIZE_QUANT_DIM,
        dst_dtype=dst_dtype)
    return roundtripped, original.to(roundtripped.dtype)
26
+
27
@triton.jit
def fake_quantize_q(Q, fake_Q, stride_z_q, stride_h_q,
                    stride_tok_q, stride_d_q,
                    fake_stride_z_q, fake_stride_h_q,
                    fake_stride_tok_q, fake_stride_d_q,
                    H, N_CTX_Q,
                    BLOCK_M: tl.constexpr,
                    HEAD_DIM: tl.constexpr,
                    use_global_sf: tl.constexpr = True):
    """Fake-quantize one BLOCK_M x HEAD_DIM tile of Q into fake_Q.

    Grid axis 0 tiles the token dimension; axis 1 enumerates (batch, head)
    pairs flattened as batch * H + head.
    """
    head_batch = tl.program_id(1)
    batch_idx = head_batch // H
    head_idx = head_batch % H
    # Advance both base pointers to this (batch, head) slice.
    Q += stride_z_q * batch_idx + stride_h_q * head_idx
    fake_Q += fake_stride_z_q * batch_idx + fake_stride_h_q * head_idx

    row_start = tl.program_id(0) * BLOCK_M
    rows = row_start + tl.arange(0, BLOCK_M)
    cols = tl.arange(0, HEAD_DIM)
    in_bounds = rows < N_CTX_Q

    q_tile = tl.load(Q + rows[:, None] * stride_tok_q + cols[None, :] * stride_d_q,
                     mask=in_bounds[:, None], other=0.0)
    q_tile, _ = fake_quantize(src_tensor=q_tile,
                              valid_src_mask=in_bounds[:, None],
                              BLOCK_SIZE_OUT_DIM=BLOCK_M,
                              BLOCK_SIZE_QUANT_DIM=HEAD_DIM,
                              dst_dtype=q_tile.dtype,
                              use_global_sf=use_global_sf)
    tl.store(fake_Q + rows[:, None] * fake_stride_tok_q + cols[None, :] * fake_stride_d_q,
             q_tile, mask=in_bounds[:, None])
51
+
52
@triton.jit
def fake_quantize_kv(K, V, fake_K, fake_V, stride_z_kv, stride_h_kv,
                     stride_tok_kv, stride_d_kv,
                     fake_stride_z_kv, fake_stride_h_kv,
                     fake_stride_tok_kv, fake_stride_d_kv,
                     H, N_CTX_KV,
                     BLOCK_N: tl.constexpr,
                     HEAD_DIM: tl.constexpr,
                     use_global_sf: tl.constexpr = True):
    """Fake-quantize one BLOCK_N x HEAD_DIM tile of K and V into fake_K/fake_V.

    K and V share one set of source strides; the fake buffers use their own
    strides. Grid axis 1 enumerates flattened (batch, head) pairs.
    """
    head_batch = tl.program_id(1)
    batch_idx = head_batch // H
    head_idx = head_batch % H
    src_offset = stride_z_kv * batch_idx + stride_h_kv * head_idx
    dst_offset = fake_stride_z_kv * batch_idx + fake_stride_h_kv * head_idx
    K += src_offset
    V += src_offset
    fake_K += dst_offset
    fake_V += dst_offset

    row_start = tl.program_id(0) * BLOCK_N
    rows = row_start + tl.arange(0, BLOCK_N)
    cols = tl.arange(0, HEAD_DIM)
    in_bounds = rows < N_CTX_KV

    src_ptrs = rows[:, None] * stride_tok_kv + cols[None, :] * stride_d_kv
    k_tile = tl.load(K + src_ptrs, mask=in_bounds[:, None], other=0.0)
    v_tile = tl.load(V + src_ptrs, mask=in_bounds[:, None], other=0.0)
    k_fq, _ = fake_quantize(src_tensor=k_tile,
                            valid_src_mask=in_bounds[:, None],
                            BLOCK_SIZE_OUT_DIM=BLOCK_N,
                            BLOCK_SIZE_QUANT_DIM=HEAD_DIM,
                            dst_dtype=k_tile.dtype,
                            use_global_sf=use_global_sf)
    v_fq, _ = fake_quantize(src_tensor=v_tile,
                            valid_src_mask=in_bounds[:, None],
                            BLOCK_SIZE_OUT_DIM=BLOCK_N,
                            BLOCK_SIZE_QUANT_DIM=HEAD_DIM,
                            dst_dtype=v_tile.dtype,
                            use_global_sf=use_global_sf)
    dst_ptrs = rows[:, None] * fake_stride_tok_kv + cols[None, :] * fake_stride_d_kv
    tl.store(fake_K + dst_ptrs, k_fq, mask=in_bounds[:, None])
    tl.store(fake_V + dst_ptrs, v_fq, mask=in_bounds[:, None])
standalone_inference/overlay_files/fastvideo/api/compat.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ from __future__ import annotations
3
+
4
+ from collections.abc import Mapping
5
+ from copy import deepcopy
6
+ from dataclasses import fields, is_dataclass
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from fastvideo.api.overrides import apply_overrides, parse_cli_overrides
11
+ from fastvideo.api.parser import config_to_dict, load_raw_config, parse_config
12
+ from fastvideo.api.schema import (
13
+ GenerationRequest,
14
+ GeneratorConfig,
15
+ InputConfig,
16
+ OutputConfig,
17
+ RequestRuntimeConfig,
18
+ SamplingConfig,
19
+ )
20
+ from fastvideo.configs.sample import SamplingParam
21
+ from fastvideo.fastvideo_args import FastVideoArgs
22
+ from fastvideo.utils import shallow_asdict
23
+
24
+ _EXPLICIT_REQUEST_ATTR = "_fastvideo_explicit_request"
25
+ _INPUT_FIELD_NAMES = {field.name for field in fields(InputConfig)}
26
+ _SAMPLING_FIELD_NAMES = {field.name for field in fields(SamplingConfig)}
27
+ _RUNTIME_FIELD_NAMES = {field.name for field in fields(RequestRuntimeConfig)}
28
+ _OUTPUT_FIELD_NAMES = {field.name for field in fields(OutputConfig)}
29
+ _MISSING = object()
30
+ _LEGACY_REQUEST_ALIASES = {
31
+ "neg_prompt": "negative_prompt",
32
+ }
33
+ _REQUEST_PIPELINE_OVERRIDE_FIELDS = frozenset({
34
+ "embedded_cfg_scale",
35
+ })
36
+
37
+
38
def normalize_generator_config(config: GeneratorConfig | Mapping[str, Any], ) -> GeneratorConfig:
    """Coerce *config* into a GeneratorConfig, parsing a raw mapping if needed."""
    if isinstance(config, GeneratorConfig):
        return config
    parsed = parse_config(GeneratorConfig, config)
    return parsed
42
+
43
+
44
def load_generator_config_from_file(
    path: str | Path,
    overrides: list[str] | Mapping[str, Any] | None = None,
) -> GeneratorConfig:
    """Load a GeneratorConfig from a config file, applying optional overrides.

    Two document shapes are accepted: a run/serve config carrying a top-level
    ``generator`` section, or a bare generator config. For a bare config,
    overrides written with a uniform ``generator.`` prefix are unprefixed
    before being applied.
    """
    raw = load_raw_config(path)
    parsed_overrides = _normalize_overrides(overrides)

    if _looks_like_run_or_serve_config(raw):
        # Run/serve document: overrides address the whole document, then we
        # parse only the generator section.
        if parsed_overrides:
            raw = apply_overrides(raw, parsed_overrides)
        return parse_config(GeneratorConfig, raw["generator"])

    if parsed_overrides:
        prefix = "generator."
        if all(key.startswith(prefix) for key in parsed_overrides):
            parsed_overrides = {
                key[len(prefix):]: value
                for key, value in parsed_overrides.items()
            }
        raw = apply_overrides(raw, parsed_overrides)

    return parse_config(GeneratorConfig, raw)
63
+
64
+
65
def legacy_from_pretrained_to_config(
    model_path: str,
    kwargs: Mapping[str, Any],
) -> GeneratorConfig:
    """Translate legacy ``from_pretrained``-style keyword arguments into the
    structured GeneratorConfig schema.

    Known legacy keys are routed to their new nested sections (engine,
    parallelism, offload, compile, quantization, pipeline, components); any
    unrecognized key is preserved under ``pipeline.experimental`` so older
    call sites keep working.

    Args:
        model_path: model identifier/path; always stored at the top level.
        kwargs: the legacy keyword arguments to translate.

    Returns:
        The parsed GeneratorConfig built from the translated mapping.
    """
    # Accumulators for each nested section of the new schema; only non-empty
    # sections are attached to the final raw mapping.
    raw: dict[str, Any] = {"model_path": model_path}
    engine: dict[str, Any] = {}
    parallelism: dict[str, Any] = {}
    offload: dict[str, Any] = {}
    compile_config: dict[str, Any] = {}
    pipeline: dict[str, Any] = {}
    components: dict[str, Any] = {}
    quantization: dict[str, Any] = {}
    experimental: dict[str, Any] = {}

    for key, value in kwargs.items():
        if key == "revision":
            raw["revision"] = value
        elif key == "trust_remote_code":
            raw["trust_remote_code"] = value
        elif key == "num_gpus":
            engine["num_gpus"] = value
        elif key == "distributed_executor_backend":
            # Renamed in the new schema.
            engine["execution_backend"] = value
        elif key in {"tp_size", "sp_size", "hsdp_replicate_dim", "hsdp_shard_dim", "dist_timeout"}:
            parallelism[key] = value
        elif key == "dit_cpu_offload":
            offload["dit"] = value
        elif key == "dit_layerwise_offload":
            offload["dit_layerwise"] = value
        elif key == "text_encoder_cpu_offload":
            offload["text_encoder"] = value
        elif key == "image_encoder_cpu_offload":
            offload["image_encoder"] = value
        elif key == "vae_cpu_offload":
            offload["vae"] = value
        elif key == "pin_cpu_memory":
            offload["pin_cpu_memory"] = value
        elif key == "enable_torch_compile":
            compile_config["enabled"] = value
        elif key == "torch_compile_kwargs":
            # Deep-copied so later mutation of the caller's dict cannot leak in.
            compile_config["kwargs"] = deepcopy(value)
        elif key in {"enable_stage_verification", "use_fsdp_inference", "disable_autocast"}:
            engine[key] = value
        elif key == "override_text_encoder_quant":
            quantization["text_encoder_quant"] = value
        elif key == "transformer_quant":
            quantization["transformer_quant"] = value
        elif key == "workload_type":
            pipeline["workload_type"] = value
        elif key == "lora_path":
            components["lora_path"] = value
        elif key == "override_pipeline_cls_name":
            components["override_pipeline_cls_name"] = value
        elif key == "override_transformer_cls_name":
            components["override_transformer_cls_name"] = value
        elif key == "pipeline_config":
            # A string is a path to a pipeline config; anything else (e.g. an
            # in-memory config object) is stashed under experimental.
            if isinstance(value, str):
                components["pipeline_config_path"] = value
            else:
                experimental[key] = deepcopy(value)
        elif key == "override_text_encoder_safetensors":
            components["text_encoder_weights"] = value
        elif key == "init_weights_from_safetensors":
            components["transformer_weights"] = value
        elif key == "init_weights_from_safetensors_2":
            components["transformer_2_weights"] = value
        else:
            # Unknown legacy key: preserve under pipeline.experimental.
            experimental[key] = deepcopy(value)

    # Attach only the sections that actually received values.
    if parallelism:
        engine["parallelism"] = parallelism
    if offload:
        engine["offload"] = offload
    if compile_config:
        engine["compile"] = compile_config
    if quantization:
        engine["quantization"] = quantization
    if engine:
        raw["engine"] = engine

    if components:
        pipeline["components"] = components
    if experimental:
        pipeline["experimental"] = experimental
    if pipeline:
        raw["pipeline"] = pipeline

    return parse_config(GeneratorConfig, raw)
153
+
154
+
155
def generator_config_to_fastvideo_args(config: GeneratorConfig | Mapping[str, Any], ) -> FastVideoArgs:
    """Convert a GeneratorConfig (or its raw mapping) into FastVideoArgs.

    Fields the compatibility adapter cannot express in FastVideoArgs are
    rejected with NotImplementedError rather than silently dropped.

    Args:
        config: a GeneratorConfig instance or a raw mapping parseable as one.

    Returns:
        FastVideoArgs built via ``FastVideoArgs.from_kwargs``.

    Raises:
        NotImplementedError: if any unsupported field is set (listed below).
    """
    normalized = normalize_generator_config(config)
    # Collect every unsupported-but-set field so the error names all of them
    # at once instead of failing one at a time.
    unsupported = []
    if normalized.pipeline.profile is not None:
        unsupported.append("pipeline.profile")
    if normalized.pipeline.profile_version is not None:
        unsupported.append("pipeline.profile_version")
    if normalized.pipeline.components.config_root is not None:
        unsupported.append("pipeline.components.config_root")
    if normalized.pipeline.components.vae_weights is not None:
        unsupported.append("pipeline.components.vae_weights")
    if normalized.pipeline.components.upsampler_weights is not None:
        unsupported.append("pipeline.components.upsampler_weights")
    if unsupported:
        joined = ", ".join(unsupported)
        raise NotImplementedError(f"VideoGenerator compatibility adapter does not support {joined} yet")

    engine = normalized.engine
    # Unconditional mappings: these FastVideoArgs kwargs always exist on the
    # structured config.
    kwargs: dict[str, Any] = {
        "model_path": normalized.model_path,
        "revision": normalized.revision,
        "trust_remote_code": normalized.trust_remote_code,
        "num_gpus": engine.num_gpus,
        "distributed_executor_backend": engine.execution_backend,
        "tp_size": engine.parallelism.tp_size,
        "sp_size": engine.parallelism.sp_size,
        "hsdp_replicate_dim": engine.parallelism.hsdp_replicate_dim,
        "hsdp_shard_dim": engine.parallelism.hsdp_shard_dim,
        "dist_timeout": engine.parallelism.dist_timeout,
        "dit_cpu_offload": engine.offload.dit,
        "dit_layerwise_offload": engine.offload.dit_layerwise,
        "text_encoder_cpu_offload": engine.offload.text_encoder,
        "image_encoder_cpu_offload": engine.offload.image_encoder,
        "vae_cpu_offload": engine.offload.vae,
        "pin_cpu_memory": engine.offload.pin_cpu_memory,
        "enable_torch_compile": engine.compile.enabled,
        # Deep copy so FastVideoArgs cannot mutate the config's dict.
        "torch_compile_kwargs": deepcopy(engine.compile.kwargs),
        "enable_stage_verification": engine.enable_stage_verification,
        "use_fsdp_inference": engine.use_fsdp_inference,
        "disable_autocast": engine.disable_autocast,
    }
    if normalized.pipeline.workload_type is not None:
        kwargs["workload_type"] = normalized.pipeline.workload_type

    # Optional quantization overrides.
    quantization = engine.quantization
    if quantization is not None and quantization.text_encoder_quant is not None:
        kwargs["override_text_encoder_quant"] = quantization.text_encoder_quant
    if quantization is not None and quantization.transformer_quant is not None:
        kwargs["transformer_quant"] = quantization.transformer_quant

    # Optional component overrides, mapped back to their legacy kwarg names.
    components = normalized.pipeline.components
    if components.pipeline_config_path is not None:
        kwargs["pipeline_config"] = components.pipeline_config_path
    if components.lora_path is not None:
        kwargs["lora_path"] = components.lora_path
    if components.override_pipeline_cls_name is not None:
        kwargs["override_pipeline_cls_name"] = components.override_pipeline_cls_name
    if components.override_transformer_cls_name is not None:
        kwargs["override_transformer_cls_name"] = components.override_transformer_cls_name
    if components.text_encoder_weights is not None:
        kwargs["override_text_encoder_safetensors"] = components.text_encoder_weights
    if components.transformer_weights is not None:
        kwargs["init_weights_from_safetensors"] = components.transformer_weights
    if components.transformer_2_weights is not None:
        kwargs["init_weights_from_safetensors_2"] = components.transformer_2_weights

    # Pass-through escape hatches; applied last so they win over the mapped
    # fields above.
    kwargs.update(deepcopy(normalized.pipeline.profile_overrides))
    kwargs.update(deepcopy(normalized.pipeline.experimental))
    return FastVideoArgs.from_kwargs(**kwargs)
224
+
225
+
226
def normalize_generation_request(request: GenerationRequest | Mapping[str, Any], ) -> GenerationRequest:
    """Coerce *request* into a GenerationRequest and stamp it with a snapshot
    of its explicitly-provided fields (used later to distinguish user-set
    values from defaults)."""
    if isinstance(request, GenerationRequest):
        normalized = request
    else:
        normalized = parse_config(GenerationRequest, request)

    if not hasattr(normalized, _EXPLICIT_REQUEST_ATTR):
        snapshot = _serialize_generation_request(normalized)
        setattr(normalized, _EXPLICIT_REQUEST_ATTR, snapshot)
    return normalized
232
+
233
+
234
def legacy_generate_call_to_request(
    prompt: str | None,
    sampling_param: SamplingParam | None,
    *,
    mouse_cond: Any | None = None,
    keyboard_cond: Any | None = None,
    grid_sizes: Any | None = None,
    legacy_kwargs: Mapping[str, Any] | None = None,
) -> GenerationRequest:
    """Build a GenerationRequest from a legacy ``generate_video`` call.

    The sampling param is serialized first, then legacy kwargs are routed to
    their schema sections, and the optional conditioning tensors are attached
    under ``inputs``. The raw mapping is recorded on the request as the
    explicit-field snapshot.
    """
    raw = _sampling_param_to_request_raw(sampling_param)
    if prompt is not None:
        raw["prompt"] = prompt

    for key, value in (legacy_kwargs or {}).items():
        _apply_request_field(raw, key, value)

    # Optional conditioning inputs: only attach what the caller supplied.
    conditioning = (
        ("mouse_cond", mouse_cond),
        ("keyboard_cond", keyboard_cond),
        ("grid_sizes", grid_sizes),
    )
    for name, cond in conditioning:
        if cond is not None:
            raw.setdefault("inputs", {})[name] = cond

    normalized = parse_config(GenerationRequest, raw)
    setattr(normalized, _EXPLICIT_REQUEST_ATTR, deepcopy(raw))
    return normalized
260
+
261
+
262
def request_to_sampling_param(
    request: GenerationRequest,
    *,
    model_path: str,
) -> SamplingParam:
    """Materialize a SamplingParam for *model_path* from a request's
    explicitly-set fields.

    Raises:
        NotImplementedError: if the request carries a plan or state, which
            this adapter cannot forward.
        ValueError: if an explicitly-set field is neither a sampling-param
            attribute, a pipeline-level override, nor a restated default.
    """
    if request.plan is not None:
        raise NotImplementedError("GenerationRequest.plan is not wired into VideoGenerator yet")
    if request.state is not None:
        raise NotImplementedError("GenerationRequest.state is not wired into VideoGenerator yet")

    sampling_param = SamplingParam.from_pretrained(model_path)

    for key, value in _explicit_request_updates(request).items():
        if hasattr(sampling_param, key):
            setattr(sampling_param, key, deepcopy(value))
            continue
        # Pipeline-level overrides and fields that merely restate defaults
        # are handled elsewhere / harmless, so they are skipped here.
        if key in _REQUEST_PIPELINE_OVERRIDE_FIELDS or _is_supported_as_default_only(key, value):
            continue
        raise ValueError(f"Request field {key!r} is not supported by sampling params for {model_path}")

    sampling_param.__post_init__()
    sampling_param.check_sampling_param()
    return sampling_param
286
+
287
+
288
def expand_request_prompt_batch(request: GenerationRequest, ) -> list[GenerationRequest]:
    """Fan a batched request (list-valued ``prompt``) out into one request
    per prompt, slicing batched image/video inputs to match.

    A request whose prompt is not a list is returned unchanged, as a
    singleton list.
    """
    prompts = request.prompt
    if not isinstance(prompts, list):
        return [request]

    expanded: list[GenerationRequest] = []
    for index, single_prompt in enumerate(prompts):
        clone = deepcopy(request)
        clone.prompt = single_prompt
        for field_name in ("image_path", "video_path"):
            _fan_out_batched_input_value(request, clone, field_name, index)
        _fan_out_explicit_request_metadata(request, clone, index, single_prompt)
        expanded.append(clone)
    return expanded
301
+
302
+
303
+ def _looks_like_run_or_serve_config(raw: Mapping[str, Any]) -> bool:
304
+ return isinstance(raw.get("generator"), Mapping)
305
+
306
+
307
+ def _normalize_overrides(overrides: list[str] | Mapping[str, Any] | None, ) -> dict[str, Any] | None:
308
+ if not overrides:
309
+ return None
310
+ if isinstance(overrides, list):
311
+ return parse_cli_overrides(overrides)
312
+ return dict(overrides)
313
+
314
+
315
+ def _sampling_param_to_request_raw(sampling_param: SamplingParam | None, ) -> dict[str, Any]:
316
+ if sampling_param is None:
317
+ return {}
318
+
319
+ raw: dict[str, Any] = {}
320
+ for key, value in shallow_asdict(sampling_param).items():
321
+ if key == "prompt":
322
+ continue
323
+ _apply_request_field(raw, key, deepcopy(value))
324
+ return raw
325
+
326
+
327
def _apply_request_field(
    raw: dict[str, Any],
    key: str,
    value: Any,
) -> None:
    """Place one (key, value) pair into the correct section of a raw
    GenerationRequest mapping.

    Legacy aliases are resolved first; ``negative_prompt`` stays top-level;
    known fields go to their declared section; everything else lands in
    ``extensions``.
    """
    key = _LEGACY_REQUEST_ALIASES.get(key, key)
    if key == "negative_prompt":
        raw["negative_prompt"] = value
        return

    # Sections are checked in declaration order; field-name sets are disjoint
    # enough that the first match wins.
    sections = (
        ("inputs", _INPUT_FIELD_NAMES),
        ("sampling", _SAMPLING_FIELD_NAMES),
        ("runtime", _RUNTIME_FIELD_NAMES),
        ("output", _OUTPUT_FIELD_NAMES),
    )
    for section_name, field_names in sections:
        if key in field_names:
            raw.setdefault(section_name, {})[key] = value
            return

    raw.setdefault("extensions", {})[key] = value
349
+
350
+
351
def request_to_pipeline_overrides(request: GenerationRequest) -> dict[str, Any]:
    """Extract the explicitly-set request fields that must be applied as
    pipeline-level overrides rather than sampling params."""
    return {
        key: deepcopy(value)
        for key, value in _explicit_request_updates(request).items()
        if key in _REQUEST_PIPELINE_OVERRIDE_FIELDS
    }
357
+
358
+
359
def _explicit_request_updates(request: GenerationRequest) -> dict[str, Any]:
    """Return the flattened explicitly-set fields of *request*, falling back
    to a fresh serialization when no explicit snapshot was recorded."""
    snapshot = getattr(request, _EXPLICIT_REQUEST_ATTR, None)
    source = _serialize_generation_request(request) if snapshot is None else snapshot
    return _extract_request_updates(source)
365
+
366
+
367
+ def _extract_request_updates(raw: Mapping[str, Any]) -> dict[str, Any]:
368
+ updates: dict[str, Any] = {}
369
+ if "negative_prompt" in raw:
370
+ updates["negative_prompt"] = deepcopy(raw["negative_prompt"])
371
+
372
+ for section_name in ("inputs", "sampling", "runtime", "output"):
373
+ section = raw.get(section_name)
374
+ if not isinstance(section, Mapping):
375
+ continue
376
+ for key, value in section.items():
377
+ updates[key] = deepcopy(value)
378
+
379
+ stage_overrides = raw.get("stage_overrides")
380
+ if stage_overrides:
381
+ updates.update(_flatten_stage_overrides(stage_overrides))
382
+
383
+ extensions = raw.get("extensions")
384
+ if isinstance(extensions, Mapping):
385
+ for key, value in extensions.items():
386
+ updates[key] = deepcopy(value)
387
+
388
+ return updates
389
+
390
+
391
+ def _flatten_stage_overrides(stage_overrides: Any) -> dict[str, Any]:
392
+ if not isinstance(stage_overrides, Mapping):
393
+ raise ValueError("GenerationRequest.stage_overrides must be a mapping")
394
+
395
+ flattened: dict[str, Any] = {}
396
+ for stage_name, overrides in stage_overrides.items():
397
+ if not isinstance(overrides, Mapping):
398
+ raise ValueError(f"GenerationRequest.stage_overrides.{stage_name} must be a mapping")
399
+ for key, value in overrides.items():
400
+ if key in flattened and flattened[key] != value:
401
+ raise ValueError(f"Conflicting stage override for {key!r} across stages")
402
+ flattened[key] = deepcopy(value)
403
+ return flattened
404
+
405
+
406
def _serialize_generation_request(request: GenerationRequest) -> dict[str, Any]:
    """Snapshot *request* as a plain dict, fully decoupled from the original."""
    serialized = config_to_dict(request)
    return deepcopy(serialized)
408
+
409
+
410
+ def _fan_out_batched_input_value(
411
+ source_request: GenerationRequest,
412
+ target_request: GenerationRequest,
413
+ field_name: str,
414
+ index: int,
415
+ ) -> None:
416
+ value = getattr(source_request.inputs, field_name)
417
+ if not isinstance(value, list):
418
+ return
419
+ _validate_batched_input_length(source_request.prompt, value, field_name)
420
+ setattr(target_request.inputs, field_name, deepcopy(value[index]))
421
+
422
+
423
def _fan_out_explicit_request_metadata(
    source_request: GenerationRequest,
    target_request: GenerationRequest,
    index: int,
    prompt: str,
) -> None:
    """Rewrite the explicit-field snapshot of *target_request* for one entry
    of a prompt batch: fix the prompt and slice batched image/video inputs.

    A source request without a snapshot is left untouched.
    """
    snapshot = getattr(source_request, _EXPLICIT_REQUEST_ATTR, None)
    if snapshot is None:
        return

    snapshot = deepcopy(snapshot)
    snapshot["prompt"] = prompt
    inputs_section = snapshot.get("inputs")
    if isinstance(inputs_section, dict):
        for field_name in ("image_path", "video_path"):
            batched = inputs_section.get(field_name)
            if isinstance(batched, list):
                _validate_batched_input_length(source_request.prompt, batched, field_name)
                inputs_section[field_name] = deepcopy(batched[index])

    setattr(target_request, _EXPLICIT_REQUEST_ATTR, snapshot)
444
+
445
+
446
+ def _validate_batched_input_length(
447
+ prompts: str | list[str] | None,
448
+ values: list[Any],
449
+ field_name: str,
450
+ ) -> None:
451
+ if not isinstance(prompts, list):
452
+ return
453
+ if len(values) != len(prompts):
454
+ raise ValueError(f"GenerationRequest.inputs.{field_name} must have the same length as request.prompt")
455
+
456
+
457
def _is_supported_as_default_only(key: str, value: Any) -> bool:
    """True when *key* is a known request field and *value* merely restates
    its default — such fields are safe to ignore."""
    default_value = _DEFAULT_REQUEST_UPDATES.get(key, _MISSING)
    if default_value is _MISSING:
        return False
    return _values_equal(value, default_value)
460
+
461
+
462
+ def _collect_non_default_fields(
463
+ value: Any,
464
+ default: Any,
465
+ ) -> dict[str, Any]:
466
+ if not (is_dataclass(value) and is_dataclass(default)):
467
+ return {}
468
+
469
+ result: dict[str, Any] = {}
470
+ for field in fields(value):
471
+ current = getattr(value, field.name)
472
+ default_value = getattr(default, field.name)
473
+ if is_dataclass(current) and is_dataclass(default_value):
474
+ nested = _collect_non_default_fields(current, default_value)
475
+ if nested:
476
+ result[field.name] = nested
477
+ continue
478
+ if not _values_equal(current, default_value):
479
+ result[field.name] = deepcopy(current)
480
+ return result
481
+
482
+
483
+ def _values_equal(left: Any, right: Any) -> bool:
484
+ if left is right:
485
+ return True
486
+ try:
487
+ return bool(left == right)
488
+ except Exception:
489
+ return False
490
+
491
+
492
# Baseline of request-level field values produced by a default-constructed
# GenerationRequest; _is_supported_as_default_only() compares against this so
# request fields that merely restate defaults can be silently accepted.
_DEFAULT_REQUEST_UPDATES = _extract_request_updates(config_to_dict(GenerationRequest()))

# Public API of this compatibility-adapter module.
__all__ = [
    "generator_config_to_fastvideo_args",
    "legacy_from_pretrained_to_config",
    "legacy_generate_call_to_request",
    "load_generator_config_from_file",
    "normalize_generation_request",
    "normalize_generator_config",
    "request_to_pipeline_overrides",
    "request_to_sampling_param",
]
standalone_inference/overlay_files/fastvideo/attention/backends/sparse_fp4_ours_p_attn.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Sparse FP4 Attention backend with the independent ours-P quant kernel."""
3
+
4
+ import math
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import triton
9
+
10
+ from fastvideo_kernel.triton_kernels.quant_utils import (
11
+ fake_quantize_q,
12
+ fake_quantize_kv,
13
+ )
14
+ from fastvideo_kernel.block_sparse_attn_ours_p import block_sparse_attn_ours_p
15
+ from fastvideo.forward_context import get_forward_context
16
+
17
+ from fastvideo.attention.backends.abstract import (
18
+ AttentionBackend, AttentionImpl, AttentionMetadata, AttentionMetadataBuilder,
19
+ )
20
+ from fastvideo.attention.backends.video_sparse_attn import (
21
+ VideoSparseAttentionMetadata,
22
+ VideoSparseAttentionMetadataBuilder,
23
+ VSA_TILE_SIZE,
24
+ )
25
+ from fastvideo.distributed import get_sp_group
26
+ from fastvideo.logger import init_logger
27
+
28
+ logger = init_logger(__name__)
29
+
30
+
31
+ def _dense_sdpa_blhd(query, key, value):
32
+ q = query.transpose(1, 2)
33
+ k = key.transpose(1, 2)
34
+ v = value.transpose(1, 2)
35
+ out = F.scaled_dot_product_attention(q, k, v, is_causal=False)
36
+ return out.transpose(1, 2)
37
+
38
+
39
+ def _quantize_qkv_bhld(q, k, v):
40
+ """FP4 fake quantize Q/K/V in BHLD layout, same as attn_qat_train."""
41
+ H = q.shape[1]
42
+ N_Q = q.shape[2]
43
+ N_KV = k.shape[2]
44
+ D = q.shape[3]
45
+ BLOCK = 32
46
+
47
+ fake_q = torch.empty_like(q)
48
+ fake_k = torch.empty_like(k)
49
+ fake_v = torch.empty_like(v)
50
+
51
+ grid_q = (triton.cdiv(N_Q, BLOCK), q.shape[0] * H, 1)
52
+ grid_kv = (triton.cdiv(N_KV, BLOCK), q.shape[0] * H, 1)
53
+
54
+ fake_quantize_q[grid_q](
55
+ q, fake_q,
56
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
57
+ fake_q.stride(0), fake_q.stride(1), fake_q.stride(2), fake_q.stride(3),
58
+ H, N_Q, BLOCK_M=BLOCK, HEAD_DIM=D, use_global_sf=False,
59
+ )
60
+ fake_quantize_kv[grid_kv](
61
+ k, v, fake_k, fake_v,
62
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
63
+ fake_k.stride(0), fake_k.stride(1), fake_k.stride(2), fake_k.stride(3),
64
+ H, N_KV, BLOCK_N=BLOCK, HEAD_DIM=D, use_global_sf=False,
65
+ )
66
+ return fake_q, fake_k, fake_v
67
+
68
+
69
+ class SparseFP4OursPAttentionBackend(AttentionBackend):
70
+ accept_output_buffer: bool = True
71
+
72
+ @staticmethod
73
+ def get_supported_head_sizes() -> list[int]:
74
+ return [64, 96, 128, 160, 192, 224, 256]
75
+
76
+ @staticmethod
77
+ def get_name() -> str:
78
+ return "SPARSE_FP4_OURS_P_ATTN"
79
+
80
+ @staticmethod
81
+ def get_impl_cls() -> type["SparseFP4OursPAttentionImpl"]:
82
+ return SparseFP4OursPAttentionImpl
83
+
84
+ @staticmethod
85
+ def get_metadata_cls() -> type["VideoSparseAttentionMetadata"]:
86
+ return VideoSparseAttentionMetadata
87
+
88
+ @staticmethod
89
+ def get_builder_cls() -> type["VideoSparseAttentionMetadataBuilder"]:
90
+ return VideoSparseAttentionMetadataBuilder
91
+
92
+
93
+ class SparseFP4OursPAttentionImpl(AttentionImpl):
94
+
95
+ def __init__(self, num_heads, head_size, causal, softmax_scale,
96
+ num_kv_heads=None, prefix="", **extra):
97
+ self.prefix = prefix
98
+ self.sp_size = get_sp_group().world_size
99
+
100
+ def tile(self, x, num_tiles, tile_partition_indices, non_pad_index):
101
+ t_p = num_tiles[0] * VSA_TILE_SIZE[0]
102
+ h_p = num_tiles[1] * VSA_TILE_SIZE[1]
103
+ w_p = num_tiles[2] * VSA_TILE_SIZE[2]
104
+ out = torch.zeros(
105
+ (x.shape[0], t_p * h_p * w_p, x.shape[-2], x.shape[-1]),
106
+ device=x.device, dtype=x.dtype,
107
+ )
108
+ out[:, non_pad_index] = x[:, tile_partition_indices]
109
+ return out
110
+
111
+ def untile(self, x, reverse_tile_partition_indices, non_pad_index):
112
+ return x[:, non_pad_index][:, reverse_tile_partition_indices]
113
+
114
+ def _is_force_dense(self) -> bool:
115
+ ctx = get_forward_context()
116
+ return ctx.force_dense
117
+
118
+ def preprocess_qkv(self, qkv, attn_metadata):
119
+ if attn_metadata is None or self._is_force_dense():
120
+ return qkv
121
+ return self.tile(qkv, attn_metadata.num_tiles,
122
+ attn_metadata.tile_partition_indices,
123
+ attn_metadata.non_pad_index)
124
+
125
+ def postprocess_output(self, output, attn_metadata):
126
+ if attn_metadata is None or self._is_force_dense():
127
+ return output
128
+ return self.untile(output,
129
+ attn_metadata.reverse_tile_partition_indices,
130
+ attn_metadata.non_pad_index)
131
+
132
+ def forward(self, query, key, value,
133
+ gate_compress_or_metadata=None, attn_metadata=None):
134
+ # Handle both call conventions
135
+ if attn_metadata is None and isinstance(
136
+ gate_compress_or_metadata, (VideoSparseAttentionMetadata, type(None))):
137
+ attn_metadata = gate_compress_or_metadata
138
+
139
+ # ── force_dense: true dense BF16 SDPA (for teacher in distillation) ──
140
+ ctx = get_forward_context()
141
+ if ctx.force_dense:
142
+ return _dense_sdpa_blhd(query, key, value)
143
+
144
+ is_cross = query.shape[1] != key.shape[1]
145
+
146
+ # ── Cross-attention/no metadata: keep dense. The sparse VSA metadata only
147
+ # applies to tiled video self-attention.
148
+ if attn_metadata is None or is_cross:
149
+ return _dense_sdpa_blhd(query, key, value)
150
+
151
+ # ── Self-attention: FP4 quant Q/K/V + block-sparse attention ──
152
+ # BLHD → BHLD
153
+ q = query.transpose(1, 2).contiguous()
154
+ k = key.transpose(1, 2).contiguous()
155
+ v = value.transpose(1, 2).contiguous()
156
+
157
+ # Step 1: FP4 fake quantize Q/K/V with STE (straight-through estimator)
158
+ with torch.no_grad():
159
+ fq, fk, fv = _quantize_qkv_bhld(q, k, v)
160
+ # STE: forward uses quantized values, backward passes gradient through as-is
161
+ fq = q + (fq - q).detach()
162
+ fk = k + (fk - k).detach()
163
+ fv = v + (fv - v).detach()
164
+
165
+ # Step 2: Build sparse block map
166
+ B, H, S, D = fq.shape
167
+ block_elements = math.prod(VSA_TILE_SIZE)
168
+ num_blocks = S // block_elements
169
+
170
+ VSA_sparsity = attn_metadata.VSA_sparsity
171
+ cur_topk = max(1, math.ceil((1 - VSA_sparsity) * num_blocks))
172
+ logger.info(f"[SFP4] S={S} num_blocks={num_blocks} sparsity={VSA_sparsity} topk={cur_topk}/{num_blocks}")
173
+
174
+ block_sizes = attn_metadata.variable_block_sizes.to(
175
+ device=fq.device, dtype=torch.float32).clamp_min(1)
176
+ block_sizes = block_sizes.view(1, 1, num_blocks, 1)
177
+ q_c = (fq.view(B, H, num_blocks, block_elements, D).float().sum(3) /
178
+ block_sizes).to(fq.dtype)
179
+ k_c = (fk.view(B, H, num_blocks, block_elements, D).float().sum(3) /
180
+ block_sizes).to(fk.dtype)
181
+ v_c = (fv.view(B, H, num_blocks, block_elements, D).float().sum(3) /
182
+ block_sizes).to(fv.dtype)
183
+ scores = torch.matmul(q_c, k_c.transpose(-2, -1)) / (D ** 0.5)
184
+ topk_idx = torch.topk(scores, cur_topk, dim=-1).indices
185
+ block_map = torch.zeros_like(scores, dtype=torch.bool).scatter_(-1, topk_idx, True)
186
+
187
+ # Step 3: Block-sparse attention with independent group-local P quant.
188
+ out, _ = block_sparse_attn_ours_p(fq, fk, fv, block_map,
189
+ attn_metadata.variable_block_sizes,
190
+ q_c, k_c, v_c)
191
+
192
+ return out.transpose(1, 2) # BHLD → BLHD
standalone_inference/overlay_files/fastvideo/attention/backends/video_sparse_attn.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ import functools
3
+ import math
4
+ from dataclasses import dataclass
5
+
6
+ import torch
7
+
8
+ try:
9
+ from fastvideo_kernel import video_sparse_attn
10
+ except ImportError:
11
+ video_sparse_attn = None
12
+
13
+ from typing import Any
14
+
15
+ from fastvideo.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata,
16
+ AttentionMetadataBuilder)
17
+ from fastvideo.distributed import get_sp_group
18
+ from fastvideo.logger import init_logger
19
+
20
+ logger = init_logger(__name__)
21
+ VSA_TILE_SIZE = (4, 4, 4)
22
+
23
+
24
@functools.lru_cache(maxsize=10)
def get_tile_partition_indices(
    dit_seq_shape: tuple[int, int, int],
    tile_size: tuple[int, int, int],
    device: torch.device,
) -> torch.LongTensor:
    """Flat token permutation that groups a (T, H, W) grid into tiles.

    Tiles are enumerated in (t, h, w) order; edge tiles may be partial
    (tensor slicing clamps automatically). Cached per (shape, tile, device);
    callers must treat the result as read-only.
    """
    T, H, W = dit_seq_shape
    ts, hs, ws = tile_size
    grid = torch.arange(T * H * W, device=device, dtype=torch.long).reshape(T, H, W)
    chunks = [
        grid[t0:t0 + ts, h0:h0 + hs, w0:w0 + ws].flatten()
        for t0 in range(0, T, ts)
        for h0 in range(0, H, hs)
        for w0 in range(0, W, ws)
    ]
    return torch.cat(chunks, dim=0)
41
+
42
+
43
@functools.lru_cache(maxsize=10)
def get_reverse_tile_partition_indices(
    dit_seq_shape: tuple[int, int, int],
    tile_size: tuple[int, int, int],
    device: torch.device,
) -> torch.LongTensor:
    """Inverse permutation of :func:`get_tile_partition_indices`."""
    forward_order = get_tile_partition_indices(dit_seq_shape, tile_size, device)
    return torch.argsort(forward_order)
50
+
51
+
52
@functools.lru_cache(maxsize=10)
def construct_variable_block_sizes(
    dit_seq_shape: tuple[int, int, int],
    num_tiles: tuple[int, int, int],
    device: torch.device,
) -> torch.LongTensor:
    """Count the valid (non-padded) tokens inside every VSA tile.

    Tiles are flattened in (t-tile, h-tile, w-tile) order, matching the
    layout produced by the tile partition indices.

    Returns
    -------
    torch.Tensor  # shape: [n_t * n_h * n_w], dtype torch.int
    """

    def _axis_sizes(dim_len: int, tile: int, n_tiles: int) -> torch.Tensor:
        # Every tile is full-sized except possibly the trailing one.
        sizes = torch.full((n_tiles, ), tile, dtype=torch.int, device=device)
        tail = dim_len - (n_tiles - 1) * tile
        sizes[-1] = tail if tail > 0 else tile
        return sizes

    t_sizes, h_sizes, w_sizes = (
        _axis_sizes(dim_len, tile, n)
        for dim_len, tile, n in zip(dit_seq_shape, VSA_TILE_SIZE, num_tiles)
    )

    # Outer-product of the per-axis tile sizes gives voxels per tile.
    per_tile = (t_sizes[:, None, None]
                * h_sizes[None, :, None]
                * w_sizes[None, None, :])
    return per_tile.reshape(-1)
92
+
93
+
94
@functools.lru_cache(maxsize=10)
def get_non_pad_index(
    variable_block_sizes: torch.LongTensor,
    max_block_size: int,
):
    """Flat indices of real tokens in a padded (n_blocks * max_block_size) layout.

    Cached by tensor identity (``lru_cache`` hashes tensors by id); this is
    safe here because callers pass the cached output of
    ``construct_variable_block_sizes``.
    """
    n_blocks = variable_block_sizes.shape[0]
    device = variable_block_sizes.device
    offsets = torch.arange(max_block_size, device=device)
    block_starts = torch.arange(n_blocks, device=device) * max_block_size
    padded_index = block_starts[:, None] + offsets[None, :]
    valid = offsets[None, :] < variable_block_sizes[:, None]
    return padded_index[valid]
105
+
106
+
107
class VideoSparseAttentionBackend(AttentionBackend):
    """Registry entry for the VSA (video sparse attention) backend."""

    accept_output_buffer: bool = True

    @staticmethod
    def get_supported_head_sizes() -> list[int]:
        # Head dims supported by the VSA kernel.
        return [64, 128]

    @staticmethod
    def get_name() -> str:
        return "VIDEO_SPARSE_ATTN"

    @staticmethod
    def get_impl_cls() -> type["VideoSparseAttentionImpl"]:
        return VideoSparseAttentionImpl

    @staticmethod
    def get_metadata_cls() -> type["VideoSparseAttentionMetadata"]:
        return VideoSparseAttentionMetadata

    @staticmethod
    def get_builder_cls() -> type["VideoSparseAttentionMetadataBuilder"]:
        return VideoSparseAttentionMetadataBuilder
130
+
131
+
132
@dataclass
class VideoSparseAttentionMetadata(AttentionMetadata):
    """Per-denoising-step metadata consumed by the VSA attention impls."""

    # Denoising timestep this metadata was built for.
    current_timestep: int
    # Patchified latent grid (T, H, W).
    dit_seq_shape: list[int]
    # Number of VSA tiles along each axis.
    num_tiles: list[int]
    # prod(dit_seq_shape): token count before tile padding.
    total_seq_length: int
    # Permutation mapping raster token order -> tile order.
    tile_partition_indices: torch.LongTensor
    # Inverse permutation of tile_partition_indices.
    reverse_tile_partition_indices: torch.LongTensor
    # Valid (non-padded) token count per tile.
    variable_block_sizes: torch.LongTensor
    # Flat indices of real tokens within the padded tile layout.
    non_pad_index: torch.LongTensor
    # NOTE(review): the builder also passes VSA_sparsity=...; it is not
    # declared here, so it presumably lives on the AttentionMetadata base
    # class — confirm, otherwise construction raises TypeError.
142
+
143
+
144
class VideoSparseAttentionMetadataBuilder(AttentionMetadataBuilder):
    """Builds per-step :class:`VideoSparseAttentionMetadata` from the latent shape."""

    def __init__(self) -> None:
        pass

    def prepare(self) -> None:
        # No per-iteration state to reset.
        pass

    def build(  # type: ignore
        self,
        current_timestep: int,
        raw_latent_shape: tuple[int, int, int],
        patch_size: tuple[int, int, int],
        VSA_sparsity: float,
        device: torch.device,
        **kwargs: dict[str, Any],
    ) -> VideoSparseAttentionMetadata:
        """Derive tile layout and index permutations for the patchified grid.

        Args:
            current_timestep: Denoising step index, stored on the metadata.
            raw_latent_shape: (T, H, W) latent grid before patchification.
            patch_size: DiT patch size per axis (integer division is applied).
            VSA_sparsity: Fraction of key tiles to drop; forwarded to metadata.
            device: Device on which all index tensors are created.
        """
        # (Fixed: removed a dead no-op `patch_size = patch_size` statement.)
        dit_seq_shape = (raw_latent_shape[0] // patch_size[0],
                         raw_latent_shape[1] // patch_size[1],
                         raw_latent_shape[2] // patch_size[2])

        num_tiles = (math.ceil(dit_seq_shape[0] / VSA_TILE_SIZE[0]),
                     math.ceil(dit_seq_shape[1] / VSA_TILE_SIZE[1]),
                     math.ceil(dit_seq_shape[2] / VSA_TILE_SIZE[2]))
        total_seq_length = math.prod(dit_seq_shape)

        tile_partition_indices = get_tile_partition_indices(dit_seq_shape, VSA_TILE_SIZE, device)
        reverse_tile_partition_indices = get_reverse_tile_partition_indices(dit_seq_shape, VSA_TILE_SIZE, device)
        variable_block_sizes = construct_variable_block_sizes(dit_seq_shape, num_tiles, device)
        non_pad_index = get_non_pad_index(variable_block_sizes, math.prod(VSA_TILE_SIZE))

        # NOTE(review): VSA_sparsity is not a declared field on the local
        # VideoSparseAttentionMetadata dataclass; this relies on the base
        # AttentionMetadata accepting it — confirm.
        return VideoSparseAttentionMetadata(
            current_timestep=current_timestep,
            dit_seq_shape=dit_seq_shape,  # type: ignore
            VSA_sparsity=VSA_sparsity,  # type: ignore
            num_tiles=num_tiles,  # type: ignore
            total_seq_length=total_seq_length,  # type: ignore
            tile_partition_indices=tile_partition_indices,  # type: ignore
            reverse_tile_partition_indices=reverse_tile_partition_indices,
            variable_block_sizes=variable_block_sizes,
            non_pad_index=non_pad_index)
184
+
185
+
186
class VideoSparseAttentionImpl(AttentionImpl):
    """Attention impl that dispatches to the fastvideo_kernel VSA kernel.

    Inputs/outputs use the (B, L, H, D) layout; the kernel call works in
    (B, H, L, D).
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        causal: bool,
        softmax_scale: float,
        num_kv_heads: int | None = None,
        prefix: str = "",
        **extra_impl_args,
    ) -> None:
        # Constructor args beyond prefix are accepted for interface
        # compatibility but not stored by this impl.
        self.prefix = prefix
        sp_group = get_sp_group()
        self.sp_size = sp_group.world_size

    def tile(self, x: torch.Tensor, num_tiles: list[int], tile_partition_indices: torch.LongTensor,
             non_pad_index: torch.LongTensor) -> torch.Tensor:
        """Scatter tokens into tile order, zero-padding partial tiles."""
        t_padded_size = num_tiles[0] * VSA_TILE_SIZE[0]
        h_padded_size = num_tiles[1] * VSA_TILE_SIZE[1]
        w_padded_size = num_tiles[2] * VSA_TILE_SIZE[2]

        x_padded = torch.zeros((x.shape[0], t_padded_size * h_padded_size * w_padded_size, x.shape[-2], x.shape[-1]),
                               device=x.device,
                               dtype=x.dtype)
        x_padded[:, non_pad_index] = x[:, tile_partition_indices]
        return x_padded

    def untile(self, x: torch.Tensor, reverse_tile_partition_indices: torch.LongTensor,
               non_pad_index: torch.LongTensor) -> torch.Tensor:
        """Inverse of tile(): drop padding slots and restore raster order."""
        x = x[:, non_pad_index][:, reverse_tile_partition_indices]
        return x

    def preprocess_qkv(
        self,
        qkv: torch.Tensor,
        attn_metadata: VideoSparseAttentionMetadata,
    ) -> torch.Tensor:
        # Reorder QKV into the padded tile layout the VSA kernel expects.
        return self.tile(qkv, attn_metadata.num_tiles, attn_metadata.tile_partition_indices,
                         attn_metadata.non_pad_index)

    def postprocess_output(
        self,
        output: torch.Tensor,
        attn_metadata: VideoSparseAttentionMetadata,
    ) -> torch.Tensor:
        # Undo the tiling applied in preprocess_qkv.
        return self.untile(output, attn_metadata.reverse_tile_partition_indices, attn_metadata.non_pad_index)

    def forward(  # type: ignore[override]
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        gate_compress: torch.Tensor,
        attn_metadata: VideoSparseAttentionMetadata,
    ) -> torch.Tensor:
        """Run video sparse attention; raises if the kernel is not installed."""
        # BLHD -> BHLD for the kernel.
        query = query.transpose(1, 2).contiguous()
        key = key.transpose(1, 2).contiguous()
        value = value.transpose(1, 2).contiguous()
        gate_compress = gate_compress.transpose(1, 2).contiguous()

        VSA_sparsity = attn_metadata.VSA_sparsity

        # Number of key tiles kept per query tile at the requested sparsity.
        cur_topk = math.ceil((1 - VSA_sparsity) * (attn_metadata.total_seq_length / math.prod(VSA_TILE_SIZE)))

        if video_sparse_attn is None:
            raise NotImplementedError("video_sparse_attn is not installed")
        hidden_states = video_sparse_attn(query,
                                          key,
                                          value,
                                          attn_metadata.variable_block_sizes,
                                          attn_metadata.variable_block_sizes,
                                          cur_topk,
                                          block_size=VSA_TILE_SIZE,
                                          compress_attn_weight=gate_compress).transpose(1, 2)

        return hidden_states
standalone_inference/overlay_files/fastvideo/configs/models/dits/base.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ from dataclasses import dataclass, field
3
+ from typing import Any
4
+
5
+ from fastvideo.configs.models.base import ArchConfig, ModelConfig
6
+ from fastvideo.layers.quantization import QuantizationConfig
7
+ from fastvideo.platforms import AttentionBackendEnum
8
+
9
+
10
@dataclass
class DiTArchConfig(ArchConfig):
    """Architecture-level hyper-parameters shared by all DiT variants."""

    # Predicates selecting which submodules are FSDP-sharded / compiled.
    _fsdp_shard_conditions: list = field(default_factory=list)
    _compile_conditions: list = field(default_factory=list)
    # Weight-name remapping tables for checkpoint load/save and LoRA.
    param_names_mapping: dict = field(default_factory=dict)
    reverse_param_names_mapping: dict = field(default_factory=dict)
    lora_param_names_mapping: dict = field(default_factory=dict)
    # Attention backends this architecture can run with.
    _supported_attention_backends: tuple[AttentionBackendEnum,
                                         ...] = (AttentionBackendEnum.SAGE_ATTN, AttentionBackendEnum.FLASH_ATTN,
                                                 AttentionBackendEnum.TORCH_SDPA,
                                                 AttentionBackendEnum.VIDEO_SPARSE_ATTN,
                                                 AttentionBackendEnum.VMOBA_ATTN, AttentionBackendEnum.SAGE_ATTN_THREE,
                                                 AttentionBackendEnum.ATTN_QAT_INFER,
                                                 AttentionBackendEnum.ATTN_QAT_TRAIN, AttentionBackendEnum.SLA_ATTN,
                                                 AttentionBackendEnum.SAGE_SLA_ATTN,
                                                 AttentionBackendEnum.SPARSE_FP4_ATTN,
                                                 AttentionBackendEnum.SPARSE_FP4_OURS_P_ATTN)

    hidden_size: int = 0
    num_attention_heads: int = 0
    num_channels_latents: int = 0
    in_channels: int | None = 0
    out_channels: int | None = 0
    patch_size: int | tuple[int, int, int] | None = None
    expand_timesteps: bool = False
    num_layers: int = 0
    ffn_dim: int = 0
    # Layer names excluded from LoRA adaptation.
    exclude_lora_layers: list[str] = field(default_factory=list)
    boundary_ratio: float | None = None

    def __post_init__(self) -> None:
        # Default the compile conditions to the FSDP shard conditions.
        if not self._compile_conditions:
            self._compile_conditions = self._fsdp_shard_conditions.copy()
43
+
44
+
45
@dataclass
class DiTConfig(ModelConfig):
    """Top-level DiT model configuration wrapping a :class:`DiTArchConfig`."""

    arch_config: DiTArchConfig = field(default_factory=DiTArchConfig)

    # FastVideoDiT-specific parameters
    prefix: str = ""
    quant_config: QuantizationConfig | None = None
    expand_timesteps: bool = False
    boundary_ratio: float | None = None

    def __post_init__(self) -> None:
        super().__post_init__()
        # Mirror model-level flags down onto the architecture config.
        self.arch_config.expand_timesteps = self.expand_timesteps
        self.arch_config.boundary_ratio = self.boundary_ratio

    @staticmethod
    def add_cli_args(parser: Any, prefix: str = "dit-config") -> Any:
        """Add CLI arguments for DiTConfig fields"""
        parser.add_argument(
            f"--{prefix}.prefix",
            type=str,
            dest=f"{prefix.replace('-', '_')}.prefix",
            default=DiTConfig.prefix,
            help="Prefix for the DiT model",
        )

        parser.add_argument(
            f"--{prefix}.quant-config",
            type=str,
            dest=f"{prefix.replace('-', '_')}.quant_config",
            default=None,
            help="Quantization configuration for the DiT model",
        )

        return parser
standalone_inference/overlay_files/fastvideo/configs/pipelines/wan.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ from collections.abc import Callable
3
+ from dataclasses import dataclass, field
4
+
5
+ import torch
6
+
7
+ from fastvideo.configs.models import DiTConfig, EncoderConfig, VAEConfig
8
+ from fastvideo.configs.models.dits import WanVideoConfig
9
+ from fastvideo.configs.models.dits.matrixgame import MatrixGameWanVideoConfig
10
+ from fastvideo.configs.models.encoders import (BaseEncoderOutput, CLIPVisionConfig, T5Config,
11
+ WAN2_1ControlCLIPVisionConfig)
12
+ from fastvideo.configs.models.vaes import WanVAEConfig
13
+ from fastvideo.configs.pipelines.base import PipelineConfig
14
+
15
+
16
+ def t5_postprocess_text(outputs: BaseEncoderOutput) -> torch.Tensor:
17
+ mask: torch.Tensor = outputs.attention_mask
18
+ hidden_state: torch.Tensor = outputs.last_hidden_state
19
+ seq_lens = mask.gt(0).sum(dim=1).long()
20
+ assert torch.isnan(hidden_state).sum() == 0
21
+ prompt_embeds = [u[:v] for u, v in zip(hidden_state, seq_lens, strict=True)]
22
+ prompt_embeds_tensor: torch.Tensor = torch.stack(
23
+ [torch.cat([u, u.new_zeros(512 - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0)
24
+ return prompt_embeds_tensor
25
+
26
+
27
@dataclass
class WanT2V480PConfig(PipelineConfig):
    """Base configuration for Wan T2V 1.3B pipeline architecture."""

    # WanConfig-specific parameters with defaults
    # DiT
    dit_config: DiTConfig = field(default_factory=WanVideoConfig)
    # VAE
    vae_config: VAEConfig = field(default_factory=WanVAEConfig)
    vae_tiling: bool = False
    vae_sp: bool = False

    # Denoising stage
    flow_shift: float | None = 3.0

    # Text encoding stage
    text_encoder_configs: tuple[EncoderConfig, ...] = field(default_factory=lambda: (T5Config(), ))
    postprocess_text_funcs: tuple[Callable[[BaseEncoderOutput], torch.Tensor],
                                  ...] = field(default_factory=lambda: (t5_postprocess_text, ))

    # Precision for each component
    precision: str = "bf16"
    vae_precision: str = "fp32"
    text_encoder_precisions: tuple[str, ...] = field(default_factory=lambda: ("fp32", ))

    # self-forcing params
    warp_denoising_step: bool = True

    # WanConfig-specific added parameters

    def __post_init__(self) -> None:
        # T2V only decodes latents; the VAE encoder is never needed.
        self.vae_config.load_encoder = False
        self.vae_config.load_decoder = True
60
+
61
+
62
@dataclass
class WanT2V720PConfig(WanT2V480PConfig):
    """Base configuration for Wan T2V 14B 720P pipeline architecture."""

    # WanConfig-specific parameters with defaults

    # Denoising stage: higher flow shift for the 720P model.
    flow_shift: float | None = 5.0
70
+
71
+
72
@dataclass
class WanI2V480PConfig(WanT2V480PConfig):
    """Base configuration for Wan I2V 14B 480P pipeline architecture."""

    # WanConfig-specific parameters with defaults

    # Precision for each component
    image_encoder_config: EncoderConfig = field(default_factory=CLIPVisionConfig)
    image_encoder_precision: str = "fp32"

    def __post_init__(self) -> None:
        # I2V encodes the conditioning image, so both VAE halves are loaded
        # (deliberately overriding the T2V parent, which disables the encoder).
        self.vae_config.load_encoder = True
        self.vae_config.load_decoder = True
85
+
86
+
87
@dataclass
class WanI2V720PConfig(WanI2V480PConfig):
    """Base configuration for Wan I2V 14B 720P pipeline architecture."""

    # WanConfig-specific parameters with defaults

    # Denoising stage: higher flow shift for the 720P model.
    flow_shift: float | None = 5.0
95
+
96
+
97
@dataclass
class WANV2VConfig(WanI2V480PConfig):
    """Configuration for WAN2.1 1.3B Control pipeline."""

    # Control variant uses the WAN2.1-control CLIP vision encoder.
    image_encoder_config: EncoderConfig = field(default_factory=WAN2_1ControlCLIPVisionConfig)
    # CLIP encoder precision
    image_encoder_precision: str = 'bf16'
104
+
105
+
106
@dataclass
class FastWan2_1_T2V_480P_Config(WanT2V480PConfig):
    """Base configuration for FastWan T2V 1.3B 480P pipeline architecture with DMD"""

    # WanConfig-specific parameters with defaults

    # Denoising stage: DMD distillation uses a fixed 3-step timestep schedule.
    flow_shift: float | None = 8.0
    dmd_denoising_steps: list[int] | None = field(default_factory=lambda: [1000, 757, 522])
115
+
116
+
117
@dataclass
class Wan2_2_TI2V_5B_Config(WanT2V480PConfig):
    """Wan 2.2 TI2V 5B configuration (text+image-to-video task)."""

    flow_shift: float | None = 5.0
    ti2v_task: bool = True
    expand_timesteps: bool = True

    def __post_init__(self) -> None:
        # Overrides (does not call) the T2V parent's __post_init__: TI2V needs
        # the VAE encoder as well, and mirrors expand_timesteps onto the DiT.
        self.vae_config.load_encoder = True
        self.vae_config.load_decoder = True
        self.dit_config.expand_timesteps = self.expand_timesteps
127
+
128
+
129
@dataclass
class FastWan2_2_TI2V_5B_Config(Wan2_2_TI2V_5B_Config):
    """FastWan (DMD-distilled) variant of the Wan 2.2 TI2V 5B configuration."""

    flow_shift: float | None = 5.0
    dmd_denoising_steps: list[int] | None = field(default_factory=lambda: [1000, 757, 522])
133
+
134
+
135
@dataclass
class Wan2_2_T2V_A14B_Config(WanT2V480PConfig):
    """Wan 2.2 T2V A14B configuration (dual-expert model with boundary split)."""

    flow_shift: float | None = 12.0
    boundary_ratio: float | None = 0.875

    # self-forcing params
    dmd_denoising_steps: list[int] | None = field(default_factory=lambda: [1000, 750, 500, 250])
    warp_denoising_step: bool = True

    def __post_init__(self) -> None:
        # NOTE(review): unlike Wan2_2_I2V_A14B_Config, this override does not
        # call super().__post_init__(), so the parent's VAE
        # load_encoder/load_decoder assignments never run here — confirm
        # whether that is intentional.
        self.dit_config.boundary_ratio = self.boundary_ratio
146
+
147
+
148
@dataclass
class Wan2_2_I2V_A14B_Config(WanI2V480PConfig):
    """Wan 2.2 I2V A14B configuration (dual-expert model with boundary split)."""

    flow_shift: float | None = 5.0
    boundary_ratio: float | None = 0.900

    def __post_init__(self) -> None:
        # Keep the I2V parent's VAE setup, then propagate the expert boundary.
        super().__post_init__()
        self.dit_config.boundary_ratio = self.boundary_ratio
156
+
157
+
158
+ # =============================================
159
+ # ============= Causal Self-Forcing =============
160
+ # =============================================
161
@dataclass
class SelfForcingWanT2V480PConfig(WanT2V480PConfig):
    """Causal self-forcing variant of the Wan T2V 480P configuration."""

    is_causal: bool = True
    flow_shift: float | None = 5.0
    dmd_denoising_steps: list[int] | None = field(default_factory=lambda: [1000, 750, 500, 250])
    warp_denoising_step: bool = True
167
+
168
+
169
@dataclass
class SelfForcingWan2_2_T2V480PConfig(Wan2_2_T2V_A14B_Config):
    """Causal self-forcing variant of the Wan 2.2 T2V A14B configuration."""

    is_causal: bool = True
    flow_shift: float | None = 12.0
    boundary_ratio: float | None = 0.875
    dmd_denoising_steps: list[int] | None = field(default_factory=lambda: [1000, 850, 700, 550, 350, 275, 200, 125])
    warp_denoising_step: bool = True

    def __post_init__(self) -> None:
        # NOTE(review): this override replaces the parent's __post_init__, so
        # self.dit_config.boundary_ratio is never set here (the parent does
        # that) — confirm whether super().__post_init__() should be called.
        self.vae_config.load_encoder = True
        self.vae_config.load_decoder = True
180
+
181
+
182
+ # =============================================
183
+ # ============= Matrix Game ===================
184
+ # =============================================
185
@dataclass
class MatrixGameBaseI2V480PConfig(WanI2V480PConfig):
    """Matrix-Game base I2V 480P configuration (MatrixGame DiT, no causality)."""

    dit_config: DiTConfig = field(default_factory=MatrixGameWanVideoConfig)
    flow_shift: float | None = 5.0
189
+
190
+
191
+ @dataclass
192
+ class MatrixGameI2V480PConfig(WanI2V480PConfig):
193
+ dit_config: DiTConfig = field(default_factory=MatrixGameWanVideoConfig)
194
+
195
+ image_encoder_config: EncoderConfig = field(default_factory=WAN2_1ControlCLIPVisionConfig)
196
+
197
+ is_causal: bool = True
198
+ flow_shift: float | None = 5.0
199
+ dmd_denoising_steps: list[int] | None = field(default_factory=lambda: [1000, 666, 333])
200
+ warp_denoising_step: bool = True
201
+ context_noise: int = 0
202
+ num_frames_per_block: int = 3
203
+ # sliding_window_num_frames: int = 15
standalone_inference/overlay_files/fastvideo/configs/sample/base.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ from dataclasses import dataclass
3
+ from typing import Any
4
+
5
+ from fastvideo.logger import init_logger
6
+ from fastvideo.utils import StoreBoolean
7
+
8
+ logger = init_logger(__name__)
9
+
10
+
11
+ @dataclass
12
+ class SamplingParam:
13
+ """
14
+ Sampling parameters for video generation.
15
+ """
16
+ # All fields below are copied from ForwardBatch
17
+ data_type: str = "video"
18
+
19
+ # Image inputs
20
+ image_path: str | None = None
21
+ pil_image: Any | None = None
22
+
23
+ # Video inputs
24
+ video_path: str | None = None
25
+
26
+ # Action control inputs (Matrix-Game)
27
+ mouse_cond: Any | None = None # Shape: (B, T, 2)
28
+ keyboard_cond: Any | None = None # Shape: (B, T, K)
29
+ grid_sizes: Any | None = None # Shape: (3,) [F,H,W]
30
+
31
+ # Camera control inputs (HYWorld)
32
+ pose: str | None = None # Camera trajectory: pose string (e.g., 'w-31') or JSON file path
33
+
34
+ # Camera control inputs (LingBotWorld)
35
+ c2ws_plucker_emb: Any | None = None # Plucker embedding: [B, C, F_lat, H_lat, W_lat]
36
+
37
+ # Refine inputs (LongCat 480p->720p upscaling)
38
+ # Path-based refine (load stage1 video from disk, e.g. MP4)
39
+ refine_from: str | None = None # Path to stage1 video (480p output from distill)
40
+ t_thresh: float = 0.5 # Threshold for timestep scheduling in refinement
41
+ spatial_refine_only: bool = False # If True, only spatial (no temporal doubling)
42
+ num_cond_frames: int = 0 # Number of conditioning frames
43
+ # In-memory refine input (for two-stage pipeline where stage1 frames are already in memory)
44
+ # This mirrors LongCat's demo where a list of frames (e.g. np.ndarray or PIL.Image)
45
+ # is passed directly to the refinement pipeline instead of reloading from disk.
46
+ stage1_video: Any | None = None
47
+
48
+ # Text inputs
49
+ prompt: str | list[str] | None = None
50
+ negative_prompt: str | None = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
51
+ prompt_path: str | None = None
52
+ output_path: str = "outputs/"
53
+ output_video_name: str | None = None
54
+
55
+ # Batch info
56
+ num_videos_per_prompt: int = 1
57
+ seed: int = 1024
58
+
59
+ # Original dimensions (before VAE scaling)
60
+ num_frames: int = 125
61
+ height: int = 720
62
+ width: int = 1280
63
+ height_sr: int = 1072
64
+ width_sr: int = 1920
65
+ fps: int = 24
66
+
67
+ # Denoising parameters
68
+ num_inference_steps: int = 50
69
+ num_inference_steps_sr: int = 50
70
+ guidance_scale: float = 1.0
71
+ guidance_scale_2: float | None = None
72
+ guidance_rescale: float = 0.0
73
+ boundary_ratio: float | None = None
74
+ sigmas: list[float] | None = None
75
+
76
+ # TeaCache parameters
77
+ enable_teacache: bool = False
78
+
79
+ # GEN3C camera control
80
+ trajectory_type: str | None = None
81
+ movement_distance: float | None = None
82
+ camera_rotation: str | None = None
83
+
84
+ # Misc
85
+ save_video: bool = True
86
+ return_frames: bool = True
87
+ return_trajectory_latents: bool = False # returns all latents for each timestep
88
+ return_trajectory_decoded: bool = False # returns decoded latents for each timestep
89
+
90
+ def __post_init__(self) -> None:
91
+ self.data_type = "video" if self.num_frames > 1 else "image"
92
+
93
+ def __getattr__(self, name: str) -> Any:
94
+ raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
95
+
96
+ def check_sampling_param(self) -> None:
97
+ if self.prompt_path and not self.prompt_path.endswith(".txt"):
98
+ raise ValueError("prompt_path must be a txt file")
99
+
100
+ def update(self, source_dict: dict[str, Any]) -> None:
101
+ for key, value in source_dict.items():
102
+ if hasattr(self, key):
103
+ setattr(self, key, value)
104
+ else:
105
+ logger.exception("%s has no attribute %s", type(self).__name__, key)
106
+
107
+ self.__post_init__()
108
+
109
+ @classmethod
110
+ def from_pretrained(cls, model_path: str) -> "SamplingParam":
111
+ from fastvideo.registry import get_sampling_param_cls_for_name
112
+ sampling_cls = get_sampling_param_cls_for_name(model_path)
113
+ if sampling_cls is not None:
114
+ sampling_param: SamplingParam = sampling_cls()
115
+ else:
116
+ logger.warning("Couldn't find an optimal sampling param for %s. Using the default sampling param.",
117
+ model_path)
118
+ sampling_param = cls()
119
+
120
+ return sampling_param
121
+
122
+ @staticmethod
123
+ def add_cli_args(parser: Any) -> Any:
124
+ """Add CLI arguments for SamplingParam fields"""
125
+ parser.add_argument(
126
+ "--prompt",
127
+ type=str,
128
+ default=SamplingParam.prompt,
129
+ help="Text prompt for video generation",
130
+ )
131
+ parser.add_argument(
132
+ "--negative-prompt",
133
+ type=str,
134
+ default=SamplingParam.negative_prompt,
135
+ help="Negative text prompt for video generation",
136
+ )
137
+ parser.add_argument(
138
+ "--prompt-path",
139
+ type=str,
140
+ default=SamplingParam.prompt_path,
141
+ help="Path to a text file containing the prompt",
142
+ )
143
+ parser.add_argument(
144
+ "--output-path",
145
+ type=str,
146
+ default=SamplingParam.output_path,
147
+ help="Path to save the generated video",
148
+ )
149
+ parser.add_argument(
150
+ "--output-video-name",
151
+ type=str,
152
+ default=SamplingParam.output_video_name,
153
+ help="Name of the output video",
154
+ )
155
+ parser.add_argument(
156
+ "--num-videos-per-prompt",
157
+ type=int,
158
+ default=SamplingParam.num_videos_per_prompt,
159
+ help="Number of videos to generate per prompt",
160
+ )
161
+ parser.add_argument(
162
+ "--seed",
163
+ type=int,
164
+ default=SamplingParam.seed,
165
+ help="Random seed for generation",
166
+ )
167
+ parser.add_argument(
168
+ "--num-frames",
169
+ type=int,
170
+ default=SamplingParam.num_frames,
171
+ help="Number of frames to generate",
172
+ )
173
+ parser.add_argument(
174
+ "--height",
175
+ type=int,
176
+ default=SamplingParam.height,
177
+ help="Height of generated video",
178
+ )
179
+ parser.add_argument(
180
+ "--width",
181
+ type=int,
182
+ default=SamplingParam.width,
183
+ help="Width of generated video",
184
+ )
185
+ parser.add_argument(
186
+ "--fps",
187
+ type=int,
188
+ default=SamplingParam.fps,
189
+ help="Frames per second for saved video",
190
+ )
191
+ parser.add_argument(
192
+ "--num-inference-steps",
193
+ type=int,
194
+ default=SamplingParam.num_inference_steps,
195
+ help="Number of denoising steps",
196
+ )
197
+ parser.add_argument(
198
+ "--guidance-scale",
199
+ type=float,
200
+ default=SamplingParam.guidance_scale,
201
+ help="Classifier-free guidance scale",
202
+ )
203
+ parser.add_argument(
204
+ "--guidance-rescale",
205
+ type=float,
206
+ default=SamplingParam.guidance_rescale,
207
+ help="Guidance rescale factor",
208
+ )
209
+ parser.add_argument(
210
+ "--boundary-ratio",
211
+ type=float,
212
+ default=SamplingParam.boundary_ratio,
213
+ help="Boundary timestep ratio",
214
+ )
215
+ parser.add_argument(
216
+ "--save-video",
217
+ action="store_true",
218
+ default=SamplingParam.save_video,
219
+ help="Whether to save the video to disk",
220
+ )
221
+ parser.add_argument(
222
+ "--no-save-video",
223
+ action="store_false",
224
+ dest="save_video",
225
+ help="Don't save the video to disk",
226
+ )
227
+ parser.add_argument(
228
+ "--return-frames",
229
+ action="store_true",
230
+ default=False,
231
+ help="Whether to return the raw frames",
232
+ )
233
+ parser.add_argument(
234
+ "--image-path",
235
+ type=str,
236
+ default=SamplingParam.image_path,
237
+ help="Path to input image for image-to-video generation",
238
+ )
239
+ parser.add_argument(
240
+ "--video-path",
241
+ type=str,
242
+ default=SamplingParam.video_path,
243
+ help="Path to input video for video-to-video generation",
244
+ )
245
+ parser.add_argument(
246
+ "--refine-from",
247
+ type=str,
248
+ default=SamplingParam.refine_from,
249
+ help="Path to stage1 video for refinement (LongCat 480p->720p)",
250
+ )
251
+ parser.add_argument(
252
+ "--t-thresh",
253
+ type=float,
254
+ default=SamplingParam.t_thresh,
255
+ help="Threshold for timestep scheduling in refinement (default: 0.5)",
256
+ )
257
+ parser.add_argument(
258
+ "--spatial-refine-only",
259
+ action=StoreBoolean,
260
+ default=SamplingParam.spatial_refine_only,
261
+ help="Only perform spatial super-resolution (no temporal doubling)",
262
+ )
263
+ parser.add_argument(
264
+ "--num-cond-frames",
265
+ type=int,
266
+ default=SamplingParam.num_cond_frames,
267
+ help="Number of conditioning frames for refinement",
268
+ )
269
+ parser.add_argument(
270
+ "--moba-config-path",
271
+ type=str,
272
+ default=None,
273
+ help="Path to a JSON file containing V-MoBA specific configurations.",
274
+ )
275
+ parser.add_argument(
276
+ "--return-trajectory-latents",
277
+ action="store_true",
278
+ default=SamplingParam.return_trajectory_latents,
279
+ help="Whether to return the trajectory",
280
+ )
281
+ parser.add_argument(
282
+ "--return-trajectory-decoded",
283
+ action="store_true",
284
+ default=SamplingParam.return_trajectory_decoded,
285
+ help="Whether to return the decoded trajectory",
286
+ )
287
+ return parser
288
+
289
+
290
@dataclass
class CacheParams:
    """Caching configuration used during sampling."""
    # Cache strategy identifier; "none" presumably disables caching — confirm
    # against the pipeline stages that consume this.
    cache_type: str = "none"
standalone_inference/overlay_files/fastvideo/configs/sample/wan.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ from dataclasses import dataclass
3
+
4
+ from fastvideo.configs.sample.base import SamplingParam
5
+
6
+
7
@dataclass
class WanT2V_1_3B_SamplingParam(SamplingParam):
    """Default sampling parameters for Wan2.1 T2V 1.3B (480P)."""
    # Video parameters
    height: int = 480
    width: int = 832
    num_frames: int = 81
    fps: int = 16

    # Denoising stage
    guidance_scale: float = 3.0
    negative_prompt: str = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
    num_inference_steps: int = 50
19
+
20
+
21
@dataclass
class WanT2V_14B_SamplingParam(SamplingParam):
    """Default sampling parameters for Wan2.1 T2V 14B (720P)."""
    # Video parameters
    height: int = 720
    width: int = 1280
    num_frames: int = 81
    fps: int = 16

    # Denoising stage
    guidance_scale: float = 5.0
    negative_prompt: str = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
    num_inference_steps: int = 50
33
+
34
+
35
@dataclass
class WanI2V_14B_480P_SamplingParam(WanT2V_1_3B_SamplingParam):
    """Wan2.1 I2V 14B 480P: inherits 480P video defaults, stronger guidance."""
    # Denoising stage
    guidance_scale: float = 5.0
    num_inference_steps: int = 40
40
+
41
+
42
@dataclass
class WanI2V_14B_720P_SamplingParam(WanT2V_14B_SamplingParam):
    """Wan2.1 I2V 14B 720P: inherits 720P video defaults, fewer steps."""
    # Denoising stage
    guidance_scale: float = 5.0
    num_inference_steps: int = 40
47
+
48
+
49
@dataclass
class FastWanT2V480P_SamplingParam(WanT2V_1_3B_SamplingParam):
    """FastWan distilled T2V 480P: few-step (3) generation at 448x832."""
    # DMD parameters
    # dmd_denoising_steps: list[int] | None = field(default_factory=lambda: [1000, 757, 522])
    num_inference_steps: int = 3
    num_frames: int = 61
    height: int = 448
    width: int = 832
    fps: int = 16
58
+
59
+
60
+ # =============================================
61
+ # ============= Wan2.1 Fun Models =============
62
+ # =============================================
63
@dataclass
class Wan2_1_Fun_1_3B_InP_SamplingParam(SamplingParam):
    """Sampling parameters for Wan2.1 Fun 1.3B InP model."""
    height: int = 480
    width: int = 832
    num_frames: int = 81
    fps: int = 16
    # Chinese negative prompt shipped with the Fun model family.
    negative_prompt: str | None = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
    guidance_scale: float = 6.0
    num_inference_steps: int = 50
73
+
74
+
75
@dataclass
class Wan2_1_Fun_1_3B_Control_SamplingParam(SamplingParam):
    """Sampling parameters for the Wan2.1 Fun 1.3B Control model."""
    fps: int = 16
    num_frames: int = 49
    # NOTE(review): portrait orientation (832 tall x 480 wide) — sibling
    # configs use 480x832 landscape; confirm this swap is intentional.
    height: int = 832
    width: int = 480
    guidance_scale: float = 6.0
82
+
83
+
84
+ # =============================================
85
+ # ============= Wan2.2 TI2V Models =============
86
+ # =============================================
87
@dataclass
class Wan2_2_Base_SamplingParam(SamplingParam):
    """Shared sampling defaults for the Wan2.2 model family (common negative prompt)."""
    negative_prompt: str | None = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
91
+
92
+
93
@dataclass
class Wan2_2_TI2V_5B_SamplingParam(Wan2_2_Base_SamplingParam):
    """Sampling parameters for Wan2.2 TI2V 5B model."""
    height: int = 704
    width: int = 1280
    num_frames: int = 121
    fps: int = 24
    guidance_scale: float = 5.0
    num_inference_steps: int = 50
102
+
103
+
104
@dataclass
class Wan2_2_T2V_A14B_SamplingParam(Wan2_2_Base_SamplingParam):
    """Sampling parameters for Wan2.2 T2V A14B (dual-expert) model."""
    guidance_scale: float = 4.0  # high_noise
    guidance_scale_2: float = 3.0  # low_noise
    num_inference_steps: int = 40
    fps: int = 16
    # NOTE(will): default boundary timestep is tracked by PipelineConfig, but
    # can be overridden during sampling
112
+
113
+
114
@dataclass
class Wan2_2_I2V_A14B_SamplingParam(Wan2_2_Base_SamplingParam):
    """Sampling parameters for Wan2.2 I2V A14B (dual-expert) model."""
    guidance_scale: float = 3.5  # high_noise
    guidance_scale_2: float = 3.5  # low_noise
    num_inference_steps: int = 40
    fps: int = 16
    # NOTE(will): default boundary timestep is tracked by PipelineConfig, but
    # can be overridden during sampling
122
+
123
+
124
@dataclass
class Wan2_2_Fun_A14B_Control_SamplingParam(Wan2_1_Fun_1_3B_Control_SamplingParam):
    """Wan2.2 Fun A14B Control: same defaults as 2.1 Control but 81 frames."""
    num_frames: int = 81
127
+
128
+
129
+ # =============================================
130
+ # ============= Causal Self-Forcing =============
131
+ # =============================================
132
@dataclass
class SelfForcingWan2_1_T2V_1_3B_480P_SamplingParam(Wan2_1_Fun_1_3B_InP_SamplingParam):
    """Self-Forcing Wan2.1 T2V 1.3B 480P sampling defaults.

    NOTE(review): inherits from the Fun InP T2V-style defaults despite being
    a plain T2V model — presumably only for the shared 480P video defaults;
    confirm.
    """
    pass
135
+
136
+
137
@dataclass
class SelfForcingWan2_2_T2V_A14B_480P_SamplingParam(Wan2_2_T2V_A14B_SamplingParam):
    """Self-Forcing Wan2.2 T2V A14B 480P: few-step (8) generation at 448x832."""
    num_inference_steps: int = 8
    num_frames: int = 81
    height: int = 448
    width: int = 832
    fps: int = 16
144
+
145
+
146
@dataclass
class MatrixGame2_SamplingParam(SamplingParam):
    """Matrix-Game 2 sampling defaults: few-step, guidance-free generation."""
    height: int = 352
    width: int = 640
    num_frames: int = 57
    fps: int = 25
    # guidance_scale 1.0 means classifier-free guidance is effectively off.
    guidance_scale: float = 1.0
    num_inference_steps: int = 3
    negative_prompt: str | None = None
standalone_inference/overlay_files/fastvideo/configs/wan_1.3B_t2v_pipeline.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embedded_cfg_scale": 6.0,
3
+ "flow_shift": 3,
4
+ "dit_cpu_offload": true,
5
+ "disable_autocast": false,
6
+ "precision": "bf16",
7
+ "vae_precision": "fp32",
8
+ "vae_tiling": false,
9
+ "vae_sp": false,
10
+ "vae_config": {
11
+ "load_encoder": false,
12
+ "load_decoder": true,
13
+ "tile_sample_min_height": 256,
14
+ "tile_sample_min_width": 256,
15
+ "tile_sample_min_num_frames": 16,
16
+ "tile_sample_stride_height": 192,
17
+ "tile_sample_stride_width": 192,
18
+ "tile_sample_stride_num_frames": 12,
19
+ "blend_num_frames": 8,
20
+ "use_tiling": false,
21
+ "use_temporal_tiling": false,
22
+ "use_parallel_tiling": false,
23
+ "use_feature_cache": true
24
+ },
25
+ "dit_config": {
26
+ "prefix": "Wan",
27
+ "quant_config": null
28
+ },
29
+ "text_encoder_precisions": [
30
+ "fp32"
31
+ ],
32
+ "text_encoder_configs": [
33
+ {
34
+ "prefix": "t5",
35
+ "quant_config": null,
36
+ "lora_config": null
37
+ }
38
+ ],
39
+ "enable_torch_compile": false
40
+ }
standalone_inference/overlay_files/fastvideo/entrypoints/cli/generate.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # adapted from vllm: https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/cli/serve.py
3
+
4
+ import argparse
5
+ import dataclasses
6
+ import os
7
+ from typing import cast
8
+
9
+ from fastvideo import VideoGenerator
10
+ from fastvideo.configs.sample.base import SamplingParam
11
+ from fastvideo.entrypoints.cli.cli_types import CLISubcommand
12
+ from fastvideo.entrypoints.cli.utils import RaiseNotImplementedAction
13
+ from fastvideo.fastvideo_args import FastVideoArgs
14
+ from fastvideo.logger import init_logger
15
+ from fastvideo.utils import FlexibleArgumentParser
16
+
17
+ logger = init_logger(__name__)
18
+
19
+
20
class GenerateSubcommand(CLISubcommand):
    """The `generate` subcommand for the FastVideo CLI"""

    def __init__(self) -> None:
        self.name = "generate"
        super().__init__()
        # Partition of CLI flags: which go to VideoGenerator init vs
        # generate_video kwargs.
        self.init_arg_names = self._get_init_arg_names()
        self.generation_arg_names = self._get_generation_arg_names()

    def _get_init_arg_names(self) -> list[str]:
        """Get names of arguments for VideoGenerator initialization"""
        return ["num_gpus", "tp_size", "sp_size", "model_path"]

    def _get_generation_arg_names(self) -> list[str]:
        """Get names of arguments for generate_video method"""
        # Every SamplingParam field is a valid generate_video kwarg.
        return [field.name for field in dataclasses.fields(SamplingParam)]

    def cmd(self, args: argparse.Namespace) -> None:
        """Run generation from parsed CLI args."""
        excluded_args = ['subparser', 'config', 'dispatch_function']

        # Only forward args the user explicitly set; args._provided is
        # presumably populated by the argument parser — confirm against
        # FlexibleArgumentParser.
        provided_args = {}
        for k, v in vars(args).items():
            if (k not in excluded_args and v is not None and hasattr(args, '_provided') and k in args._provided):
                provided_args[k] = v

        # model_path and prompt are forwarded even when not in _provided
        # (they may come from a config file).
        if 'model_path' in vars(args) and args.model_path is not None:
            provided_args['model_path'] = args.model_path

        if 'prompt' in vars(args) and args.prompt is not None:
            provided_args['prompt'] = args.prompt

        merged_args = {**provided_args}

        logger.info('CLI Args: %s', merged_args)

        if 'model_path' not in merged_args or not merged_args['model_path']:
            raise ValueError("model_path must be provided either in config file or via --model-path")

        # Check if either prompt or prompt_txt is provided
        has_prompt = 'prompt' in merged_args and merged_args['prompt']
        has_prompt_txt = 'prompt_txt' in merged_args and merged_args['prompt_txt']

        if not (has_prompt or has_prompt_txt):
            raise ValueError("Either prompt or prompt_txt must be provided")

        if has_prompt and has_prompt_txt:
            raise ValueError("Cannot provide both 'prompt' and 'prompt_txt'. Use only one of them.")

        # Split flags between generator construction and the generate call.
        init_args = {k: v for k, v in merged_args.items() if k not in self.generation_arg_names}
        generation_args = {k: v for k, v in merged_args.items() if k in self.generation_arg_names}
        # CLI runs default to not keeping raw frames in memory.
        generation_args.setdefault("return_frames", False)

        model_path = init_args.pop('model_path')
        prompt = generation_args.pop('prompt', None)

        generator = VideoGenerator.from_pretrained(model_path=model_path, **init_args)

        # Call generate_video - it handles both single and batch modes
        generator.generate_video(prompt=prompt, **generation_args)

    def validate(self, args: argparse.Namespace) -> None:
        """Validate the arguments for this command"""
        if args.num_gpus is not None and args.num_gpus <= 0:
            raise ValueError("Number of gpus must be positive")

        if args.config and not os.path.exists(args.config):
            raise ValueError(f"Config file not found: {args.config}")

    def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        """Build the `generate` argparse subparser (FastVideoArgs + SamplingParam flags)."""
        generate_parser = subparsers.add_parser(
            "generate",
            help="Run inference on a model",
            usage="fastvideo generate (--model-path MODEL_PATH_OR_ID --prompt PROMPT) | --config CONFIG_FILE [OPTIONS]")

        generate_parser.add_argument(
            "--config",
            type=str,
            default='',
            required=False,
            help="Read CLI options from a config JSON or YAML file. If provided, --model-path and --prompt are optional."
        )

        generate_parser = FastVideoArgs.add_cli_args(generate_parser)
        generate_parser = SamplingParam.add_cli_args(generate_parser)

        generate_parser.add_argument(
            "--text-encoder-configs",
            action=RaiseNotImplementedAction,
            help="JSON array of text encoder configurations (NOT YET IMPLEMENTED)",
        )

        return cast(FlexibleArgumentParser, generate_parser)
112
+
113
+
114
+ def cmd_init() -> list[CLISubcommand]:
115
+ return [GenerateSubcommand()]
standalone_inference/overlay_files/fastvideo/entrypoints/video_generator.py ADDED
@@ -0,0 +1,797 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """
3
+ VideoGenerator module for FastVideo.
4
+
5
+ This module provides a consolidated interface for generating videos using
6
+ diffusion models.
7
+ """
8
+
9
+ import os
10
+ import re
11
+ import shutil
12
+ import threading
13
+ import time
14
+ import tempfile
15
+ import warnings
16
+ from collections.abc import Mapping
17
+ from copy import deepcopy
18
+ from typing import Any
19
+
20
+ import imageio
21
+ import numpy as np
22
+ import torch
23
+ import torchvision
24
+ from einops import rearrange
25
+
26
+ from fastvideo.api.compat import (
27
+ expand_request_prompt_batch,
28
+ generator_config_to_fastvideo_args,
29
+ legacy_from_pretrained_to_config,
30
+ load_generator_config_from_file,
31
+ normalize_generation_request,
32
+ normalize_generator_config,
33
+ request_to_pipeline_overrides,
34
+ request_to_sampling_param,
35
+ )
36
+ from fastvideo.api.results import GenerationResult
37
+ from fastvideo.api.schema import GenerationRequest, GeneratorConfig
38
+ from fastvideo.configs.sample import SamplingParam
39
+ from fastvideo.fastvideo_args import FastVideoArgs
40
+ from fastvideo.logger import init_logger
41
+ from fastvideo.pipelines import ForwardBatch
42
+ from fastvideo.utils import align_to, shallow_asdict
43
+ from fastvideo.worker.executor import Executor
44
+
45
+ logger = init_logger(__name__)
46
+
47
# Keyword arguments still accepted by VideoGenerator.from_pretrained without a
# DeprecationWarning (engine/parallelism/offload convenience toggles). Any
# other kwarg triggers the legacy-kwargs warning in from_pretrained.
_FROM_PRETRAINED_CONVENIENCE_KWARGS = frozenset({
    "num_gpus",
    "revision",
    "trust_remote_code",
    "distributed_executor_backend",
    "tp_size",
    "sp_size",
    "hsdp_replicate_dim",
    "hsdp_shard_dim",
    "dist_timeout",
    "use_fsdp_inference",
    "disable_autocast",
    "enable_stage_verification",
    "dit_cpu_offload",
    "dit_layerwise_offload",
    "text_encoder_cpu_offload",
    "image_encoder_cpu_offload",
    "vae_cpu_offload",
    "pin_cpu_memory",
    "enable_torch_compile",
    "torch_compile_kwargs",
    "transformer_quant",
})
70
+
71
+
72
def _infer_latent_batch_size(batch: ForwardBatch) -> int:
    """Return the effective latent batch size for *batch*.

    The per-prompt count comes from ``batch.prompt`` (list length, or 1 for a
    single prompt) or, failing that, from the leading dimension of the first
    prompt embedding; it is then multiplied by ``num_videos_per_prompt``.

    Raises:
        ValueError: if neither a prompt nor prompt embeddings are present.
    """
    prompt = batch.prompt
    if isinstance(prompt, list):
        num_prompts = len(prompt)
    elif prompt is not None:
        num_prompts = 1
    else:
        embeds = batch.prompt_embeds
        if embeds is None or len(embeds) == 0:
            raise ValueError("Cannot infer batch size from batch; no prompt or prompt_embeds found")
        num_prompts = embeds[0].shape[0]
    return num_prompts * batch.num_videos_per_prompt
83
+
84
+
85
+ class VideoGenerator:
86
+ """
87
+ A unified class for generating videos using diffusion models.
88
+
89
+ This class provides a simple interface for video generation with rich
90
+ customization options, similar to popular frameworks like HF Diffusers.
91
+ """
92
+
93
    def __init__(
        self,
        fastvideo_args: FastVideoArgs,
        executor_class: type[Executor],
        log_stats: bool,
        *,
        log_queue=None,
    ):
        """
        Initialize the video generator.

        Args:
            fastvideo_args: The inference arguments
            executor_class: The executor class to use for inference
            log_stats: Whether to log statistics
            log_queue: Optional multiprocessing.Queue to forward worker logs to
        """
        # Typed config is attached later by from_config(); stays None when the
        # generator is constructed directly.
        self.config: GeneratorConfig | None = None
        self.fastvideo_args = fastvideo_args
        self.executor = executor_class(fastvideo_args, log_queue=log_queue)
113
+
114
    @classmethod
    def from_pretrained(
        cls,
        model_path: str | GeneratorConfig | Mapping[str, Any] | None = None,
        **kwargs,
    ) -> "VideoGenerator":
        """
        Create a video generator from a pretrained model.

        Args:
            model_path: Path or identifier for the pretrained model, or a
                typed/mapping config (routed to from_config).
            **kwargs: Additional arguments to customize model loading; only
                the stable convenience kwargs are supported without a
                deprecation warning. A `config` kwarg routes to from_config
                and is mutually exclusive with model_path.

        Returns:
            The created video generator

        Stable convenience kwargs remain supported here for common engine and
        offload settings. Advanced model- or pipeline-specific options should
        move to VideoGenerator.from_config(...).
        """
        log_queue = kwargs.pop("log_queue", None)
        typed_config = kwargs.pop("config", None)
        # Route 1: explicit typed config kwarg (exclusive with model_path).
        if typed_config is not None:
            if model_path is not None:
                raise TypeError("Pass either model_path or config to from_pretrained, not both")
            if kwargs:
                unexpected = ", ".join(sorted(kwargs))
                raise TypeError(f"Unexpected keyword arguments with config: {unexpected}")
            return cls.from_config(typed_config, log_queue=log_queue)

        # Route 2: model_path is itself a typed config / mapping.
        if isinstance(model_path, GeneratorConfig | Mapping):
            if kwargs:
                unexpected = ", ".join(sorted(kwargs))
                raise TypeError(f"Unexpected keyword arguments with typed config: {unexpected}")
            return cls.from_config(model_path, log_queue=log_queue)

        if model_path is None:
            raise TypeError("model_path or config is required")

        # Route 3: legacy string path + kwargs; warn on kwargs outside the
        # stable convenience set.
        legacy_only_kwargs = sorted(set(kwargs) - _FROM_PRETRAINED_CONVENIENCE_KWARGS)
        if legacy_only_kwargs:
            warnings.warn(
                "VideoGenerator.from_pretrained(...) received legacy-only kwargs "
                f"({', '.join(legacy_only_kwargs)}); prefer VideoGenerator.from_config(...) "
                "for advanced configuration.",
                DeprecationWarning,
                stacklevel=2,
            )
        return cls.from_config(
            legacy_from_pretrained_to_config(model_path, kwargs),
            log_queue=log_queue,
        )
169
+
170
    @classmethod
    def from_config(
        cls,
        config: GeneratorConfig | Mapping[str, Any],
        *,
        log_queue=None,
    ) -> "VideoGenerator":
        """
        Create a video generator from a typed GeneratorConfig (or a mapping
        that can be normalized into one).

        Args:
            config: GeneratorConfig instance or plain mapping.
            log_queue: Optional multiprocessing.Queue to forward worker logs to.

        Returns:
            The created video generator, with `.config` set to the normalized
            config.
        """
        normalized = normalize_generator_config(config)
        fastvideo_args = generator_config_to_fastvideo_args(normalized)
        generator = cls.from_fastvideo_args(fastvideo_args, log_queue=log_queue)
        generator.config = normalized
        return generator
182
+
183
    @classmethod
    def from_file(
        cls,
        path: str,
        overrides: list[str] | Mapping[str, Any] | None = None,
        *,
        log_queue=None,
    ) -> "VideoGenerator":
        """
        Create a video generator from a generator-config file on disk.

        Args:
            path: Path to the config file.
            overrides: Optional override strings or mapping applied on top of
                the file contents.
            log_queue: Optional multiprocessing.Queue to forward worker logs to.
        """
        return cls.from_config(
            load_generator_config_from_file(path, overrides=overrides),
            log_queue=log_queue,
        )
195
+
196
    @classmethod
    def from_fastvideo_args(
        cls,
        fastvideo_args: FastVideoArgs,
        *,
        log_queue=None,
    ) -> "VideoGenerator":
        """
        Create a video generator with the specified arguments.

        Args:
            fastvideo_args: The inference arguments
            log_queue: Optional multiprocessing.Queue to forward worker logs to

        Returns:
            The created video generator
        """
        # Initialize distributed environment if needed
        # initialize_distributed_and_parallelism(fastvideo_args)

        # Executor backend (e.g. multiprocessing) is selected from the args.
        executor_class = Executor.get_class(fastvideo_args)
        return cls(
            fastvideo_args=fastvideo_args,
            executor_class=executor_class,
            log_stats=False,  # TODO: implement
            log_queue=log_queue,
        )
223
+
224
    def generate(
        self,
        request: GenerationRequest | Mapping[str, Any],
        *,
        log_queue=None,
    ) -> GenerationResult | list[GenerationResult]:
        """
        Generate video or image outputs from a typed inference request.

        Args:
            request: A `GenerationRequest` instance or a mapping that can be
                parsed into one. This is the primary public inference
                entrypoint for the typed API.
            log_queue: Optional multiprocessing.Queue to forward worker logs to
                during this request.

        Returns:
            A `GenerationResult` for single-request generation, or a list of
            `GenerationResult` objects when the request expands into multiple
            prompts.
        """
        normalized_request = normalize_generation_request(request)
        if log_queue:
            self.executor.set_log_queue(log_queue)

        try:
            return self._generate_request_impl(normalized_request)
        finally:
            # Always detach the per-request log queue, even on failure.
            if log_queue:
                self.executor.clear_log_queue()
254
+
255
+ def generate_video(
256
+ self,
257
+ prompt: str | None = None,
258
+ sampling_param: SamplingParam | None = None,
259
+ # Action control inputs (Matrix-Game)
260
+ mouse_cond: torch.Tensor | None = None,
261
+ keyboard_cond: torch.Tensor | None = None,
262
+ grid_sizes: tuple[int, int, int] | list[int] | torch.Tensor
263
+ | None = None,
264
+ **kwargs,
265
+ ) -> dict[str, Any] | list[dict[str, Any]]:
266
+ """
267
+ Generate a video based on the given prompt.
268
+
269
+ Args:
270
+ prompt: The prompt to use for generation (optional if prompt_txt is provided)
271
+ negative_prompt: The negative prompt to use (overrides the one in fastvideo_args)
272
+ output_path: Path to save the video (overrides the one in fastvideo_args)
273
+ prompt_path: Path to prompt file
274
+ save_video: Whether to save the video to disk
275
+ return_frames: Whether to include raw frames in the result dict
276
+ num_inference_steps: Number of denoising steps (overrides fastvideo_args)
277
+ guidance_scale: Classifier-free guidance scale (overrides fastvideo_args)
278
+ num_frames: Number of frames to generate (overrides fastvideo_args)
279
+ height: Height of generated video (overrides fastvideo_args)
280
+ width: Width of generated video (overrides fastvideo_args)
281
+ fps: Frames per second for saved video (overrides fastvideo_args)
282
+ seed: Random seed for generation (overrides fastvideo_args)
283
+ callback: Callback function called after each step
284
+ callback_steps: Number of steps between each callback
285
+
286
+ Returns:
287
+ A metadata dictionary for single-prompt generation, or a list of
288
+ metadata dictionaries for prompt-file batch generation.
289
+ """
290
+ log_queue = kwargs.pop("log_queue", None)
291
+ warnings.warn(
292
+ "VideoGenerator.generate_video(...) is deprecated; use "
293
+ "VideoGenerator.generate(request=...) instead.",
294
+ DeprecationWarning,
295
+ stacklevel=2,
296
+ )
297
+ if log_queue:
298
+ self.executor.set_log_queue(log_queue)
299
+
300
+ try:
301
+ return self._generate_video_impl(
302
+ prompt=prompt,
303
+ sampling_param=sampling_param,
304
+ mouse_cond=mouse_cond,
305
+ keyboard_cond=keyboard_cond,
306
+ grid_sizes=grid_sizes,
307
+ **kwargs,
308
+ )
309
+ finally:
310
+ if log_queue:
311
+ self.executor.clear_log_queue()
312
+
313
+ def _generate_request_impl(
314
+ self,
315
+ request: GenerationRequest,
316
+ ) -> GenerationResult | list[GenerationResult]:
317
+ if isinstance(request.prompt, list):
318
+ if request.inputs.prompt_path is not None:
319
+ raise ValueError("request.prompt list cannot be combined with request.inputs.prompt_path")
320
+ results: list[GenerationResult] = []
321
+ for index, single_request in enumerate(expand_request_prompt_batch(request)):
322
+ prompt = single_request.prompt
323
+ wrapped = self._generate_single_request(single_request)
324
+ if isinstance(wrapped, list):
325
+ results.extend(wrapped)
326
+ continue
327
+ wrapped.prompt_index = index
328
+ if wrapped.prompt is None and isinstance(prompt, str):
329
+ wrapped.prompt = prompt
330
+ results.append(wrapped)
331
+ return results
332
+
333
+ return self._generate_single_request(request)
334
+
335
+ def _generate_single_request(
336
+ self,
337
+ request: GenerationRequest,
338
+ ) -> GenerationResult | list[GenerationResult]:
339
+ fastvideo_args = self.fastvideo_args
340
+ pipeline_overrides = request_to_pipeline_overrides(request)
341
+ if pipeline_overrides:
342
+ fastvideo_args = deepcopy(self.fastvideo_args)
343
+ for key, value in pipeline_overrides.items():
344
+ if not hasattr(fastvideo_args.pipeline_config, key):
345
+ raise ValueError(f"Request field {key!r} is not supported by pipeline config overrides")
346
+ setattr(fastvideo_args.pipeline_config, key, deepcopy(value))
347
+
348
+ sampling_param = request_to_sampling_param(
349
+ request,
350
+ model_path=self.fastvideo_args.model_path,
351
+ )
352
+ result = self._generate_video_impl(
353
+ prompt=request.prompt,
354
+ sampling_param=sampling_param,
355
+ fastvideo_args=fastvideo_args,
356
+ )
357
+ return self._wrap_legacy_result(result)
358
+
359
    def _generate_video_impl(
        self,
        prompt: str | list[str] | None = None,
        sampling_param: SamplingParam | None = None,
        mouse_cond: torch.Tensor | None = None,
        keyboard_cond: torch.Tensor | None = None,
        grid_sizes: tuple[int, int, int] | list[int] | torch.Tensor
        | None = None,
        fastvideo_args: FastVideoArgs | None = None,
        **kwargs,
    ) -> dict[str, Any] | list[np.ndarray] | list[dict[str, Any]]:
        """Internal implementation of generate_video.

        Dispatches between single-prompt generation and prompt-file batch
        generation (when ``fastvideo_args.prompt_txt`` or
        ``sampling_param.prompt_path`` is set). Returns one metadata dict for
        a single prompt, or a list of metadata dicts for a batch.
        """
        if fastvideo_args is None:
            fastvideo_args = self.fastvideo_args

        # Handle batch processing from text file
        if sampling_param is None:
            sampling_param = SamplingParam.from_pretrained(fastvideo_args.model_path)

        # Add action control inputs to kwargs if provided
        if mouse_cond is not None:
            kwargs['mouse_cond'] = mouse_cond
        if keyboard_cond is not None:
            kwargs['keyboard_cond'] = keyboard_cond
        if grid_sizes is not None:
            kwargs['grid_sizes'] = grid_sizes

        # Fold all remaining keyword overrides into the sampling params.
        sampling_param.update(kwargs)

        if fastvideo_args.prompt_txt is not None or sampling_param.prompt_path is not None:
            # prompt_path on the sampling params takes precedence over the
            # global prompt_txt argument.
            prompt_txt_path = sampling_param.prompt_path or fastvideo_args.prompt_txt
            if not prompt_txt_path or not os.path.exists(prompt_txt_path):
                raise FileNotFoundError(f"Prompt text file not found: {prompt_txt_path}")

            # Read prompts from file (one per non-empty line)
            with open(prompt_txt_path, encoding='utf-8') as f:
                prompts = [line.strip() for line in f if line.strip()]

            if not prompts:
                raise ValueError(f"No prompts found in file: {prompt_txt_path}")

            logger.info("Found %d prompts in %s", len(prompts), prompt_txt_path)

            results = []
            for i, batch_prompt in enumerate(prompts):
                logger.info("Processing prompt %d/%d: %s...", i + 1, len(prompts), batch_prompt[:100])
                try:
                    # Generate video for this prompt using the same logic below
                    output_path = self._prepare_output_path(sampling_param.output_path, batch_prompt)
                    kwargs["output_path"] = output_path
                    result = self._generate_single_video(
                        prompt=batch_prompt,
                        sampling_param=sampling_param,
                        fastvideo_args=fastvideo_args,
                        **kwargs,
                    )

                    # Add prompt info to result
                    result["prompt_index"] = i
                    result["prompt"] = batch_prompt

                    results.append(result)
                    logger.info("Successfully generated video for prompt %d", i + 1)

                except Exception as e:
                    # Best-effort batch mode: a failed prompt is logged and
                    # skipped rather than aborting the remaining prompts.
                    logger.error("Failed to generate video for prompt %d: %s", i + 1, e)
                    continue

            logger.info("Completed batch processing. Generated %d videos successfully.", len(results))
            return results

        # Single prompt generation (original behavior)
        if prompt is None:
            raise ValueError("Either prompt or prompt_txt must be provided")
        if not isinstance(prompt, str):
            raise ValueError("Single-prompt generation expects a string prompt")
        output_path = self._prepare_output_path(sampling_param.output_path, prompt)
        kwargs["output_path"] = output_path
        return self._generate_single_video(
            prompt=prompt,
            sampling_param=sampling_param,
            fastvideo_args=fastvideo_args,
            **kwargs,
        )
443
+
444
+ def _is_image_workload(self) -> bool:
445
+ """Return True when the workload produces a single image (t2i, i2i …)."""
446
+ args = getattr(self, "fastvideo_args", None)
447
+ if args is None:
448
+ return False
449
+ return args.workload_type.value.endswith("2i")
450
+
451
+ def _prepare_output_path(
452
+ self,
453
+ output_path: str,
454
+ prompt: str,
455
+ ) -> str:
456
+ """Build a unique, sanitized output file path.
457
+
458
+ The file extension is chosen automatically based on the workload type:
459
+ ``.png`` for image workloads (``t2i``, ``i2i``, …) and ``.mp4`` for
460
+ video workloads.
461
+
462
+ - If ``output_path`` already carries the correct extension, treat it
463
+ as a file path.
464
+ - Otherwise, treat ``output_path`` as a directory and derive the
465
+ filename from the prompt.
466
+ - Invalid filename characters are removed; if the name changes, a
467
+ warning is logged.
468
+ - If the target path already exists, a numeric suffix is appended.
469
+ """
470
+ target_ext = ".png" if self._is_image_workload() else ".mp4"
471
+
472
+ def _sanitize_filename_component(name: str) -> str:
473
+ # Remove characters invalid on common filesystems, strip spaces/dots
474
+ sanitized = re.sub(r'[\\/:*?"<>|]', '', name)
475
+ sanitized = sanitized.strip().strip('.')
476
+ sanitized = re.sub(r'\s+', ' ', sanitized)
477
+ return sanitized or "output"
478
+
479
+ base_path, extension = os.path.splitext(output_path)
480
+ extension_lower = extension.lower()
481
+
482
+ if extension_lower == target_ext:
483
+ output_dir = os.path.dirname(output_path)
484
+ base_name = os.path.basename(base_path) # filename without extension
485
+ sanitized_base = _sanitize_filename_component(base_name)
486
+ if sanitized_base != base_name:
487
+ logger.warning(
488
+ "The output name '%s' contained invalid characters. "
489
+ "It has been renamed to '%s%s'",
490
+ os.path.basename(output_path),
491
+ sanitized_base,
492
+ target_ext,
493
+ )
494
+ out_name = f"{sanitized_base}{target_ext}"
495
+ else:
496
+ # Treat as directory; inform if an unexpected extension was
497
+ # provided.
498
+ if extension:
499
+ logger.info(
500
+ "Output path '%s' has extension '%s' which does not "
501
+ "match the target '%s'; treating it as a directory",
502
+ output_path,
503
+ extension,
504
+ target_ext,
505
+ )
506
+ output_dir = output_path
507
+ prompt_component = _sanitize_filename_component(prompt[:100])
508
+ out_name = f"{prompt_component}{target_ext}"
509
+
510
+ if output_dir:
511
+ os.makedirs(output_dir, exist_ok=True)
512
+
513
+ new_output_path = os.path.join(output_dir, out_name)
514
+ counter = 1
515
+ while os.path.exists(new_output_path):
516
+ name_part, ext_part = os.path.splitext(out_name)
517
+ new_name = f"{name_part}_{counter}{ext_part}"
518
+ new_output_path = os.path.join(output_dir, new_name)
519
+ counter += 1
520
+ return new_output_path
521
+
522
    def _generate_single_video(
        self,
        prompt: str,
        sampling_param: SamplingParam | None = None,
        fastvideo_args: FastVideoArgs | None = None,
        **kwargs,
    ) -> dict[str, Any]:
        """Internal method for single video generation.

        Validates the prompt/dimensions, runs the executor forward pass on a
        worker thread (overlapping the pinned-CPU output allocation), then
        converts, saves (video/image, optionally muxing audio), and returns
        the legacy result metadata dict. Expects ``kwargs["output_path"]`` to
        be set by the caller.
        """
        if fastvideo_args is None:
            fastvideo_args = self.fastvideo_args

        # Validate inputs
        if not isinstance(prompt, str):
            raise TypeError(f"`prompt` must be a string, but got {type(prompt)}")
        prompt = prompt.strip()
        # NOTE(review): assumes sampling_param is not None here; callers
        # always pass one via _generate_video_impl — confirm.
        sampling_param = deepcopy(sampling_param)
        output_path = kwargs["output_path"]
        sampling_param.prompt = prompt
        # Process negative prompt
        if sampling_param.negative_prompt is not None:
            sampling_param.negative_prompt = sampling_param.negative_prompt.strip()

        # Validate dimensions
        if (sampling_param.height <= 0 or sampling_param.width <= 0 or sampling_param.num_frames <= 0):
            raise ValueError(f"Height, width, and num_frames must be positive integers, got "
                             f"height={sampling_param.height}, width={sampling_param.width}, "
                             f"num_frames={sampling_param.num_frames}")

        # Calculate sizes (spatial dims aligned to multiples of 16)
        target_height = align_to(sampling_param.height, 16)
        target_width = align_to(sampling_param.width, 16)

        # Calculate latent sizes (4x temporal, 8x spatial compression —
        # presumably matching the VAE; verify against the pipeline config)
        latents_size = [(sampling_param.num_frames - 1) // 4 + 1, sampling_param.height // 8, sampling_param.width // 8]
        n_tokens = latents_size[0] * latents_size[1] * latents_size[2]

        # Log parameters
        debug_str = f"""
            height: {target_height}
            width: {target_width}
            video_length: {sampling_param.num_frames}
            prompt: {sampling_param.prompt}
            image_path: {sampling_param.image_path}
            neg_prompt: {sampling_param.negative_prompt}
            seed: {sampling_param.seed}
            infer_steps: {sampling_param.num_inference_steps}
            num_videos_per_prompt: {sampling_param.num_videos_per_prompt}
            guidance_scale: {sampling_param.guidance_scale}
            n_tokens: {n_tokens}
            flow_shift: {fastvideo_args.pipeline_config.flow_shift}
            embedded_guidance_scale: {fastvideo_args.pipeline_config.embedded_cfg_scale}
            save_video: {sampling_param.save_video}
            output_path: {output_path}
        """  # type: ignore[attr-defined]
        logger.info(debug_str)

        # Prepare batch
        batch = ForwardBatch(
            **shallow_asdict(sampling_param),
            eta=0.0,
            n_tokens=n_tokens,
            VSA_sparsity=fastvideo_args.VSA_sparsity,
        )

        # Run inference
        start_time = time.perf_counter()

        # Execute forward pass in a new thread for non-blocking tensor
        # allocation. Capture thread exceptions so we can surface the true
        # failure in the main thread instead of later hitting None outputs.
        result_container = {"output_batch": ForwardBatch(data_type=batch.data_type)}
        thread_error: dict[str, BaseException | None] = {"error": None}
        thread_error_traceback: dict[str, str] = {"traceback": ""}

        def execute_forward_thread():
            import traceback
            try:
                result_container["output_batch"] = self.executor.execute_forward(batch, fastvideo_args)
            except BaseException as error:  # noqa: BLE001
                thread_error["error"] = error
                thread_error_traceback["traceback"] = traceback.format_exc()

        thread = threading.Thread(target=execute_forward_thread)
        thread.start()
        # Allocate the (optionally pinned) CPU destination tensor while the
        # forward pass runs on the worker thread.
        latent_batch_size = _infer_latent_batch_size(batch)
        samples = torch.empty(
            (latent_batch_size, 3, sampling_param.num_frames, sampling_param.height, sampling_param.width),
            device='cpu',
            pin_memory=fastvideo_args.pin_cpu_memory)
        thread.join()

        if thread_error["error"] is not None:
            raise RuntimeError("Forward execution thread failed.\n"
                               f"{thread_error_traceback['traceback']}") from thread_error["error"]

        output_batch = result_container["output_batch"]
        if output_batch.output is None:
            raise RuntimeError("Forward execution returned no output tensor. "
                               "This usually means the executor/pipeline failed earlier.")

        # Fast path copies into the pre-allocated (pinned) tensor; the slow
        # path falls back to a plain .cpu() transfer on shape mismatch.
        if output_batch.output.shape == samples.shape:
            samples.copy_(output_batch.output)
        else:
            logger.warning("Output shape %s does not match expected shape %s; use slow path", output_batch.output.shape,
                           samples.shape)
            samples = output_batch.output.cpu()
        logging_info = output_batch.logging_info

        gen_time = time.perf_counter() - start_time
        logger.info("Generated successfully in %.2f seconds", gen_time)

        # Process outputs: (b, c, t, h, w) -> per-timestep uint8 RGB frames
        videos = rearrange(samples, "b c t h w -> t b c h w")
        frames = []
        for x in videos:
            x = torchvision.utils.make_grid(x, nrow=6)
            x = x.permute(1, 2, 0).squeeze(-1)
            x = (x * 255).to(torch.uint8)
            frames.append(x.cpu().numpy())

        # Save output if requested
        if batch.save_video:
            if self._is_image_workload():
                # Image workloads (t2i, i2i, …): save the first frame as PNG.
                imageio.imwrite(output_path, frames[0])
                logger.info("Saved image to %s", output_path)
            else:
                imageio.mimsave(output_path, frames, fps=batch.fps, format="mp4")
                logger.info("Saved video to %s", output_path)
                # Best-effort audio mux; a failure keeps the silent video.
                audio = output_batch.extra.get("audio")
                audio_sample_rate = output_batch.extra.get("audio_sample_rate")
                if (audio is not None and audio_sample_rate is not None
                        and not self._mux_audio(output_path, audio, audio_sample_rate)):
                    logger.warning("Audio mux failed; saved video without audio.")

        result: dict[str, Any] = {
            "prompts": prompt,
            "samples": samples if batch.return_frames else None,
            "frames": frames if batch.return_frames else None,
            "audio": output_batch.extra.get("audio") if batch.return_frames else None,
            "size": (target_height, target_width, batch.num_frames),
            "generation_time": gen_time,
            "logging_info": logging_info,
            "trajectory": output_batch.trajectory_latents,
            "trajectory_timesteps": output_batch.trajectory_timesteps,
            "trajectory_decoded": output_batch.trajectory_decoded,
            "video_path": output_path if batch.save_video else None,
            "peak_memory_mb": output_batch.extra.get("peak_memory_mb"),
        }

        return result
673
+
674
+ @staticmethod
675
+ def _wrap_legacy_result(
676
+ result: dict[str, Any] | list[dict[str, Any]], ) -> GenerationResult | list[GenerationResult]:
677
+ if isinstance(result, list):
678
+ return [GenerationResult.from_legacy_result(item) for item in result]
679
+ return GenerationResult.from_legacy_result(result)
680
+
681
+ @staticmethod
682
+ def _unwrap_typed_result(
683
+ result: GenerationResult | list[GenerationResult], ) -> dict[str, Any] | list[dict[str, Any]]:
684
+ if isinstance(result, list):
685
+ return [item.to_legacy_dict() for item in result]
686
+ return result.to_legacy_dict()
687
+
688
    @staticmethod
    def _mux_audio(
        video_path: str,
        audio: torch.Tensor | np.ndarray,
        sample_rate: int,
    ) -> bool:
        """Mux audio into video using PyAV.

        Re-encodes the already-saved video at ``video_path`` together with the
        audio track (AAC) into a temporary file, then replaces the original.
        Returns True on success and False on any failure (missing PyAV,
        unexpected audio shape, encode errors); callers treat False as "keep
        the silent video".
        """
        try:
            import av
        except ImportError:
            logger.warning("PyAV not installed; cannot mux audio. "
                           "Install with: pip install av")
            return False

        # Normalize the audio to a float32 numpy array.
        if torch.is_tensor(audio):
            audio_np = audio.detach().cpu().float().numpy()
        else:
            audio_np = np.asarray(audio, dtype=np.float32)

        # Canonicalize to (num_samples, num_channels).
        if audio_np.ndim == 1:
            audio_np = audio_np[:, None]
        elif audio_np.ndim == 2:
            # Heuristic: a (channels, samples) layout has a small first dim.
            if audio_np.shape[0] <= 8 and audio_np.shape[1] > audio_np.shape[0]:
                audio_np = audio_np.T
        else:
            logger.warning("Unexpected audio shape %s; skipping mux.", audio_np.shape)
            return False

        # Convert to 16-bit PCM for the intermediate WAV file.
        audio_np = np.clip(audio_np, -1.0, 1.0)
        audio_int16 = (audio_np * 32767.0).astype(np.int16)
        num_channels = audio_int16.shape[1]
        layout = "stereo" if num_channels == 2 else "mono"

        try:
            import wave
            with tempfile.TemporaryDirectory() as tmpdir:
                out_path = os.path.join(tmpdir, "muxed.mp4")
                wav_path = os.path.join(tmpdir, "audio.wav")

                # Write audio to WAV file
                with wave.open(wav_path, "wb") as wav_file:
                    wav_file.setnchannels(num_channels)
                    wav_file.setsampwidth(2)
                    wav_file.setframerate(sample_rate)
                    wav_file.writeframes(audio_int16.tobytes())

                # Open input video and audio
                input_video = av.open(video_path)
                input_audio = av.open(wav_path)

                # Create output with both streams
                output = av.open(out_path, mode="w")

                # Add video stream (copy codec from input)
                in_video_stream = input_video.streams.video[0]
                out_video_stream = output.add_stream(
                    codec_name=in_video_stream.codec_context.name,
                    rate=in_video_stream.average_rate,
                )
                out_video_stream.width = in_video_stream.width
                out_video_stream.height = in_video_stream.height
                out_video_stream.pix_fmt = in_video_stream.pix_fmt

                # Add audio stream (AAC)
                out_audio_stream = output.add_stream("aac", rate=sample_rate)
                out_audio_stream.layout = layout

                # Remux video (decode and re-encode to be safe)
                for frame in input_video.decode(video=0):
                    for packet in out_video_stream.encode(frame):
                        output.mux(packet)
                # Flush any buffered video packets.
                for packet in out_video_stream.encode():
                    output.mux(packet)

                # Encode audio
                for frame in input_audio.decode(audio=0):
                    frame.pts = None  # Let encoder assign PTS
                    for packet in out_audio_stream.encode(frame):
                        output.mux(packet)
                # Flush any buffered audio packets.
                for packet in out_audio_stream.encode():
                    output.mux(packet)

                input_video.close()
                input_audio.close()
                output.close()
                # Replace the original only after a fully successful mux.
                shutil.move(out_path, video_path)
                return True
        except Exception as e:
            logger.warning("Audio mux failed: %s", e)
            return False
778
+
779
+ def set_lora_adapter(self, lora_nickname: str, lora_path: str | None = None) -> None:
780
+ self.executor.set_lora_adapter(lora_nickname, lora_path)
781
+
782
+ def unmerge_lora_weights(self) -> None:
783
+ """
784
+ Use unmerged weights for inference to produce videos that align with
785
+ validation videos generated during training.
786
+ """
787
+ self.executor.unmerge_lora_weights()
788
+
789
+ def merge_lora_weights(self) -> None:
790
+ self.executor.merge_lora_weights()
791
+
792
+ def shutdown(self) -> None:
793
+ """
794
+ Shutdown the video generator.
795
+ """
796
+ self.executor.shutdown()
797
+ del self.executor
standalone_inference/overlay_files/fastvideo/fastvideo_args.py ADDED
@@ -0,0 +1,1188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Inspired by SGLang: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/server_args.py
3
+ """The arguments of FastVideo Inference."""
4
+ import argparse
5
+ import dataclasses
6
+ import json
7
+ from contextlib import contextmanager
8
+ from dataclasses import field
9
+ from enum import Enum
10
+ from typing import Any, TYPE_CHECKING
11
+
12
+ from fastvideo.configs.configs import PreprocessConfig
13
+ from fastvideo.configs.pipelines.base import PipelineConfig
14
+ from fastvideo.configs.utils import clean_cli_args
15
+ from fastvideo.layers.quantization import QUANTIZATION_METHODS, QuantizationMethods
16
+ from fastvideo.logger import init_logger
17
+ from fastvideo.utils import FlexibleArgumentParser, StoreBoolean
18
+
19
+ if TYPE_CHECKING:
20
+ from ray.runtime_env import RuntimeEnv
21
+ from ray.util.placement_group import PlacementGroup
22
+ else:
23
+ RuntimeEnv = Any
24
+ PlacementGroup = Any
25
+
26
+ logger = init_logger(__name__)
27
+
28
+
29
class ExecutionMode(str, Enum):
    """
    Enumeration of pipeline execution modes.

    Inherits from str so members compare equal to plain strings, keeping
    older string-based callers working.
    """
    INFERENCE = "inference"
    PREPROCESS = "preprocess"
    FINETUNING = "finetuning"
    DISTILLATION = "distillation"

    @classmethod
    def from_string(cls, value: str) -> "ExecutionMode":
        """Parse a case-insensitive string into an ExecutionMode."""
        try:
            return cls(value.lower())
        except ValueError:
            valid = ', '.join([m.value for m in cls])
            raise ValueError(f"Invalid mode: {value}. Must be one of: {valid}") from None

    @classmethod
    def choices(cls) -> list[str]:
        """All valid mode strings, suitable for argparse choices."""
        return [member.value for member in cls]
52
+
53
+
54
class WorkloadType(str, Enum):
    """
    Enumeration of supported workload types.

    Inherits from str so members compare equal to plain strings, keeping
    older string-based callers working.
    """
    I2V = "i2v"  # Image to Video
    T2V = "t2v"  # Text to Video
    T2I = "t2i"  # Text to Image
    I2I = "i2i"  # Image to Image

    @classmethod
    def from_string(cls, value: str) -> "WorkloadType":
        """Parse a case-insensitive string into a WorkloadType."""
        try:
            return cls(value.lower())
        except ValueError:
            valid = ', '.join([m.value for m in cls])
            raise ValueError(
                f"Invalid workload type: {value}. Must be one of: {valid}") from None

    @classmethod
    def choices(cls) -> list[str]:
        """All valid workload strings, suitable for argparse choices."""
        return [member.value for member in cls]
78
+
79
+
80
+ # args for fastvideo framework
81
+ @dataclasses.dataclass
82
+ class FastVideoArgs:
83
+ # Model and path configuration (for convenience)
84
+ model_path: str
85
+
86
+ # Running mode
87
+ mode: ExecutionMode = ExecutionMode.INFERENCE
88
+
89
+ # Workload type
90
+ workload_type: WorkloadType = WorkloadType.T2V
91
+
92
+ # Distributed executor backend
93
+ distributed_executor_backend: str = "mp"
94
+
95
+ # a few attributes for ray related
96
+ ray_placement_group: PlacementGroup | None = None
97
+ ray_runtime_env: RuntimeEnv | None = None
98
+
99
+ inference_mode: bool = True # if False == training mode
100
+
101
+ # HuggingFace specific parameters
102
+ trust_remote_code: bool = False
103
+ revision: str | None = None
104
+
105
+ # Parallelism
106
+ num_gpus: int = 1
107
+ tp_size: int = -1
108
+ sp_size: int = -1
109
+ hsdp_replicate_dim: int = 1
110
+ hsdp_shard_dim: int = -1
111
+ dist_timeout: int | None = None # timeout for torch.distributed
112
+
113
+ pipeline_config: PipelineConfig = field(default_factory=PipelineConfig)
114
+ preprocess_config: PreprocessConfig | None = None
115
+
116
+ # LoRA parameters
117
+ # (Wenxuan) prefer to keep it here instead of in pipeline config to not make it complicated.
118
+ lora_path: str | None = None
119
+ lora_nickname: str = "default" # for swapping adapters in the pipeline
120
+ # can restrict layers to adapt, e.g. ["q_proj"]
121
+ # Will adapt only q, k, v, o by default.
122
+ lora_target_modules: list[str] | None = None
123
+
124
+ output_type: str = "pil"
125
+
126
+ # CPU offload parameters
127
+ dit_cpu_offload: bool = True
128
+ use_fsdp_inference: bool = False
129
+ dit_layerwise_offload: bool = True
130
+ text_encoder_cpu_offload: bool = True
131
+ image_encoder_cpu_offload: bool = True
132
+ vae_cpu_offload: bool = True
133
+ pin_cpu_memory: bool = True
134
+
135
+ # Compilation
136
+ enable_torch_compile: bool = False
137
+ torch_compile_kwargs: dict[str, Any] = field(default_factory=dict)
138
+
139
+ disable_autocast: bool = False
140
+
141
+ # VSA parameters
142
+ VSA_sparsity: float = 0.0 # inference/validation sparsity
143
+
144
+ # V-MoBA parameters
145
+ moba_config_path: str | None = None
146
+ moba_config: dict[str, Any] = field(default_factory=dict)
147
+
148
+ # Master port for distributed training/inference
149
+ master_port: int | None = None
150
+
151
+ # Stage verification
152
+ enable_stage_verification: bool = True
153
+
154
+ # Prompt text file for batch processing
155
+ prompt_txt: str | None = None
156
+
157
+ # LTX-2 VAE tiling overrides
158
+ ltx2_vae_tiling: bool | None = None
159
+ ltx2_vae_spatial_tile_size_in_pixels: int | None = None
160
+ ltx2_vae_spatial_tile_overlap_in_pixels: int | None = None
161
+ ltx2_vae_temporal_tile_size_in_frames: int | None = None
162
+ ltx2_vae_temporal_tile_overlap_in_frames: int | None = None
163
+ ltx2_initial_latent_path: str | None = None
164
+
165
+ # model paths for correct deallocation
166
+ model_paths: dict[str, str] = field(default_factory=dict)
167
+ model_loaded: dict[str, bool] = field(default_factory=lambda: {
168
+ "transformer": True,
169
+ "vae": True,
170
+ "upsampler": True,
171
+ })
172
+
173
+ override_text_encoder_safetensors: str | None = None # path to safetensors file for text encoder override
174
+ override_text_encoder_quant: QuantizationMethods = None
175
+ transformer_quant: QuantizationMethods = None
176
+
177
+ override_transformer_cls_name: str | None = None
178
+ init_weights_from_safetensors: str = "" # path to safetensors file for initial weight loading
179
+ init_weights_from_safetensors_2: str = "" # path to safetensors file for initial weight loading for transformer_2
180
+
181
+ override_pipeline_cls_name: str | None = None
182
+
183
+ # # DMD parameters
184
+ # dmd_denoising_steps: List[int] | None = field(default=None)
185
+
186
+ # MoE parameters used by Wan2.2
187
+ boundary_ratio: float = 0.875
188
+
189
+ @property
190
+ def training_mode(self) -> bool:
191
+ return not self.inference_mode
192
+
193
    def __post_init__(self):
        """Load the optional V-MoBA config, apply LTX-2 overrides, then validate."""
        if self.moba_config_path:
            try:
                with open(self.moba_config_path) as f:
                    self.moba_config = json.load(f)
                logger.info("Loaded V-MoBA config from %s", self.moba_config_path)
            except (FileNotFoundError, json.JSONDecodeError) as e:
                # Log and re-raise: a requested-but-broken config is fatal.
                logger.error("Failed to load V-MoBA config from %s: %s", self.moba_config_path, e)
                raise
        self._apply_ltx2_vae_overrides()
        # Final argument consistency checks (defined elsewhere in this class).
        self.check_fastvideo_args()
204
+
205
+ def __getattr__(self, name: str) -> Any:
206
+ raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
207
+
208
    def _apply_ltx2_vae_overrides(self) -> None:
        """Push CLI-provided LTX-2 VAE tiling overrides into the pipeline/VAE configs."""
        if self.pipeline_config is None:
            return
        vae_config = self.pipeline_config.vae_config
        # True when any tile size/overlap override was supplied.
        has_any = any(value is not None for value in (
            self.ltx2_vae_spatial_tile_size_in_pixels,
            self.ltx2_vae_spatial_tile_overlap_in_pixels,
            self.ltx2_vae_temporal_tile_size_in_frames,
            self.ltx2_vae_temporal_tile_overlap_in_frames,
        ))
        # The explicit tiling flag wins; otherwise any tile override
        # implicitly enables tiling.
        if self.ltx2_vae_tiling is not None and hasattr(self.pipeline_config, "vae_tiling"):
            self.pipeline_config.vae_tiling = self.ltx2_vae_tiling
        elif has_any and hasattr(self.pipeline_config, "vae_tiling"):
            self.pipeline_config.vae_tiling = True

        # Each override is applied only when the VAE config exposes the knob
        # (non-LTX-2 VAE configs lack these attributes).
        if hasattr(vae_config,
                   "ltx2_spatial_tile_size_in_pixels") and self.ltx2_vae_spatial_tile_size_in_pixels is not None:
            vae_config.ltx2_spatial_tile_size_in_pixels = (self.ltx2_vae_spatial_tile_size_in_pixels)
        if hasattr(vae_config,
                   "ltx2_spatial_tile_overlap_in_pixels") and self.ltx2_vae_spatial_tile_overlap_in_pixels is not None:
            vae_config.ltx2_spatial_tile_overlap_in_pixels = (self.ltx2_vae_spatial_tile_overlap_in_pixels)
        if hasattr(vae_config,
                   "ltx2_temporal_tile_size_in_frames") and self.ltx2_vae_temporal_tile_size_in_frames is not None:
            vae_config.ltx2_temporal_tile_size_in_frames = (self.ltx2_vae_temporal_tile_size_in_frames)
        if hasattr(
                vae_config,
                "ltx2_temporal_tile_overlap_in_frames") and self.ltx2_vae_temporal_tile_overlap_in_frames is not None:
            vae_config.ltx2_temporal_tile_overlap_in_frames = (self.ltx2_vae_temporal_tile_overlap_in_frames)
236
+
237
+ @staticmethod
238
+ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
239
+ # Model and path configuration
240
+ parser.add_argument(
241
+ "--model-path",
242
+ type=str,
243
+ help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
244
+ )
245
+
246
+ # Running mode
247
+ parser.add_argument(
248
+ "--mode",
249
+ type=str,
250
+ choices=ExecutionMode.choices(),
251
+ default=FastVideoArgs.mode.value,
252
+ help="The mode to run FastVideo",
253
+ )
254
+
255
+ # Workload type
256
+ parser.add_argument(
257
+ "--workload-type",
258
+ type=str,
259
+ choices=WorkloadType.choices(),
260
+ default=FastVideoArgs.workload_type.value,
261
+ help="The workload type",
262
+ )
263
+
264
+ # distributed_executor_backend
265
+ parser.add_argument(
266
+ "--distributed-executor-backend",
267
+ type=str,
268
+ choices=["mp"],
269
+ default=FastVideoArgs.distributed_executor_backend,
270
+ help="The distributed executor backend to use",
271
+ )
272
+
273
+ parser.add_argument(
274
+ "--inference-mode",
275
+ action=StoreBoolean,
276
+ default=FastVideoArgs.inference_mode,
277
+ help="Whether to use inference mode",
278
+ )
279
+
280
+ # HuggingFace specific parameters
281
+ parser.add_argument(
282
+ "--trust-remote-code",
283
+ action=StoreBoolean,
284
+ default=FastVideoArgs.trust_remote_code,
285
+ help="Trust remote code when loading HuggingFace models",
286
+ )
287
+ parser.add_argument(
288
+ "--revision",
289
+ type=str,
290
+ default=FastVideoArgs.revision,
291
+ help="The specific model version to use (can be a branch name, tag name, or commit id)",
292
+ )
293
+
294
+ # Parallelism
295
+ parser.add_argument(
296
+ "--num-gpus",
297
+ type=int,
298
+ default=FastVideoArgs.num_gpus,
299
+ help="The number of GPUs to use.",
300
+ )
301
+ parser.add_argument(
302
+ "--tp-size",
303
+ type=int,
304
+ default=FastVideoArgs.tp_size,
305
+ help="The tensor parallelism size.",
306
+ )
307
+ parser.add_argument(
308
+ "--sp-size",
309
+ type=int,
310
+ default=FastVideoArgs.sp_size,
311
+ help="The sequence parallelism size.",
312
+ )
313
+ parser.add_argument(
314
+ "--hsdp-replicate-dim",
315
+ type=int,
316
+ default=FastVideoArgs.hsdp_replicate_dim,
317
+ help="The data parallelism size.",
318
+ )
319
+ parser.add_argument(
320
+ "--hsdp-shard-dim",
321
+ type=int,
322
+ default=FastVideoArgs.hsdp_shard_dim,
323
+ help="The data parallelism shards.",
324
+ )
325
+ parser.add_argument(
326
+ "--dist-timeout",
327
+ type=int,
328
+ default=FastVideoArgs.dist_timeout,
329
+ help="Set timeout for torch.distributed initialization.",
330
+ )
331
+
332
+ # Output type
333
+ parser.add_argument(
334
+ "--output-type",
335
+ type=str,
336
+ default=FastVideoArgs.output_type,
337
+ choices=["pil"],
338
+ help="Output type for the generated video",
339
+ )
340
+
341
+ # Prompt text file for batch processing
342
+ parser.add_argument(
343
+ "--prompt-txt",
344
+ type=str,
345
+ default=FastVideoArgs.prompt_txt,
346
+ help="Path to a text file containing prompts (one per line) for batch processing",
347
+ )
348
+
349
+ # LTX-2 VAE tiling overrides
350
+ parser.add_argument(
351
+ "--ltx2-vae-tiling",
352
+ action=StoreBoolean,
353
+ default=FastVideoArgs.ltx2_vae_tiling,
354
+ help="Enable LTX-2 VAE tiling overrides.",
355
+ )
356
+ parser.add_argument(
357
+ "--ltx2-vae-spatial-tile-size-in-pixels",
358
+ type=int,
359
+ default=FastVideoArgs.ltx2_vae_spatial_tile_size_in_pixels,
360
+ help="LTX-2 VAE spatial tile size in pixels.",
361
+ )
362
+ parser.add_argument(
363
+ "--ltx2-vae-spatial-tile-overlap-in-pixels",
364
+ type=int,
365
+ default=FastVideoArgs.ltx2_vae_spatial_tile_overlap_in_pixels,
366
+ help="LTX-2 VAE spatial tile overlap in pixels.",
367
+ )
368
+ parser.add_argument(
369
+ "--ltx2-vae-temporal-tile-size-in-frames",
370
+ type=int,
371
+ default=FastVideoArgs.ltx2_vae_temporal_tile_size_in_frames,
372
+ help="LTX-2 VAE temporal tile size in frames.",
373
+ )
374
+ parser.add_argument(
375
+ "--ltx2-vae-temporal-tile-overlap-in-frames",
376
+ type=int,
377
+ default=FastVideoArgs.ltx2_vae_temporal_tile_overlap_in_frames,
378
+ help="LTX-2 VAE temporal tile overlap in frames.",
379
+ )
380
+ parser.add_argument(
381
+ "--ltx2-initial-latent-path",
382
+ type=str,
383
+ default=FastVideoArgs.ltx2_initial_latent_path,
384
+ help="Path to load/save a precomputed LTX-2 initial latent.",
385
+ )
386
+
387
+ # LoRA parameters (inference-time adapter loading)
388
+ parser.add_argument(
389
+ "--lora-path",
390
+ type=str,
391
+ default=FastVideoArgs.lora_path,
392
+ help="Path to a LoRA adapter (directory or HF repo id). If set, LoRA will be applied at inference.",
393
+ )
394
+ parser.add_argument(
395
+ "--lora-nickname",
396
+ type=str,
397
+ default=FastVideoArgs.lora_nickname,
398
+ help="Nickname to refer to the loaded LoRA adapter (useful for swapping).",
399
+ )
400
+ parser.add_argument(
401
+ "--lora-target-modules",
402
+ nargs="+",
403
+ type=str,
404
+ default=FastVideoArgs.lora_target_modules,
405
+ help="Optional list of module name substrings to restrict LoRA injection (e.g. q_proj k_proj v_proj).",
406
+ )
407
+
408
+ # BSA runtime control (LongCat)
409
+ parser.add_argument(
410
+ "--enable-bsa",
411
+ action=StoreBoolean,
412
+ help="Enable Block Sparse Attention (BSA) at runtime (overrides config).",
413
+ )
414
+ parser.add_argument(
415
+ "--bsa-sparsity",
416
+ type=float,
417
+ help="BSA sparsity (e.g., 0.9375).",
418
+ )
419
+ parser.add_argument(
420
+ "--bsa-cdf-threshold",
421
+ type=float,
422
+ help="BSA CDF threshold (optional).",
423
+ )
424
+ parser.add_argument(
425
+ "--bsa-chunk-q",
426
+ nargs=3,
427
+ type=int,
428
+ metavar=("T", "H", "W"),
429
+ help="BSA chunk_3d_shape_q as three ints, e.g., 4 4 4.",
430
+ )
431
+ parser.add_argument(
432
+ "--bsa-chunk-k",
433
+ nargs=3,
434
+ type=int,
435
+ metavar=("T", "H", "W"),
436
+ help="BSA chunk_3d_shape_k as three ints, e.g., 4 4 4.",
437
+ )
438
+
439
+ parser.add_argument(
440
+ "--enable-torch-compile",
441
+ action=StoreBoolean,
442
+ default=FastVideoArgs.enable_torch_compile,
443
+ help="Use torch.compile to speed up DiT inference." +
444
+ "However, will likely cause precision drifts. See (https://github.com/pytorch/pytorch/issues/145213)",
445
+ )
446
+ parser.add_argument(
447
+ "--torch-compile-kwargs",
448
+ type=str,
449
+ default=None,
450
+ help=
451
+ "JSON string of kwargs to pass to torch.compile. Example: '{\"backend\":\"inductor\",\"mode\":\"reduce-overhead\"}'",
452
+ )
453
+
454
+ parser.add_argument(
455
+ "--dit-cpu-offload",
456
+ action=StoreBoolean,
457
+ help="Use CPU offload for DiT inference. Enable if run out of memory with FSDP.",
458
+ )
459
+ parser.add_argument(
460
+ "--dit-layerwise-offload",
461
+ action=StoreBoolean,
462
+ help="Enable layerwise CPU offload with async H2D prefetch overlap.",
463
+ )
464
+ parser.add_argument(
465
+ "--use-fsdp-inference",
466
+ action=StoreBoolean,
467
+ help=
468
+ "Use FSDP for inference by sharding the model weights. Latency is very low due to prefetch--enable if run out of memory.",
469
+ )
470
+ parser.add_argument(
471
+ "--text-encoder-cpu-offload",
472
+ action=StoreBoolean,
473
+ help="Use CPU offload for text encoder. Enable if run out of memory.",
474
+ )
475
+ parser.add_argument(
476
+ "--image-encoder-cpu-offload",
477
+ action=StoreBoolean,
478
+ help="Use CPU offload for image encoder. Enable if run out of memory.",
479
+ )
480
+ parser.add_argument(
481
+ "--vae-cpu-offload",
482
+ action=StoreBoolean,
483
+ help="Use CPU offload for VAE. Enable if run out of memory.",
484
+ )
485
+ parser.add_argument(
486
+ "--pin-cpu-memory",
487
+ action=StoreBoolean,
488
+ help=
489
+ "Pin memory for CPU offload. Only added as a temp workaround if it throws \"CUDA error: invalid argument\". "
490
+ "Should be enabled in almost all cases",
491
+ )
492
+ parser.add_argument(
493
+ "--disable-autocast",
494
+ action=StoreBoolean,
495
+ help="Disable autocast for denoising loop and vae decoding in pipeline sampling",
496
+ )
497
+
498
+ # VSA parameters
499
+ parser.add_argument(
500
+ "--VSA-sparsity",
501
+ type=float,
502
+ default=FastVideoArgs.VSA_sparsity,
503
+ help="Validation sparsity for VSA",
504
+ )
505
+
506
+ # Master port for distributed training/inference
507
+ parser.add_argument(
508
+ "--master-port",
509
+ type=int,
510
+ default=FastVideoArgs.master_port,
511
+ help="Master port for distributed training/inference",
512
+ )
513
+
514
+ # Stage verification
515
+ parser.add_argument(
516
+ "--enable-stage-verification",
517
+ action=StoreBoolean,
518
+ default=FastVideoArgs.enable_stage_verification,
519
+ help="Enable input/output verification for pipeline stages",
520
+ )
521
+ parser.add_argument(
522
+ "--override-text-encoder-safetensors",
523
+ type=str,
524
+ default=FastVideoArgs.override_text_encoder_safetensors,
525
+ help="Path to safetensors file for text encoder override",
526
+ )
527
+ parser.add_argument(
528
+ "--override-text-encoder-quant",
529
+ type=str,
530
+ choices=QUANTIZATION_METHODS,
531
+ default=FastVideoArgs.override_text_encoder_quant,
532
+ help="Quantization method for text encoder override",
533
+ )
534
+ parser.add_argument(
535
+ "--transformer-quant",
536
+ type=str,
537
+ choices=QUANTIZATION_METHODS,
538
+ default=FastVideoArgs.transformer_quant,
539
+ help="Quantization method for transformer loading",
540
+ )
541
+ parser.add_argument(
542
+ "--override-transformer-cls-name",
543
+ type=str,
544
+ default=FastVideoArgs.override_transformer_cls_name,
545
+ help="Override transformer cls name",
546
+ )
547
+ parser.add_argument(
548
+ "--override-pipeline-cls-name",
549
+ type=str,
550
+ default=FastVideoArgs.override_pipeline_cls_name,
551
+ help="Override pipeline cls name",
552
+ )
553
+ parser.add_argument("--init-weights-from-safetensors",
554
+ type=str,
555
+ help="Path to safetensors file for initial weight loading")
556
+ parser.add_argument("--init-weights-from-safetensors-2",
557
+ type=str,
558
+ help="Path to safetensors file for initial weight loading")
559
+
560
+ # Add pipeline configuration arguments
561
+ PipelineConfig.add_cli_args(parser)
562
+
563
+ # Add preprocessing configuration arguments
564
+ PreprocessConfig.add_cli_args(parser)
565
+
566
+ return parser
567
+
568
+ @classmethod
569
+ def from_cli_args(cls, args: argparse.Namespace) -> "FastVideoArgs":
570
+ provided_args = clean_cli_args(args)
571
+ # Get all fields from the dataclass
572
+ attrs = [attr.name for attr in dataclasses.fields(cls)]
573
+
574
+ # Create a dictionary of attribute values, with defaults for missing attributes
575
+ kwargs: dict[str, Any] = {}
576
+ for attr in attrs:
577
+ if attr == 'pipeline_config':
578
+ pipeline_config = PipelineConfig.from_kwargs(provided_args)
579
+ kwargs['pipeline_config'] = pipeline_config
580
+ elif attr == 'preprocess_config':
581
+ preprocess_config = PreprocessConfig.from_kwargs(provided_args)
582
+ kwargs['preprocess_config'] = preprocess_config
583
+ elif attr == 'mode':
584
+ # Convert string to ExecutionMode enum
585
+ mode_value = getattr(args, attr, FastVideoArgs.mode.value)
586
+ kwargs['mode'] = ExecutionMode.from_string(mode_value) if isinstance(mode_value, str) else mode_value
587
+ elif attr == 'torch_compile_kwargs':
588
+ # Parse JSON string for torch.compile kwargs
589
+ torch_compile_kwargs_str = getattr(args, 'torch_compile_kwargs', None)
590
+ if torch_compile_kwargs_str:
591
+ try:
592
+ import json
593
+ kwargs['torch_compile_kwargs'] = json.loads(torch_compile_kwargs_str)
594
+ except json.JSONDecodeError as e:
595
+ raise ValueError(f"Invalid JSON for torch_compile_kwargs: {e}") from e
596
+ else:
597
+ kwargs['torch_compile_kwargs'] = {}
598
+ elif attr == 'workload_type':
599
+ # Convert string to WorkloadType enum
600
+ workload_type_value = getattr(args, 'workload_type', FastVideoArgs.workload_type.value)
601
+ kwargs['workload_type'] = WorkloadType.from_string(workload_type_value) if isinstance(
602
+ workload_type_value, str) else workload_type_value
603
+ # Use getattr with default value from the dataclass for potentially missing attributes
604
+ else:
605
+ # Get the field to check if it has a default_factory
606
+ field = dataclasses.fields(cls)[next(i for i, f in enumerate(dataclasses.fields(cls))
607
+ if f.name == attr)]
608
+ if field.default_factory is not dataclasses.MISSING:
609
+ # Use the default_factory to create the default value
610
+ default_value = field.default_factory()
611
+ else:
612
+ default_value = getattr(cls, attr, None)
613
+ value = getattr(args, attr, default_value)
614
+ kwargs[attr] = value # type: ignore
615
+
616
+ return cls(**kwargs) # type: ignore
617
+
618
+ @classmethod
619
+ def from_kwargs(cls, **kwargs: Any) -> "FastVideoArgs":
620
+ # Convert mode string to enum if necessary
621
+ if 'mode' in kwargs and isinstance(kwargs['mode'], str):
622
+ kwargs['mode'] = ExecutionMode.from_string(kwargs['mode'])
623
+
624
+ # Convert workload_type string to enum if necessary
625
+ if 'workload_type' in kwargs and isinstance(kwargs['workload_type'], str):
626
+ kwargs['workload_type'] = WorkloadType.from_string(kwargs['workload_type'])
627
+
628
+ kwargs['pipeline_config'] = PipelineConfig.from_kwargs(kwargs)
629
+ kwargs['preprocess_config'] = PreprocessConfig.from_kwargs(kwargs)
630
+ # Filter to only FastVideoArgs dataclass fields — pipeline-specific CLI
631
+ # args (e.g. enable_bsa, bsa_sparsity) live in PipelineConfig and must
632
+ # not be forwarded to the FastVideoArgs constructor.
633
+ valid_fields = {f.name for f in dataclasses.fields(cls)}
634
+ return cls(**{k: v for k, v in kwargs.items() if k in valid_fields})
635
+
636
    def check_fastvideo_args(self) -> None:
        """Validate inference arguments for consistency.

        Mutates the config in place: normalizes mode/inference_mode, resolves
        ``-1`` parallelism placeholders, and validates the nested pipeline (and,
        for PREPROCESS mode, preprocess) configs. Raises AssertionError or
        ValueError on inconsistent settings.
        """
        from fastvideo.platforms import current_platform

        # FSDP/layerwise offload are not supported on Apple MPS.
        if current_platform.is_mps():
            self.use_fsdp_inference = False
            self.dit_layerwise_offload = False

        # Layerwise offload is mutually exclusive with FSDP and whole-model offload.
        if self.dit_layerwise_offload:
            if self.use_fsdp_inference:
                logger.warning("dit_layerwise_offload is enabled, automatically disabling use_fsdp_inference.")
                self.use_fsdp_inference = False
            if self.dit_cpu_offload:
                logger.warning("dit_layerwise_offload is enabled, automatically disabling dit_cpu_offload.")
                self.dit_cpu_offload = False

        # Validate mode and inference_mode consistency
        assert isinstance(self.mode, ExecutionMode), f"Mode must be an ExecutionMode enum, got {type(self.mode)}"
        assert self.mode in ExecutionMode.choices(), f"Invalid execution mode: {self.mode}"

        # Validate workload type
        assert isinstance(self.workload_type,
                          WorkloadType), f"Workload type must be a WorkloadType enum, got {type(self.workload_type)}"
        assert self.workload_type in WorkloadType.choices(), f"Invalid workload type: {self.workload_type}"

        # The execution mode wins over an inconsistent inference_mode flag.
        if self.mode in [ExecutionMode.DISTILLATION, ExecutionMode.FINETUNING] and self.inference_mode:
            logger.warning("Mode is 'training' but inference_mode is True. Setting inference_mode to False.")
            self.inference_mode = False
        elif self.mode in [ExecutionMode.INFERENCE, ExecutionMode.PREPROCESS] and not self.inference_mode:
            logger.warning("Mode is '%s' but inference_mode is False. Setting inference_mode to True.", self.mode)
            self.inference_mode = True

        # Training requires all parallelism dims to be set explicitly (-1 = unset).
        if not self.inference_mode:
            assert self.hsdp_replicate_dim != -1, "hsdp_replicate_dim must be set for training"
            assert self.hsdp_shard_dim != -1, "hsdp_shard_dim must be set for training"
            assert self.sp_size != -1, "sp_size must be set for training"

        # Resolve -1 placeholders to defaults (inference path reaches here with them).
        if self.tp_size == -1:
            self.tp_size = 1
        if self.sp_size == -1:
            self.sp_size = self.num_gpus
        if self.hsdp_shard_dim == -1:
            self.hsdp_shard_dim = self.num_gpus

        assert self.sp_size <= self.num_gpus and self.num_gpus % self.sp_size == 0, "num_gpus must >= and be divisible by sp_size"
        assert self.hsdp_replicate_dim <= self.num_gpus and self.num_gpus % self.hsdp_replicate_dim == 0, "num_gpus must >= and be divisible by hsdp_replicate_dim"
        assert self.hsdp_shard_dim <= self.num_gpus and self.num_gpus % self.hsdp_shard_dim == 0, "num_gpus must >= and be divisible by hsdp_shard_dim"

        # Grow num_gpus to cover the requested parallelism if it was set too low.
        if self.num_gpus < max(self.tp_size, self.sp_size):
            self.num_gpus = max(self.tp_size, self.sp_size)

        if self.pipeline_config is None:
            raise ValueError("pipeline_config is not set in FastVideoArgs")

        self.pipeline_config.check_pipeline_config()

        # Add preprocessing config validation if needed
        if self.mode == ExecutionMode.PREPROCESS:
            if self.preprocess_config is None:
                raise ValueError("preprocess_config is not set in FastVideoArgs when mode is PREPROCESS")
            if self.preprocess_config.model_path == "":
                self.preprocess_config.model_path = self.model_path
            # Preprocessing always needs the VAE encoder loaded.
            if not self.pipeline_config.vae_config.load_encoder:
                self.pipeline_config.vae_config.load_encoder = True
            self.preprocess_config.check_preprocess_config()
701
+
702
+
703
# Process-global holder for the active FastVideoArgs; written by
# prepare_fastvideo_args() / set_current_fastvideo_args() and read by
# get_current_fastvideo_args().
_current_fastvideo_args = None
704
+
705
+
706
def prepare_fastvideo_args(argv: list[str]) -> FastVideoArgs:
    """Parse command-line arguments into a FastVideoArgs and make it current.

    Args:
        argv: The command line arguments. Typically, it should be `sys.argv[1:]`
            to ensure compatibility with `parse_args` when no arguments are passed.

    Returns:
        The inference arguments.
    """
    global _current_fastvideo_args

    arg_parser = FlexibleArgumentParser()
    FastVideoArgs.add_cli_args(arg_parser)
    namespace = arg_parser.parse_args(argv)

    parsed_args = FastVideoArgs.from_cli_args(namespace)
    _current_fastvideo_args = parsed_args
    return parsed_args
724
+
725
+
726
@contextmanager
def set_current_fastvideo_args(fastvideo_args: FastVideoArgs):
    """
    Temporarily set the current fastvideo config.
    Used during model initialization.
    We save the current fastvideo config in a global variable,
    so that all modules can access it, e.g. custom ops
    can access the fastvideo config to determine how to dispatch.
    """
    global _current_fastvideo_args
    previous_args = _current_fastvideo_args
    _current_fastvideo_args = fastvideo_args
    try:
        yield
    finally:
        # Always restore, even if the body raised.
        _current_fastvideo_args = previous_args
742
+
743
+
744
def get_current_fastvideo_args() -> FastVideoArgs:
    """Return the process-global FastVideoArgs, raising if none is set."""
    current = _current_fastvideo_args
    if current is None:
        # in ci, usually when we test custom ops/modules directly,
        # we don't set the fastvideo config. In that case, we set a default
        # config.
        # TODO(will): may need to handle this for CI.
        raise ValueError("Current fastvideo args is not set.")
    return current
752
+
753
+
754
@dataclasses.dataclass
class TrainingArgs(FastVideoArgs):
    """
    Training arguments. Inherits from FastVideoArgs and adds training-specific
    arguments. If there are any conflicts, the training arguments will take
    precedence.
    """
    # Dataset / dataloader
    data_path: str = ""
    dataloader_num_workers: int = 0
    num_height: int = 0
    num_width: int = 0
    num_frames: int = 0

    train_batch_size: int = 0
    num_latent_t: int = 0
    group_frame: bool = False
    group_resolution: bool = False

    # text encoder & vae & diffusion model
    pretrained_model_name_or_path: str = ""

    # DMD model paths - separate paths for each network
    real_score_model_path: str = ""  # path for real score (teacher) model
    fake_score_model_path: str = ""  # path for fake score (critic) model

    # diffusion setting
    ema_decay: float = 0.0
    ema_start_step: int = 0
    training_cfg_rate: float = 0.0
    precondition_outputs: bool = False

    # validation & logs
    validation_dataset_file: str = ""
    validation_preprocessed_path: str = ""
    validation_sampling_steps: str = ""
    validation_guidance_scale: str = ""
    validation_steps: float = 0.0
    log_validation: bool = False
    trackers: list[str] = dataclasses.field(default_factory=list)
    tracker_project_name: str = ""
    wandb_run_name: str = ""
    seed: int = 0
    _loading_teacher_critic_model: bool = False

    # output
    output_dir: str = ""
    checkpoints_total_limit: int = 0
    resume_from_checkpoint: str = ""  # specify the checkpoint folder to resume from

    # optimizer & scheduler
    num_train_epochs: int = 0
    max_train_steps: int = 0
    gradient_accumulation_steps: int = 0
    learning_rate: float = 0.0
    scale_lr: bool = False
    lr_scheduler: str = "constant"
    lr_warmup_steps: int = 0
    max_grad_norm: float = 0.0
    enable_gradient_checkpointing_type: str | None = None
    selective_checkpointing: float = 0.0
    mixed_precision: str = ""
    train_sp_batch_size: int = 0
    # NOTE: 'startegy' misspelling kept — renaming would break existing callers/configs.
    fsdp_sharding_startegy: str = ""

    # flow-matching loss weighting (see --weighting_scheme CLI choices)
    weighting_scheme: str = ""
    logit_mean: float = 0.0
    logit_std: float = 1.0
    mode_scale: float = 0.0

    num_euler_timesteps: int = 0
    lr_num_cycles: int = 0
    lr_power: float = 0.0
    min_lr_ratio: float = 0.5  # minimum learning rate ratio for cosine_with_min_lr scheduler
    not_apply_cfg_solver: bool = False
    distill_cfg: float = 0.0
    scheduler_type: str = ""
    linear_quadratic_threshold: float = 0.0
    linear_range: float = 0.0
    weight_decay: float = 0.0
    betas: str = "0.9,0.999"  # betas for optimizer, format: "beta1,beta2"
    use_ema: bool = False
    multi_phased_distill_schedule: str = ""
    pred_decay_weight: float = 0.0
    pred_decay_type: str = ""
    hunyuan_teacher_disable_cfg: bool = False

    # master_weight_type
    master_weight_type: str = ""

    # VSA training decay parameters
    VSA_decay_rate: float = 0.01  # decay rate -> 0.02
    VSA_decay_interval_steps: int = 1  # decay interval steps -> 50
    VSA_init_sparsity: float = 0.0  # initial sparsity (default 0, ramp from 0)
    VSA_warmup_steps: int = 0  # keep init_sparsity for this many steps before ramping

    # LoRA training parameters
    lora_rank: int | None = None
    lora_alpha: int | None = None
    lora_training: bool = False
    ltx2_first_frame_conditioning_p: float = 0.1

    # distillation args
    generator_update_interval: int = 5
    dfake_gen_update_ratio: int = 5  # self-forcing: how often to train generator vs critic
    min_timestep_ratio: float = 0.2
    max_timestep_ratio: float = 0.98
    real_score_guidance_scale: float = 3.5
    fake_score_learning_rate: float = 0.0  # separate learning rate for fake_score_transformer, if 0.0, use learning_rate
    fake_score_lr_scheduler: str = "constant"  # separate lr scheduler for fake_score_transformer, if not set, use lr_scheduler
    fake_score_betas: str = "0.9,0.999"  # betas for fake score optimizer, format: "beta1,beta2"
    training_state_checkpointing_steps: int = 0  # for resuming training
    weight_only_checkpointing_steps: int = 0  # for inference
    log_visualization: bool = False
    visualization_steps: int = 0
    # simulate generator forward to match inference
    simulate_generator_forward: bool = False
    warp_denoising_step: bool = False
    generator_4bit_attn: bool = False
    generator_4bit_linear: bool = False

    # Self-forcing specific arguments
    num_frame_per_block: int = 3
    independent_first_frame: bool = False
    enable_gradient_masking: bool = True
    gradient_mask_last_n_frames: int = 21
    same_step_across_blocks: bool = False  # Use same exit timestep for all blocks
    last_step_only: bool = False  # Only use the last timestep for training
    context_noise: int = 0  # Context noise level for cache updates
883
+ @classmethod
884
+ def from_cli_args(cls, args: argparse.Namespace) -> "TrainingArgs":
885
+ provided_args = clean_cli_args(args)
886
+ # Get all fields from the dataclass
887
+ attrs = [attr.name for attr in dataclasses.fields(cls)]
888
+ logger.info(provided_args)
889
+ # Create a dictionary of attribute values, with defaults for missing attributes
890
+ kwargs: dict[str, Any] = {}
891
+ for attr in attrs:
892
+ if attr == 'pipeline_config':
893
+ pipeline_config = PipelineConfig.from_kwargs(provided_args)
894
+ kwargs[attr] = pipeline_config
895
+ elif attr == 'mode':
896
+ # Convert string to ExecutionMode enum
897
+ mode_value = getattr(args, attr, ExecutionMode.FINETUNING.value)
898
+ kwargs[attr] = ExecutionMode.from_string(mode_value) if isinstance(mode_value, str) else mode_value
899
+ elif attr == 'workload_type':
900
+ # Convert string to WorkloadType enum
901
+ workload_type_value = getattr(args, 'workload_type', WorkloadType.T2V.value)
902
+ kwargs[attr] = WorkloadType.from_string(workload_type_value) if isinstance(workload_type_value,
903
+ str) else workload_type_value
904
+ # Use getattr with default value from the dataclass for potentially missing attributes
905
+ else:
906
+ # Get the field to check its default value
907
+ field = dataclasses.fields(cls)[next(i for i, f in enumerate(dataclasses.fields(cls))
908
+ if f.name == attr)]
909
+
910
+ # Check if the attribute is provided in args
911
+ if hasattr(args, attr):
912
+ value = getattr(args, attr)
913
+ else:
914
+ # Use the field's default value
915
+ if field.default_factory is not dataclasses.MISSING:
916
+ value = field.default_factory()
917
+ elif field.default is not dataclasses.MISSING:
918
+ value = field.default
919
+ else:
920
+ # No default value, use None
921
+ value = None
922
+
923
+ kwargs[attr] = value
924
+
925
+ return cls(**kwargs) # type: ignore
926
+
927
+ @staticmethod
928
+ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
929
+ parser.add_argument("--data-path", type=str, required=True, help="Path to parquet files")
930
+ parser.add_argument("--dataloader-num-workers",
931
+ type=int,
932
+ required=True,
933
+ help="Number of workers for dataloader")
934
+ parser.add_argument("--num-height", type=int, required=True, help="Number of heights")
935
+ parser.add_argument("--num-width", type=int, required=True, help="Number of widths")
936
+ parser.add_argument("--num-frames", type=int, required=True, help="Number of frames")
937
+
938
+ # Training batch and model configuration
939
+ parser.add_argument("--train-batch-size", type=int, required=True, help="Training batch size")
940
+ parser.add_argument("--num-latent-t", type=int, required=True, help="Number of latent time steps")
941
+ parser.add_argument("--group-frame", action=StoreBoolean, help="Whether to group frames during training")
942
+ parser.add_argument("--group-resolution",
943
+ action=StoreBoolean,
944
+ help="Whether to group resolutions during training")
945
+
946
+ # Model paths
947
+ parser.add_argument("--pretrained-model-name-or-path",
948
+ type=str,
949
+ required=True,
950
+ help="Path to pretrained model or model name")
951
+ parser.add_argument("--dit-model-name-or-path",
952
+ type=str,
953
+ required=False,
954
+ help="Path to DiT model or model name")
955
+ parser.add_argument("--cache-dir", type=str, help="Directory to cache models")
956
+
957
+ # DMD model paths - separate paths for each network
958
+ parser.add_argument("--generator-model-path",
959
+ type=str,
960
+ help="Path to generator (student) model for DMD distillation")
961
+ parser.add_argument("--real-score-model-path",
962
+ type=str,
963
+ help="Path to real score (teacher) model for DMD distillation")
964
+ parser.add_argument("--fake-score-model-path",
965
+ type=str,
966
+ help="Path to fake score (critic) model for DMD distillation")
967
+
968
+ # Diffusion settings
969
+ parser.add_argument("--ema-decay", type=float, default=0.999, help="EMA decay rate")
970
+ parser.add_argument("--ema-start-step", type=int, default=0, help="Step to start EMA")
971
+ parser.add_argument("--training-cfg-rate", type=float, help="Classifier-free guidance scale")
972
+ parser.add_argument("--precondition-outputs",
973
+ action=StoreBoolean,
974
+ help="Whether to precondition the outputs of the model")
975
+
976
+ # Validation and logging
977
+ parser.add_argument("--validation-dataset-file", type=str, help="Path to unprocessed validation dataset")
978
+ parser.add_argument("--validation-preprocessed-path", type=str, help="Path to processed validation dataset")
979
+ parser.add_argument("--validation-sampling-steps", type=str, help="Validation sampling steps")
980
+ parser.add_argument("--validation-guidance-scale", type=str, help="Validation guidance scale")
981
+ parser.add_argument("--validation-steps", type=float, help="Number of validation steps")
982
+ parser.add_argument("--log-validation", action=StoreBoolean, help="Whether to log validation results")
983
+ parser.add_argument("--visualization-steps", type=int, help="Number of visualization steps")
984
+ parser.add_argument("--tracker-project-name", type=str, help="Project name for tracking")
985
+ parser.add_argument("--wandb-run-name", type=str, help="Run name for wandb")
986
+ parser.add_argument("--seed", type=int, default=42, help="Seed for deterministic training")
987
+
988
+ # Output configuration
989
+ parser.add_argument("--output-dir", type=str, required=True, help="Output directory for checkpoints and logs")
990
+ parser.add_argument("--checkpoints-total-limit", type=int, help="Maximum number of checkpoints to keep")
991
+ parser.add_argument("--training-state-checkpointing-steps",
992
+ type=int,
993
+ help="Steps between training state checkpoints (for resuming training)")
994
+ parser.add_argument("--weight-only-checkpointing-steps",
995
+ type=int,
996
+ help="Steps between weight-only checkpoints (for inference)")
997
+ parser.add_argument("--resume-from-checkpoint", type=str, help="Path to checkpoint to resume from")
998
+ parser.add_argument("--logging-dir", type=str, help="Directory for logging")
999
+
1000
+ # Training configuration
1001
+ parser.add_argument("--num-train-epochs", type=int, help="Number of training epochs")
1002
+ parser.add_argument("--max-train-steps", type=int, help="Maximum number of training steps")
1003
+ parser.add_argument("--gradient-accumulation-steps", type=int, help="Number of steps to accumulate gradients")
1004
+ parser.add_argument("--learning-rate", type=float, required=True, help="Learning rate")
1005
+ parser.add_argument("--scale-lr", action=StoreBoolean, help="Whether to scale learning rate")
1006
+ parser.add_argument("--lr-scheduler", type=str, default="constant", help="Learning rate scheduler type")
1007
+ parser.add_argument("--lr-warmup-steps", type=int, default=10, help="Number of warmup steps for learning rate")
1008
+ parser.add_argument("--max-grad-norm", type=float, help="Maximum gradient norm")
1009
+ parser.add_argument("--enable-gradient-checkpointing-type",
1010
+ type=str,
1011
+ choices=["full", "ops", "block_skip"],
1012
+ default=None,
1013
+ help="Gradient checkpointing type")
1014
+ parser.add_argument("--selective-checkpointing", type=float, help="Selective checkpointing threshold")
1015
+ parser.add_argument("--mixed-precision", type=str, help="Mixed precision training type")
1016
+ parser.add_argument("--train-sp-batch-size", type=int, help="Training spatial parallelism batch size")
1017
+
1018
+ parser.add_argument("--fsdp-sharding-strategy", type=str, help="FSDP sharding strategy")
1019
+
1020
+ parser.add_argument(
1021
+ "--weighting_scheme",
1022
+ type=str,
1023
+ default="uniform",
1024
+ choices=["sigma_sqrt", "logit_normal", "mode", "cosmap", "uniform"],
1025
+ )
1026
+ parser.add_argument(
1027
+ "--logit_mean",
1028
+ type=float,
1029
+ default=0.0,
1030
+ help="mean to use when using the `'logit_normal'` weighting scheme.",
1031
+ )
1032
+ parser.add_argument(
1033
+ "--logit_std",
1034
+ type=float,
1035
+ default=1.0,
1036
+ help="std to use when using the `'logit_normal'` weighting scheme.",
1037
+ )
1038
+ parser.add_argument(
1039
+ "--mode_scale",
1040
+ type=float,
1041
+ default=1.29,
1042
+ help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
1043
+ )
1044
+
1045
+ # Additional training parameters
1046
+ parser.add_argument("--num-euler-timesteps", type=int, help="Number of Euler timesteps")
1047
+ parser.add_argument("--lr-num-cycles", type=int, help="Number of learning rate cycles")
1048
+ parser.add_argument("--lr-power", type=float, help="Learning rate power")
1049
+ parser.add_argument("--min-lr-ratio",
1050
+ type=float,
1051
+ default=TrainingArgs.min_lr_ratio,
1052
+ help="Minimum learning rate ratio for cosine_with_min_lr scheduler")
1053
+ parser.add_argument("--not-apply-cfg-solver", action=StoreBoolean, help="Whether to not apply CFG solver")
1054
+ parser.add_argument("--distill-cfg", type=float, help="Distillation CFG scale")
1055
+ parser.add_argument("--scheduler-type", type=str, help="Scheduler type")
1056
+ parser.add_argument("--linear-quadratic-threshold", type=float, help="Linear quadratic threshold")
1057
+ parser.add_argument("--linear-range", type=float, help="Linear range")
1058
+ parser.add_argument("--weight-decay", type=float, help="Weight decay")
1059
+ parser.add_argument("--betas",
1060
+ type=str,
1061
+ default=TrainingArgs.betas,
1062
+ help="Betas for optimizer (format: 'beta1,beta2')")
1063
+ parser.add_argument("--use-ema", action=StoreBoolean, help="Whether to use EMA")
1064
+ parser.add_argument("--multi-phased-distill-schedule", type=str, help="Multi-phased distillation schedule")
1065
+ parser.add_argument("--pred-decay-weight", type=float, help="Prediction decay weight")
1066
+ parser.add_argument("--pred-decay-type", type=str, help="Prediction decay type")
1067
+ parser.add_argument("--hunyuan-teacher-disable-cfg",
1068
+ action=StoreBoolean,
1069
+ help="Whether to disable CFG for Hunyuan teacher")
1070
+ parser.add_argument("--master-weight-type", type=str, help="Master weight type")
1071
+
1072
+ # VSA parameters for training with dense to sparse adaption
1073
+ parser.add_argument(
1074
+ "--VSA-decay-rate", # decay rate, how much sparsity you want to decay each step
1075
+ type=float,
1076
+ default=TrainingArgs.VSA_decay_rate,
1077
+ help="VSA decay rate")
1078
+ parser.add_argument(
1079
+ "--VSA-decay-interval-steps", # how many steps for training with current sparsity
1080
+ type=int,
1081
+ default=TrainingArgs.VSA_decay_interval_steps,
1082
+ help="VSA decay interval steps")
1083
+ parser.add_argument(
1084
+ "--VSA-init-sparsity",
1085
+ type=float,
1086
+ default=TrainingArgs.VSA_init_sparsity,
1087
+ help="Initial sparsity to start from (default 0)")
1088
+ parser.add_argument(
1089
+ "--VSA-warmup-steps",
1090
+ type=int,
1091
+ default=TrainingArgs.VSA_warmup_steps,
1092
+ help="Keep init sparsity for N steps before ramping (default 0)")
1093
+ parser.add_argument("--lora-training", action=StoreBoolean, help="Whether to use LoRA training")
1094
+ parser.add_argument("--lora-rank", type=int, help="LoRA rank")
1095
+ parser.add_argument("--lora-alpha", type=int, help="LoRA alpha")
1096
+ parser.add_argument(
1097
+ "--ltx2-first-frame-conditioning-p",
1098
+ type=float,
1099
+ default=TrainingArgs.ltx2_first_frame_conditioning_p,
1100
+ help="Probability of conditioning on the first frame during LTX-2 training",
1101
+ )
1102
+
1103
+ # V-MoBA parameters
1104
+ parser.add_argument(
1105
+ "--moba-config-path",
1106
+ type=str,
1107
+ default=None,
1108
+ help="Path to a JSON file containing V-MoBA specific configurations.",
1109
+ )
1110
+
1111
+ # Distillation arguments
1112
+ parser.add_argument("--generator-update-interval",
1113
+ type=int,
1114
+ default=TrainingArgs.generator_update_interval,
1115
+ help="Ratio of student updates to critic updates.")
1116
+ parser.add_argument(
1117
+ "--dfake-gen-update-ratio",
1118
+ type=int,
1119
+ default=TrainingArgs.dfake_gen_update_ratio,
1120
+ help="Self-forcing: How often to train generator vs critic (train generator every N steps).")
1121
+ parser.add_argument("--min-timestep-ratio",
1122
+ type=float,
1123
+ default=TrainingArgs.min_timestep_ratio,
1124
+ help="Minimum step ratio")
1125
+ parser.add_argument("--max-timestep-ratio",
1126
+ type=float,
1127
+ default=TrainingArgs.max_timestep_ratio,
1128
+ help="Maximum step ratio")
1129
+ parser.add_argument("--real-score-guidance-scale",
1130
+ type=float,
1131
+ default=TrainingArgs.real_score_guidance_scale,
1132
+ help="Teacher guidance scale")
1133
+ parser.add_argument("--fake-score-learning-rate",
1134
+ type=float,
1135
+ default=TrainingArgs.fake_score_learning_rate,
1136
+ help="Learning rate for fake score transformer")
1137
+ parser.add_argument("--fake-score-betas",
1138
+ type=str,
1139
+ default=TrainingArgs.fake_score_betas,
1140
+ help="Betas for fake score optimizer (format: 'beta1,beta2')")
1141
+ parser.add_argument("--fake-score-lr-scheduler",
1142
+ type=str,
1143
+ default=TrainingArgs.fake_score_lr_scheduler,
1144
+ help="Learning rate scheduler for fake score transformer")
1145
+ parser.add_argument("--log-visualization", action=StoreBoolean, help="Whether to log visualization")
1146
+ parser.add_argument("--simulate-generator-forward",
1147
+ action=StoreBoolean,
1148
+ help="Whether to simulate generator forward to match inference")
1149
+ parser.add_argument("--warp-denoising-step",
1150
+ action=StoreBoolean,
1151
+ help="Whether to warp denoising step according to the scheduler time shift")
1152
+
1153
+ # Self-forcing specific arguments
1154
+ parser.add_argument("--num-frame-per-block",
1155
+ type=int,
1156
+ default=TrainingArgs.num_frame_per_block,
1157
+ help="Number of frames per block for causal generation")
1158
+ parser.add_argument("--independent-first-frame",
1159
+ action=StoreBoolean,
1160
+ help="Whether the first frame is independent in causal generation")
1161
+ parser.add_argument("--enable-gradient-masking",
1162
+ action=StoreBoolean,
1163
+ help="Whether to enable frame-level gradient masking")
1164
+ parser.add_argument("--gradient-mask-last-n-frames",
1165
+ type=int,
1166
+ default=TrainingArgs.gradient_mask_last_n_frames,
1167
+ help="Number of last frames to enable gradients for")
1168
+ parser.add_argument("--validate-cache-structure",
1169
+ action=StoreBoolean,
1170
+ help="Whether to validate KV cache structure (debug flag)")
1171
+ parser.add_argument("--same-step-across-blocks",
1172
+ action=StoreBoolean,
1173
+ help="Whether to use the same exit timestep for all blocks")
1174
+ parser.add_argument("--last-step-only",
1175
+ action=StoreBoolean,
1176
+ help="Whether to only use the last timestep for training")
1177
+ parser.add_argument("--context-noise",
1178
+ type=int,
1179
+ default=TrainingArgs.context_noise,
1180
+ help="Context noise level for cache updates")
1181
+
1182
+ return parser
1183
+
1184
+
1185
def parse_int_list(value: str) -> list[int]:
    """Parse a comma-separated string of integers into a list.

    Falsy input (empty string) yields an empty list. Blank segments —
    e.g. from a trailing or doubled comma such as ``"1,2,"`` — are
    skipped instead of raising ``ValueError`` from ``int("")``.

    Args:
        value: Comma-separated integers, e.g. ``"1, 2, 3"``.

    Returns:
        The parsed integers, in order of appearance.
    """
    if not value:
        return []
    # strip() tolerates spaces around each number; the truthiness filter
    # skips empty segments so inputs like "1,,2" or "1,2," do not crash.
    return [int(part) for part in (x.strip() for x in value.split(",")) if part]
standalone_inference/overlay_files/fastvideo/forward_context.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Adapted from vllm: https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/forward_context.py
3
+
4
+ import time
5
+ from collections import defaultdict
6
+ from contextlib import contextmanager
7
+ from dataclasses import dataclass
8
+ from typing import TYPE_CHECKING, Optional
9
+
10
+ import torch
11
+
12
+ from fastvideo.logger import init_logger
13
+
14
+ if TYPE_CHECKING:
15
+ from fastvideo.attention import AttentionMetadata
16
+ from fastvideo.pipelines import ForwardBatch
17
+
18
+ logger = init_logger(__name__)
19
+
20
# TODO(will): check if this is needed
# track_batchsize: bool = envs.FASTVIDEO_LOG_BATCHSIZE_INTERVAL >= 0
# Module-level state used by set_forward_context() to optionally record
# per-batchsize forward-pass timings. Tracking is disabled by default.
track_batchsize: bool = False
# perf_counter() timestamps (seconds) of the last stats log and the start
# of the current forward pass, respectively.
last_logging_time: float = 0
forward_start_time: float = 0
# batchsize_logging_interval: float = envs.FASTVIDEO_LOG_BATCHSIZE_INTERVAL
# Minimum number of seconds between emitted timing summaries.
batchsize_logging_interval: float = 1000
# Maps batchsize -> list of recorded forward-pass durations in milliseconds.
batchsize_forward_time: defaultdict = defaultdict(list)
28
+
29
+
30
#
@dataclass
class ForwardContext:
    """Per-forward-pass context made available to model internals.

    An instance is installed as the module-global context by
    set_forward_context() for the duration of one model forward pass and
    retrieved by layers via get_forward_context().
    """
    # Denoising timestep index of the current forward pass.
    current_timestep: int
    # TODO(will): check this arg
    # copy from vllm_config.compilation_config.static_forward_context
    # attn_layers: Dict[str, Any]
    # TODO: extend to support per-layer dynamic forward context
    attn_metadata: "AttentionMetadata"  # set dynamically for each forward pass
    # Optional batch payload, when a stage wants access to pipeline inputs.
    forward_batch: Optional["ForwardBatch"] = None
    # NOTE(review): presumably tells attention backends to skip sparse
    # attention and run dense — confirm against backend usage.
    force_dense: bool = False
41
+
42
+
43
# Process-global slot holding the active ForwardContext (None when no
# forward pass is in flight). Managed exclusively by set_forward_context().
_forward_context: Optional["ForwardContext"] = None


def get_forward_context() -> "ForwardContext":
    """Return the ForwardContext of the forward pass currently in flight.

    Raises:
        AssertionError: if called outside a `set_forward_context` scope.
            NOTE: the check uses `assert`, so it is stripped under `python -O`.
    """
    assert _forward_context is not None, ("Forward context is not set. "
                                          "Please use `set_forward_context` to set the forward context.")
    return _forward_context
51
+
52
+
53
# TODO(will): finalize the interface
@contextmanager
def set_forward_context(current_timestep, attn_metadata, forward_batch: Optional["ForwardBatch"] = None, force_dense: bool = False):
    """A context manager that stores the current forward context,
    can be attention metadata, etc.
    Here we can inject common logic for every model forward pass.

    Installs a ForwardContext into the module-global slot for the duration
    of the `with` body and restores the previous context on exit (so nested
    usage behaves like a stack). Optionally records per-batchsize forward
    timing when the module-level `track_batchsize` flag is on.
    """
    global forward_start_time
    # Only time the pass when tracking is enabled and there is metadata
    # to derive a batchsize from.
    need_to_track_batchsize = track_batchsize and attn_metadata is not None
    if need_to_track_batchsize:
        forward_start_time = time.perf_counter()
    global _forward_context
    # Remember the previous context so it can be restored in `finally`.
    prev_context = _forward_context
    _forward_context = ForwardContext(current_timestep=current_timestep,
                                      attn_metadata=attn_metadata,
                                      forward_batch=forward_batch,
                                      force_dense=force_dense)

    try:
        yield
    finally:
        global last_logging_time, batchsize_logging_interval
        if need_to_track_batchsize:
            if hasattr(attn_metadata, "num_prefill_tokens"):
                # for v0 attention backends
                batchsize = attn_metadata.num_prefill_tokens + \
                    attn_metadata.num_decode_tokens
            else:
                # for v1 attention backends
                batchsize = attn_metadata.num_input_tokens
            now = time.perf_counter()
            # time measurement is in milliseconds
            batchsize_forward_time[batchsize].append((now - forward_start_time) * 1000)
            # Emit a summary at most once per batchsize_logging_interval seconds.
            if now - last_logging_time > batchsize_logging_interval:
                last_logging_time = now
                forward_stats = []
                for bs, times in batchsize_forward_time.items():
                    if len(times) <= 1:
                        # can be cudagraph / profiling run
                        continue
                    # Median of recorded times, rounded to 2 decimal places.
                    medium = torch.quantile(torch.tensor(times), q=0.5).item()
                    medium = round(medium, 2)
                    forward_stats.append((bs, len(times), medium))
                # Most frequently seen batchsizes first.
                forward_stats.sort(key=lambda x: x[1], reverse=True)
                if forward_stats:
                    logger.info(("Batchsize forward time stats "
                                 "(batchsize, count, median_time(ms)): %s"), forward_stats)
        # Restore whatever context was active before entering this scope.
        _forward_context = prev_context
standalone_inference/overlay_files/fastvideo/pipelines/basic/wan/__init__.py ADDED
File without changes
standalone_inference/overlay_files/fastvideo/pipelines/basic/wan/wan_pipeline.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """
3
+ Wan video diffusion pipeline implementation.
4
+
5
+ This module contains an implementation of the Wan video diffusion pipeline
6
+ using the modular pipeline architecture.
7
+ """
8
+
9
+ from fastvideo.fastvideo_args import FastVideoArgs
10
+ from fastvideo.logger import init_logger
11
+ from fastvideo.models.schedulers.scheduling_flow_unipc_multistep import (FlowUniPCMultistepScheduler)
12
+ from fastvideo.pipelines import ComposedPipelineBase, LoRAPipeline
13
+ from fastvideo.pipelines.stages import (ConditioningStage, DecodingStage, DenoisingStage, InputValidationStage,
14
+ LatentPreparationStage, TextEncodingStage, TimestepPreparationStage)
15
+
16
+ logger = init_logger(__name__)
17
+
18
+
19
class WanPipeline(LoRAPipeline, ComposedPipelineBase):
    """
    Wan video diffusion pipeline with LoRA support.

    Composes the standard text-to-video stages (validation, text encoding,
    conditioning, timestep/latent preparation, denoising, decoding) on top
    of the modules declared in the model's model_index.json.
    """

    # Module names that must resolve from the model directory's
    # model_index.json before the pipeline can run.
    _required_config_modules = ["text_encoder", "tokenizer", "vae", "transformer", "scheduler"]

    def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
        """Replace the loaded scheduler with the Wan-official UniPC scheduler."""
        # We use UniPCMScheduler from Wan2.1 official repo, not the one in diffusers.
        self.modules["scheduler"] = FlowUniPCMultistepScheduler(shift=fastvideo_args.pipeline_config.flow_shift)

    def create_pipeline_stages(self, fastvideo_args: FastVideoArgs) -> None:
        """Set up pipeline stages with proper dependency injection.

        NOTE: registration order matters — ComposedPipelineBase.forward()
        executes stages in the order they are added here.
        """

        self.add_stage(stage_name="input_validation_stage", stage=InputValidationStage())

        self.add_stage(stage_name="prompt_encoding_stage",
                       stage=TextEncodingStage(
                           text_encoders=[self.get_module("text_encoder")],
                           tokenizers=[self.get_module("tokenizer")],
                       ))

        self.add_stage(stage_name="conditioning_stage", stage=ConditioningStage())

        self.add_stage(stage_name="timestep_preparation_stage",
                       stage=TimestepPreparationStage(scheduler=self.get_module("scheduler")))

        self.add_stage(stage_name="latent_preparation_stage",
                       stage=LatentPreparationStage(scheduler=self.get_module("scheduler"),
                                                    transformer=self.get_module("transformer", None)))

        # transformer_2 is optional; present only for MoE checkpoints
        # (see ComposedPipelineBase.load_modules boundary_ratio handling).
        self.add_stage(stage_name="denoising_stage",
                       stage=DenoisingStage(transformer=self.get_module("transformer"),
                                            transformer_2=self.get_module("transformer_2", None),
                                            scheduler=self.get_module("scheduler"),
                                            vae=self.get_module("vae"),
                                            pipeline=self))

        self.add_stage(stage_name="decoding_stage", stage=DecodingStage(vae=self.get_module("vae"), pipeline=self))


# Entry point class discovered by the pipeline loader.
EntryClass = WanPipeline
+ EntryClass = WanPipeline
standalone_inference/overlay_files/fastvideo/pipelines/composed_pipeline_base.py ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """
3
+ Base class for composed pipelines.
4
+
5
+ This module defines the base class for pipelines that are composed of multiple stages.
6
+ """
7
+
8
+ import argparse
9
+ import os
10
+ from abc import ABC, abstractmethod
11
+ from typing import Any, cast
12
+
13
+ import torch
14
+
15
+ from fastvideo.configs.pipelines import PipelineConfig
16
+ from fastvideo.distributed import (maybe_init_distributed_environment_and_model_parallel, get_world_group)
17
+ from fastvideo.distributed.communication_op import (warmup_sequence_parallel_communication)
18
+ from fastvideo.fastvideo_args import FastVideoArgs, TrainingArgs
19
+ from fastvideo.logger import init_logger
20
+ from fastvideo.profiler import get_or_create_profiler
21
+ from fastvideo.models.loader.component_loader import PipelineComponentLoader
22
+ from fastvideo.pipelines.pipeline_batch_info import ForwardBatch
23
+ from fastvideo.pipelines.stages import PipelineStage
24
+ import fastvideo.envs as envs
25
+ from fastvideo.utils import (maybe_download_model, verify_model_config_and_directory)
26
+
27
+ logger = init_logger(__name__)
28
+
29
+
30
class ComposedPipelineBase(ABC):
    """
    Base class for pipelines composed of multiple stages.

    This class provides the framework for creating pipelines by composing multiple
    stages together. Each stage is responsible for a specific part of the diffusion
    process, and the pipeline orchestrates the execution of these stages.
    """

    is_video_pipeline: bool = False  # To be overridden by video pipelines
    # NOTE(review): these are class-level (shared) defaults. __init__ rebinds
    # `modules`, `fastvideo_args`, etc. per instance, but the mutable dict
    # defaults are shared across instances until rebound — confirm subclasses
    # always assign `trainable_transformer_modules` per instance.
    _required_config_modules: list[str] = []
    _extra_config_module_map: dict[str, str] = {}
    training_args: Any = None
    fastvideo_args: Any = None
    modules: dict[str, Any] = {}
    # do not need to include moe related transformers
    trainable_transformer_names: list[str] = ["transformer"]
    trainable_transformer_modules: dict[str, torch.nn.Module] = {}
    post_init_called: bool = False

    # TODO(will): args should support both inference args and training args
    def __init__(self,
                 model_path: str,
                 fastvideo_args: FastVideoArgs | TrainingArgs,
                 required_config_modules: list[str] | None = None,
                 loaded_modules: dict[str, torch.nn.Module] | None = None):
        """
        Initialize the pipeline. After __init__, the pipeline should be ready to
        use. The pipeline should be stateless and not hold any batch state.

        Args:
            model_path: Local path or model id of the pretrained pipeline.
            fastvideo_args: Inference or training configuration.
            required_config_modules: Overrides the class-level required module list.
            loaded_modules: Pre-built modules to use instead of loading from disk.
        """
        self.fastvideo_args = fastvideo_args

        self.model_path: str = model_path
        self._stages: list[PipelineStage] = []
        self._stage_name_mapping: dict[str, PipelineStage] = {}

        if required_config_modules is not None:
            self._required_config_modules = required_config_modules

        if self._required_config_modules is None:
            raise NotImplementedError("Subclass must set _required_config_modules")

        # Distributed/model-parallel groups must exist before any module loads.
        maybe_init_distributed_environment_and_model_parallel(fastvideo_args.tp_size, fastvideo_args.sp_size)

        # Torch profiler. Enabled and configured through env vars:
        # FASTVIDEO_TORCH_PROFILER_DIR=/path/to/save/trace
        trace_dir = envs.FASTVIDEO_TORCH_PROFILER_DIR
        self.profiler_controller = get_or_create_profiler(trace_dir)
        self.profiler = self.profiler_controller.profiler

        self.local_rank = get_world_group().local_rank

        # Load modules directly in initialization
        logger.info("Loading pipeline modules...")
        with self.profiler_controller.region("profiler_region_model_loading"):
            self.modules = self.load_modules(fastvideo_args, loaded_modules)

    def set_trainable(self) -> None:
        """Enable gradients and train() mode on the trainable transformer modules."""
        # Only train DiT
        if getattr(self.fastvideo_args, "training_mode", False):
            for name, module in self.trainable_transformer_modules.items():
                logger.info("Setting %s to requires_grad=True", name)
                if not isinstance(module, torch.nn.Module):
                    logger.info("Skipping %s because it is not a torch.nn.Module", name)
                    continue
                module.requires_grad_(True)
                module.train()

    @staticmethod
    def _compile_with_conditions(
        module: torch.nn.Module,
        compile_kwargs: dict[str, Any],
    ) -> int:
        """Compile submodules that match module._compile_conditions.

        Returns:
            Number of submodules whose forward was wrapped with torch.compile.
        """
        compile_conditions = getattr(module, "_compile_conditions", None)
        if not compile_conditions:
            return 0

        compiled_count = 0
        for name, submodule in module.named_modules():
            # named_modules() yields the root module itself with an empty name;
            # skip it so only true submodules are compiled here.
            if not name:
                continue
            if any(cond(name, submodule) for cond in compile_conditions):
                submodule.forward = torch.compile(submodule.forward, **compile_kwargs)
                compiled_count += 1
        return compiled_count

    def _maybe_compile_pipeline_module(
        self,
        module_name: str,
        fsdp_module_cls: type | None,
        compile_kwargs: dict[str, Any],
    ) -> None:
        """torch.compile one pipeline module, honoring FSDP and per-module conditions.

        Skips silently when the module is absent; skips compiling when it is
        already FSDP-wrapped (FSDP handles compilation elsewhere).
        """
        if module_name not in self.modules:
            return

        module = self.modules[module_name]
        if fsdp_module_cls is not None and isinstance(module, fsdp_module_cls):
            logger.info(
                "%s is already FSDP-wrapped; skipping torch.compile in pipeline",
                module_name.capitalize(),
            )
            return

        compiled_count = self._compile_with_conditions(module, compile_kwargs)
        if compiled_count > 0:
            logger.info(
                "Enabled torch.compile for %d submodules in %s via _compile_conditions with kwargs=%s",
                compiled_count,
                module_name,
                compile_kwargs,
            )
            return

        # Backward-compatible fallback: compile full module if no condition matched.
        logger.info("Enabling torch.compile for %s with kwargs=%s", module_name, compile_kwargs)
        self.modules[module_name] = torch.compile(module, **compile_kwargs)

    def post_init(self) -> None:
        """One-time setup after module loading: training/validation init,
        optional torch.compile, stage creation, and SP communication warmup.

        Idempotent — repeated calls are no-ops once post_init_called is set.
        """
        assert self.fastvideo_args is not None, "fastvideo_args must be set"
        if self.post_init_called:
            return
        self.post_init_called = True
        if self.fastvideo_args.training_mode:
            assert isinstance(self.fastvideo_args, TrainingArgs)
            self.training_args = self.fastvideo_args
            assert self.training_args is not None
            self.initialize_training_pipeline(self.training_args)
            if self.training_args.log_validation:
                self.initialize_validation_pipeline(self.training_args)

        self.initialize_pipeline(self.fastvideo_args)
        if self.fastvideo_args.enable_torch_compile:
            if self.fastvideo_args.training_mode:
                logger.info("Torch Compile enabled via FSDP loader for training; skipping additional pipeline compile")
            else:
                fsdp_module_cls = None
                try:
                    from torch.distributed.fsdp import FSDPModule  # type: ignore
                    fsdp_module_cls = FSDPModule
                except Exception:  # pragma: no cover - FSDP not always available
                    fsdp_module_cls = None

                compile_kwargs = self.fastvideo_args.torch_compile_kwargs or {}
                self._maybe_compile_pipeline_module(
                    module_name="transformer",
                    fsdp_module_cls=fsdp_module_cls,
                    compile_kwargs=compile_kwargs,
                )
                self._maybe_compile_pipeline_module(
                    module_name="transformer_2",
                    fsdp_module_cls=fsdp_module_cls,
                    compile_kwargs=compile_kwargs,
                )
                logger.info("Torch Compile enabled for DiT")

        if not self.fastvideo_args.training_mode:
            logger.info("Creating pipeline stages...")
            self.create_pipeline_stages(self.fastvideo_args)

        # Warmup NCCL communicators for sequence parallelism to avoid
        # slow first forward pass due to lazy initialization
        warmup_sequence_parallel_communication()

    def initialize_training_pipeline(self, training_args: TrainingArgs):
        """Hook for training subclasses; required when training_mode is True."""
        raise NotImplementedError("if training_mode is True, the pipeline must implement this method")

    def initialize_validation_pipeline(self, training_args: TrainingArgs):
        """Hook for training subclasses; required when log_validation is True."""
        raise NotImplementedError("if log_validation is True, the pipeline must implement this method")

    @classmethod
    def from_pretrained(cls,
                        model_path: str,
                        device: str | None = None,
                        torch_dtype: torch.dtype | None = None,
                        pipeline_config: str | PipelineConfig | None = None,
                        args: argparse.Namespace | FastVideoArgs | TrainingArgs | None = None,
                        required_config_modules: list[str] | None = None,
                        loaded_modules: dict[str, torch.nn.Module]
                        | None = None,
                        **kwargs) -> "ComposedPipelineBase":
        """
        Load a pipeline from a pretrained model.
        loaded_modules: Optional[Dict[str, torch.nn.Module]] = None,
        If provided, loaded_modules will be used instead of loading from config/pretrained weights.
        """
        # Inference path: build FastVideoArgs from kwargs. Training path:
        # reuse/construct TrainingArgs and force training-specific settings.
        if args is None or (isinstance(args, FastVideoArgs) and args.inference_mode):

            kwargs['model_path'] = model_path
            fastvideo_args = FastVideoArgs.from_kwargs(**kwargs)
        else:
            if isinstance(args, TrainingArgs):
                fastvideo_args = args
            else:
                assert isinstance(args, argparse.Namespace), "training mode expects argparse.Namespace args"
                fastvideo_args = TrainingArgs.from_cli_args(args)
            # TODO(will): fix this so that its not so ugly
            fastvideo_args.model_path = model_path
            for key, value in kwargs.items():
                setattr(fastvideo_args, key, value)

            fastvideo_args.dit_cpu_offload = False
            # we hijack the precision to be the master weight type so that the
            # model is loaded with the correct precision. Subsequently we will
            # use FSDP2's MixedPrecisionPolicy to set the precision for the
            # fwd, bwd, and other operations' precision.
            assert fastvideo_args.pipeline_config.dit_precision == 'fp32', 'only fp32 is supported for training'

        logger.info("fastvideo_args in from_pretrained: %s", fastvideo_args)

        pipe = cls(model_path,
                   fastvideo_args,
                   required_config_modules=required_config_modules,
                   loaded_modules=loaded_modules)
        pipe.post_init()
        return pipe

    def get_module(self, module_name: str, default_value: Any = None) -> Any:
        """Return a loaded module by name, or default_value when absent."""
        if module_name not in self.modules:
            return default_value
        return self.modules[module_name]

    def add_module(self, module_name: str, module: Any):
        """Register (or replace) a module under the given name."""
        self.modules[module_name] = module

    def __getattr__(self, name: str) -> Any:
        # Expose registered stages as attributes (e.g. self.denoising_stage).
        # Only called when normal attribute lookup fails.
        if "_stage_name_mapping" in self.__dict__ and name in self._stage_name_mapping:
            return self._stage_name_mapping[name]
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

    def _load_config(self, model_path: str) -> dict[str, Any]:
        """Download the model if needed and return its validated model_index config.

        Side effect: rebinds self.model_path to the local (downloaded) path.
        """
        model_path = maybe_download_model(self.model_path)
        self.model_path = model_path
        # fastvideo_args.downloaded_model_path = model_path
        logger.info("Model path: %s", model_path)
        config = verify_model_config_and_directory(model_path)
        return cast(dict[str, Any], config)

    @property
    def required_config_modules(self) -> list[str]:
        """
        List of modules that are required by the pipeline. The names should match
        the diffusers directory and model_index.json file. These modules will be
        loaded using the PipelineComponentLoader and made available in the
        modules dictionary. Access these modules using the get_module method.

        class ConcretePipeline(ComposedPipelineBase):
            _required_config_modules = ["vae", "text_encoder", "transformer", "scheduler", "tokenizer"]


            @property
            def required_config_modules(self):
                return self._required_config_modules
        """
        return self._required_config_modules

    @property
    def stages(self) -> list[PipelineStage]:
        """
        List of stages in the pipeline.
        """
        return self._stages

    @abstractmethod
    def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
        """
        Create the inference pipeline stages.
        """
        raise NotImplementedError

    def create_training_stages(self, training_args: TrainingArgs):
        """
        Create the training pipeline stages.
        """
        raise NotImplementedError

    def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
        """
        Initialize the pipeline.
        """
        return

    def load_modules(self,
                     fastvideo_args: FastVideoArgs,
                     loaded_modules: dict[str, torch.nn.Module] | None = None) -> dict[str, Any]:
        """
        Load the modules from the config.
        loaded_modules: Optional[Dict[str, torch.nn.Module]] = None,
        If provided, loaded_modules will be used instead of loading from config/pretrained weights.
        """

        model_index = self._load_config(self.model_path)
        logger.info("Loading pipeline modules from config: %s", model_index)

        # remove keys that are not pipeline modules
        model_index.pop("_class_name")
        model_index.pop("_diffusers_version")
        model_index.pop("_name_or_path", None)
        model_index.pop("workload_type", None)
        # A non-null boundary_ratio marks a MoE checkpoint with a second
        # transformer used past the boundary timestep.
        if "boundary_ratio" in model_index and model_index["boundary_ratio"] is not None:
            logger.info("MoE pipeline detected. Adding transformer_2 to self.required_config_modules...")
            self.required_config_modules.append("transformer_2")
            logger.info("MoE pipeline detected. Setting boundary ratio to %s", model_index["boundary_ratio"])
            fastvideo_args.pipeline_config.dit_config.boundary_ratio = model_index["boundary_ratio"]

        model_index.pop("boundary_ratio", None)
        # used by Wan2.2 ti2v
        model_index.pop("expand_timesteps", None)

        # some sanity checks
        assert len(model_index) > 1, "model_index.json must contain at least one pipeline module"

        # Resolve any required module missing from model_index via the
        # alias map (_extra_config_module_map); fail fast otherwise.
        for module_name in self.required_config_modules:
            if module_name not in model_index and module_name in self._extra_config_module_map:
                extra_module_value = self._extra_config_module_map[module_name]
                logger.warning(
                    "model_index.json does not contain a %s module, but found {%s: %s} in _extra_config_module_map, adding to model_index.",
                    module_name, module_name, extra_module_value)
                if extra_module_value in model_index:
                    logger.info("Using module %s for %s", extra_module_value, module_name)
                    model_index[module_name] = model_index[extra_module_value]
                    continue
                else:
                    raise ValueError(
                        f"Required module key: {module_name} value: {model_index.get(module_name)} was not found in loaded modules {model_index.keys()}"
                    )

        # all the component models used by the pipeline
        required_modules = self.required_config_modules
        logger.info("Loading required modules: %s", required_modules)

        modules = {}
        for module_name, module_spec in model_index.items():
            # Each module entry is expected to be a (library, class) pair;
            # anything else is configuration, not a loadable component.
            if not isinstance(module_spec, list | tuple):
                logger.info(
                    "Skipping non-module config entry %s=%s",
                    module_name,
                    module_spec,
                )
                continue
            if len(module_spec) < 1:
                logger.warning(
                    "Skipping module %s due to invalid empty spec in model_index.json",
                    module_name,
                )
                continue
            transformers_or_diffusers = module_spec[0]
            if transformers_or_diffusers is None:
                logger.warning("Module %s in model_index.json has null value, removing from required_config_modules",
                               module_name)
                if module_name in self.required_config_modules:
                    self.required_config_modules.remove(module_name)
                continue
            if module_name not in required_modules:
                logger.info("Skipping module %s", module_name)
                continue
            if loaded_modules is not None and module_name in loaded_modules:
                logger.info("Using module %s already provided", module_name)
                modules[module_name] = loaded_modules[module_name]
                continue

            # we load the module from the extra config module map if it exists
            if module_name in self._extra_config_module_map:
                load_module_name = self._extra_config_module_map[module_name]
            else:
                load_module_name = module_name

            component_model_path = os.path.join(self.model_path, load_module_name)
            module = PipelineComponentLoader.load_module(
                module_name=load_module_name,
                component_model_path=component_model_path,
                transformers_or_diffusers=transformers_or_diffusers,
                fastvideo_args=fastvideo_args,
            )
            logger.info("Loaded module %s from %s", module_name, component_model_path)

            if module_name in modules:
                logger.warning("Overwriting module %s", module_name)
            modules[module_name] = module

        # Check if all required modules were loaded
        for module_name in required_modules:
            if module_name not in modules or modules[module_name] is None:
                raise ValueError(
                    f"Required module key: {module_name} value: {modules.get(module_name)} was not found in loaded modules {modules.keys()}"
                )

        return modules

    def add_stage(self, stage_name: str, stage: PipelineStage):
        """Append a stage; order of calls defines execution order in forward()."""
        assert self.modules is not None, "No modules are registered"
        self._stages.append(stage)
        self._stage_name_mapping[stage_name] = stage
        setattr(self, stage_name, stage)

    def profile(self, is_start: bool = True):
        """Start or stop the torch profiler; prints a summary on rank 0 at stop."""
        if self.profiler is None:
            raise RuntimeError("Profiler is not enabled.")
        if is_start:
            self.profiler.start()
        else:
            self.profiler.stop()
            # only print profiler results on rank 0
            if self.local_rank == 0:
                print(self.profiler.key_averages().table(sort_by="self_cuda_time_total"))

    # TODO(will): don't hardcode no_grad
    @torch.no_grad()
    def forward(
        self,
        batch: ForwardBatch,
        fastvideo_args: FastVideoArgs,
    ) -> ForwardBatch:
        """
        Generate a video or image using the pipeline.

        Args:
            batch: The batch to generate from.
            fastvideo_args: The inference arguments.
        Returns:
            ForwardBatch: The batch with the generated video or image.
        """
        if not self.post_init_called:
            self.post_init()

        # Execute each stage
        logger.info("Running pipeline stages: %s", self._stage_name_mapping.keys())
        # logger.info("Batch: %s", batch)
        for stage in self.stages:
            batch = stage(batch, fastvideo_args)

        # Return the output
        return batch

    def train(self) -> None:
        """Training entry point; required when training_mode is True."""
        raise NotImplementedError("if training_mode is True, the pipeline must implement this method")

    def streaming_reset(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch:
        """Optional streaming API; override in pipelines that support it."""
        raise NotImplementedError(f"{type(self).__name__} does not support streaming_reset")

    def streaming_step(self, *args: Any, **kwargs: Any) -> ForwardBatch:
        """Optional streaming API; override in pipelines that support it."""
        raise NotImplementedError(f"{type(self).__name__} does not support streaming_step")

    def streaming_clear(self) -> None:
        """Optional streaming API; override in pipelines that support it."""
        raise NotImplementedError(f"{type(self).__name__} does not support streaming_clear")
standalone_inference/overlay_files/fastvideo/pipelines/stages/denoising.py ADDED
@@ -0,0 +1,1184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """
3
+ Denoising stage for diffusion pipelines.
4
+ """
5
+
6
+ import inspect
7
+ import weakref
8
+ from collections.abc import Iterable
9
+ from typing import Any
10
+
11
+ import torch
12
+ from tqdm.auto import tqdm
13
+
14
+ from fastvideo.attention import get_attn_backend
15
+ from fastvideo.distributed import (get_local_torch_device, get_world_group)
16
+ from fastvideo.fastvideo_args import FastVideoArgs
17
+ from fastvideo.forward_context import set_forward_context
18
+ from fastvideo.logger import init_logger
19
+ from fastvideo.models.loader.component_loader import TransformerLoader
20
+ from fastvideo.models.schedulers.scheduling_flow_match_euler_discrete import (FlowMatchEulerDiscreteScheduler)
21
+ from fastvideo.models.utils import pred_noise_to_pred_video
22
+ from fastvideo.pipelines.pipeline_batch_info import ForwardBatch
23
+ from fastvideo.pipelines.stages.base import PipelineStage
24
+ from fastvideo.pipelines.stages.validators import StageValidators as V
25
+ from fastvideo.pipelines.stages.validators import VerificationResult
26
+ from fastvideo.platforms import AttentionBackendEnum
27
+ from fastvideo.utils import dict_to_3d_list, masks_like
28
+
29
+ try:
30
+ from fastvideo.attention.backends.vmoba import VMOBAAttentionBackend
31
+ from fastvideo.utils import is_vmoba_available
32
+ vmoba_attn_available = is_vmoba_available()
33
+ except ImportError:
34
+ vmoba_attn_available = False
35
+
36
+ try:
37
+ from fastvideo.attention.backends.video_sparse_attn import (VideoSparseAttentionBackend)
38
+ vsa_available = True
39
+ except ImportError:
40
+ vsa_available = False
41
+
42
+ try:
43
+ from fastvideo.attention.backends.sparse_fp4_attn import (SparseFP4AttentionBackend)
44
+ except ImportError:
45
+ SparseFP4AttentionBackend = None # type: ignore[assignment]
46
+
47
+ try:
48
+ from fastvideo.attention.backends.sparse_fp4_ours_p_attn import (SparseFP4OursPAttentionBackend)
49
+ except ImportError:
50
+ SparseFP4OursPAttentionBackend = None # type: ignore[assignment]
51
+
52
+ sparse_fp4_backends = tuple(
53
+ backend for backend in (
54
+ SparseFP4AttentionBackend,
55
+ SparseFP4OursPAttentionBackend,
56
+ ) if backend is not None)
57
+ sparse_fp4_available = bool(sparse_fp4_backends)
58
+
59
+ logger = init_logger(__name__)
60
+
61
+
62
+ class DenoisingStage(PipelineStage):
63
+ """
64
+ Stage for running the denoising loop in diffusion pipelines.
65
+
66
+ This stage handles the iterative denoising process that transforms
67
+ the initial noise into the final output.
68
+ """
69
+
70
+ def __init__(self, transformer, scheduler, pipeline=None, transformer_2=None, vae=None) -> None:
+ """Store pipeline components and resolve an attention backend sized to the transformer's per-head dim."""
72
+ super().__init__()
73
+ self.transformer = transformer
74
+ self.transformer_2 = transformer_2
75
+ self.scheduler = scheduler
76
+ self.vae = vae
77
+ # Weak reference avoids a pipeline <-> stage reference cycle.
+ self.pipeline = weakref.ref(pipeline) if pipeline else None
78
+ attn_head_size = self.transformer.hidden_size // self.transformer.num_attention_heads
79
+ self.attn_backend = get_attn_backend(
80
+ head_size=attn_head_size,
81
+ dtype=torch.float16, # TODO(will): hack
82
+ supported_attention_backends=(
83
+ AttentionBackendEnum.VIDEO_SPARSE_ATTN, AttentionBackendEnum.BSA_ATTN, AttentionBackendEnum.VMOBA_ATTN,
84
+ AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.TORCH_SDPA, AttentionBackendEnum.SAGE_ATTN_THREE,
85
+ AttentionBackendEnum.ATTN_QAT_INFER, AttentionBackendEnum.ATTN_QAT_TRAIN,
86
+ AttentionBackendEnum.SPARSE_FP4_ATTN, AttentionBackendEnum.SPARSE_FP4_OURS_P_ATTN) # hack
87
+ )
87
+
88
+ def forward(
89
+ self,
90
+ batch: ForwardBatch,
91
+ fastvideo_args: FastVideoArgs,
92
+ ) -> ForwardBatch:
93
+ """
94
+ Run the denoising loop.
95
+
96
+ Args:
97
+ batch: The current batch information.
98
+ fastvideo_args: The inference arguments.
99
+
100
+ Returns:
101
+ The batch with denoised latents.
102
+ """
103
+ pipeline = self.pipeline() if self.pipeline else None
104
+ if not fastvideo_args.model_loaded["transformer"]:
105
+ loader = TransformerLoader()
106
+ self.transformer = loader.load(fastvideo_args.model_paths["transformer"], fastvideo_args)
107
+ if pipeline:
108
+ pipeline.add_module("transformer", self.transformer)
109
+ fastvideo_args.model_loaded["transformer"] = True
110
+
111
+ # Prepare extra step kwargs for scheduler
112
+ extra_step_kwargs = self.prepare_extra_func_kwargs(
113
+ self.scheduler.step,
114
+ {
115
+ "generator": batch.generator,
116
+ "eta": batch.eta
117
+ },
118
+ )
119
+
120
+ # Setup precision and autocast settings
121
+ # TODO(will): make the precision configurable for inference
122
+ # target_dtype = PRECISION_TO_TYPE[fastvideo_args.precision]
123
+ target_dtype = torch.bfloat16
124
+ autocast_enabled = (target_dtype != torch.float32) and not fastvideo_args.disable_autocast
125
+
126
+ # Get timesteps and calculate warmup steps
127
+ timesteps = batch.timesteps
128
+ # TODO(will): remove this once we add input/output validation for stages
129
+ if timesteps is None:
130
+ raise ValueError("Timesteps must be provided")
131
+ num_inference_steps = batch.num_inference_steps
132
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
133
+
134
+ # Prepare image latents and embeddings for I2V generation
135
+ image_embeds = batch.image_embeds
136
+ if len(image_embeds) > 0:
137
+ assert not torch.isnan(image_embeds[0]).any(), "image_embeds contains nan"
138
+ image_embeds = [image_embed.to(target_dtype) for image_embed in image_embeds]
139
+
140
+ image_kwargs = self.prepare_extra_func_kwargs(
141
+ self.transformer.forward,
142
+ {
143
+ "encoder_hidden_states_image": image_embeds,
144
+ "mask_strategy": dict_to_3d_list(None, t_max=50, l_max=60, h_max=24)
145
+ },
146
+ )
147
+
148
+ pos_cond_kwargs = self.prepare_extra_func_kwargs(
149
+ self.transformer.forward,
150
+ {
151
+ "encoder_hidden_states_2": batch.clip_embedding_pos,
152
+ "encoder_attention_mask": batch.prompt_attention_mask,
153
+ },
154
+ )
155
+
156
+ neg_cond_kwargs = self.prepare_extra_func_kwargs(
157
+ self.transformer.forward,
158
+ {
159
+ "encoder_hidden_states_2": batch.clip_embedding_neg,
160
+ "encoder_attention_mask": batch.negative_attention_mask,
161
+ },
162
+ )
163
+
164
+ action_kwargs = self.prepare_extra_func_kwargs(
165
+ self.transformer.forward,
166
+ {
167
+ "mouse_cond": batch.mouse_cond,
168
+ "keyboard_cond": batch.keyboard_cond,
169
+ "c2ws_plucker_emb": batch.c2ws_plucker_emb,
170
+ },
171
+ )
172
+
173
+ camera_kwargs = self.prepare_extra_func_kwargs(
174
+ self.transformer.forward,
175
+ {
176
+ "camera_states": batch.camera_states,
177
+ },
178
+ )
179
+
180
+ # Get latents and embeddings
181
+ latents = batch.latents
182
+ prompt_embeds = batch.prompt_embeds
183
+ assert not torch.isnan(prompt_embeds[0]).any(), "prompt_embeds contains nan"
184
+ if batch.do_classifier_free_guidance:
185
+ neg_prompt_embeds = batch.negative_prompt_embeds
186
+ assert neg_prompt_embeds is not None
187
+ assert not torch.isnan(neg_prompt_embeds[0]).any(), "neg_prompt_embeds contains nan"
188
+
189
+ # (Wan2.2) Calculate timestep to switch from high noise expert to low noise expert
190
+ boundary_ratio = fastvideo_args.pipeline_config.dit_config.boundary_ratio
191
+ if batch.boundary_ratio is not None:
192
+ logger.info("Overriding boundary ratio from %s to %s", boundary_ratio, batch.boundary_ratio)
193
+ boundary_ratio = batch.boundary_ratio
194
+
195
+ boundary_timestep = boundary_ratio * self.scheduler.num_train_timesteps if boundary_ratio is not None else None
196
+ latent_model_input = latents.to(target_dtype)
197
+ assert latent_model_input.shape[0] == 1, "only support batch size 1"
198
+
199
+ if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
200
+ # TI2V directly replaces the first frame of the latent with
201
+ # the image latent instead of appending along the channel dim
202
+ assert batch.image_latent is None, "TI2V task should not have image latents"
203
+ assert self.vae is not None, "VAE is not provided for TI2V task"
204
+ z = self.vae.encode(batch.pil_image).mean.float()
205
+ if (hasattr(self.vae, "shift_factor") and self.vae.shift_factor is not None):
206
+ if isinstance(self.vae.shift_factor, torch.Tensor):
207
+ z -= self.vae.shift_factor.to(z.device, z.dtype)
208
+ else:
209
+ z -= self.vae.shift_factor
210
+
211
+ if isinstance(self.vae.scaling_factor, torch.Tensor):
212
+ z = z * self.vae.scaling_factor.to(z.device, z.dtype)
213
+ else:
214
+ z = z * self.vae.scaling_factor
215
+
216
+ latent_model_input = latent_model_input.squeeze(0)
217
+ _, mask2 = masks_like([latent_model_input], zero=True)
218
+
219
+ latent_model_input = (1. - mask2[0]) * z + mask2[0] * latent_model_input
220
+ # latent_model_input = latent_model_input.unsqueeze(0)
221
+ latent_model_input = latent_model_input.to(get_local_torch_device())
222
+ latents = latent_model_input
223
+ F = batch.num_frames
224
+ temporal_scale = fastvideo_args.pipeline_config.vae_config.arch_config.scale_factor_temporal
225
+ spatial_scale = fastvideo_args.pipeline_config.vae_config.arch_config.scale_factor_spatial
226
+ patch_size = fastvideo_args.pipeline_config.dit_config.arch_config.patch_size
227
+ if not isinstance(patch_size, tuple):
228
+ raise ValueError(f"Expected 3D patch_size tuple for denoising, got {patch_size!r}")
229
+ seq_len = ((F - 1) // temporal_scale + 1) * (batch.height // spatial_scale) * (
230
+ batch.width // spatial_scale) // (patch_size[1] * patch_size[2])
231
+
232
+ # Initialize lists for ODE trajectory
233
+ trajectory_timesteps: list[torch.Tensor] = []
234
+ trajectory_latents: list[torch.Tensor] = []
235
+
236
+ # Run denoising loop
237
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
238
+ for i, t in enumerate(timesteps):
239
+ # Skip if interrupted
240
+ if hasattr(self, 'interrupt') and self.interrupt:
241
+ continue
242
+
243
+ if boundary_timestep is None or t >= boundary_timestep:
244
+ if (fastvideo_args.dit_cpu_offload and not fastvideo_args.dit_layerwise_offload
245
+ and self.transformer_2 is not None
246
+ and next(self.transformer_2.parameters()).device.type == 'cuda'):
247
+ self.transformer_2.to('cpu')
248
+ current_model = self.transformer
249
+ if (fastvideo_args.dit_cpu_offload and not fastvideo_args.dit_layerwise_offload
250
+ and not fastvideo_args.use_fsdp_inference and current_model is not None):
251
+ transformer_device = next(current_model.parameters()).device.type
252
+ if transformer_device == 'cpu':
253
+ current_model.to(get_local_torch_device())
254
+ current_guidance_scale = batch.guidance_scale
255
+ else:
256
+ # low-noise stage in wan2.2
257
+ if (fastvideo_args.dit_cpu_offload and not fastvideo_args.dit_layerwise_offload
258
+ and next(self.transformer.parameters()).device.type == 'cuda'):
259
+ self.transformer.to('cpu')
260
+ current_model = self.transformer_2
261
+ if (fastvideo_args.dit_cpu_offload and not fastvideo_args.dit_layerwise_offload
262
+ and not fastvideo_args.use_fsdp_inference and current_model is not None):
263
+ transformer_2_device = next(current_model.parameters()).device.type
264
+ if transformer_2_device == 'cpu':
265
+ current_model.to(get_local_torch_device())
266
+ current_guidance_scale = batch.guidance_scale_2
267
+ assert current_model is not None, "current_model is None"
268
+
269
+ # Expand latents for V2V/I2V
270
+ latent_model_input = latents.to(target_dtype)
271
+ if batch.video_latent is not None:
272
+ latent_model_input = torch.cat([latent_model_input, batch.video_latent,
273
+ torch.zeros_like(latents)],
274
+ dim=1).to(target_dtype)
275
+ elif batch.image_latent is not None:
276
+ assert not fastvideo_args.pipeline_config.ti2v_task, "image latents should not be provided for TI2V task"
277
+ latent_model_input = torch.cat([latent_model_input, batch.image_latent], dim=1).to(target_dtype)
278
+
279
+ assert not torch.isnan(latent_model_input).any(), "latent_model_input contains nan"
280
+ if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
281
+ timestep = torch.stack([t]).to(get_local_torch_device())
282
+ temp_ts = (mask2[0][0][:, ::2, ::2] * timestep).flatten()
283
+ temp_ts = torch.cat([temp_ts, temp_ts.new_ones(seq_len - temp_ts.size(0)) * timestep])
284
+ timestep = temp_ts.unsqueeze(0)
285
+ t_expand = timestep.repeat(latent_model_input.shape[0], 1)
286
+ else:
287
+ t_expand = t.repeat(latent_model_input.shape[0])
288
+ t_expand = t_expand.to(get_local_torch_device())
289
+
290
+ use_meanflow = getattr(self.transformer.config, "use_meanflow", False)
291
+ if use_meanflow:
292
+ if i == len(timesteps) - 1:
293
+ timesteps_r = torch.tensor([0.0], device=get_local_torch_device())
294
+ else:
295
+ timesteps_r = timesteps[i + 1]
296
+ timesteps_r = timesteps_r.repeat(latent_model_input.shape[0])
297
+ else:
298
+ timesteps_r = None
299
+
300
+ timesteps_r_kwarg = self.prepare_extra_func_kwargs(
301
+ self.transformer.forward,
302
+ {
303
+ "timestep_r": timesteps_r,
304
+ },
305
+ )
306
+
307
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
308
+
309
+ # Prepare inputs for transformer
310
+ guidance_expand = (torch.tensor(
311
+ [fastvideo_args.pipeline_config.embedded_cfg_scale] * latent_model_input.shape[0],
312
+ dtype=torch.float32,
313
+ device=get_local_torch_device(),
314
+ ).to(target_dtype) * 1000.0 if fastvideo_args.pipeline_config.embedded_cfg_scale is not None else None)
315
+
316
+ # Predict noise residual
317
+ with torch.autocast(device_type="cuda", dtype=target_dtype, enabled=autocast_enabled):
318
+ if (vsa_available and self.attn_backend == VideoSparseAttentionBackend) or \
319
+ (sparse_fp4_available and self.attn_backend in sparse_fp4_backends):
320
+ self.attn_metadata_builder_cls = self.attn_backend.get_builder_cls()
321
+
322
+ if self.attn_metadata_builder_cls is not None:
323
+ self.attn_metadata_builder = self.attn_metadata_builder_cls()
324
+ # TODO(will): clean this up
325
+ attn_metadata = self.attn_metadata_builder.build( # type: ignore
326
+ current_timestep=i, # type: ignore
327
+ raw_latent_shape=batch.raw_latent_shape[2:5], # type: ignore
328
+ patch_size=fastvideo_args.pipeline_config. # type: ignore
329
+ dit_config.patch_size, # type: ignore
330
+ VSA_sparsity=fastvideo_args.VSA_sparsity, # type: ignore
331
+ device=get_local_torch_device(),
332
+ )
333
+ assert attn_metadata is not None, "attn_metadata cannot be None"
334
+ else:
335
+ attn_metadata = None
336
+ elif (vmoba_attn_available and self.attn_backend == VMOBAAttentionBackend):
337
+ self.attn_metadata_builder_cls = self.attn_backend.get_builder_cls()
338
+ if self.attn_metadata_builder_cls is not None:
339
+ self.attn_metadata_builder = self.attn_metadata_builder_cls()
340
+ # Prepare V-MoBA parameters from config
341
+ moba_params = fastvideo_args.moba_config.copy()
342
+ assert batch.raw_latent_shape is not None, "raw_latent_shape must be set for V-MoBA"
343
+ moba_params.update({
344
+ "current_timestep": i,
345
+ "raw_latent_shape": batch.raw_latent_shape[2:5],
346
+ "patch_size": fastvideo_args.pipeline_config.dit_config.patch_size,
347
+ "device": get_local_torch_device(),
348
+ })
349
+ attn_metadata = self.attn_metadata_builder.build(**moba_params)
350
+ assert attn_metadata is not None, "attn_metadata cannot be None"
351
+ else:
352
+ attn_metadata = None
353
+ else:
354
+ attn_metadata = None
355
+ # TODO(will): finalize the interface. vLLM uses this to
356
+ # support torch dynamo compilation. They pass in
357
+ # attn_metadata, vllm_config, and num_tokens. We can pass in
358
+ # fastvideo_args or training_args, and attn_metadata.
359
+ batch.is_cfg_negative = False
360
+ with set_forward_context(
361
+ current_timestep=i,
362
+ attn_metadata=attn_metadata,
363
+ forward_batch=batch,
364
+ # fastvideo_args=fastvideo_args
365
+ ):
366
+ # Run transformer
367
+ noise_pred = current_model(
368
+ latent_model_input,
369
+ prompt_embeds,
370
+ t_expand,
371
+ guidance=guidance_expand,
372
+ **image_kwargs,
373
+ **pos_cond_kwargs,
374
+ **action_kwargs,
375
+ **camera_kwargs,
376
+ **timesteps_r_kwarg,
377
+ )
378
+
379
+ if batch.do_classifier_free_guidance:
380
+ batch.is_cfg_negative = True
381
+ with set_forward_context(
382
+ current_timestep=i,
383
+ attn_metadata=attn_metadata,
384
+ forward_batch=batch,
385
+ ):
386
+ noise_pred_uncond = current_model(
387
+ latent_model_input,
388
+ neg_prompt_embeds,
389
+ t_expand,
390
+ guidance=guidance_expand,
391
+ **image_kwargs,
392
+ **neg_cond_kwargs,
393
+ **action_kwargs,
394
+ **camera_kwargs,
395
+ **timesteps_r_kwarg,
396
+ )
397
+
398
+ noise_pred_text = noise_pred
399
+ noise_pred = noise_pred_uncond + current_guidance_scale * (noise_pred_text - noise_pred_uncond)
400
+
401
+ # Apply guidance rescale if needed
402
+ if batch.guidance_rescale > 0.0:
403
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
404
+ noise_pred = self.rescale_noise_cfg(
405
+ noise_pred,
406
+ noise_pred_text,
407
+ guidance_rescale=batch.guidance_rescale,
408
+ )
409
+ # Compute the previous noisy sample
410
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
411
+ if fastvideo_args.pipeline_config.ti2v_task and batch.pil_image is not None:
412
+ latents = latents.squeeze(0)
413
+ latents = (1. - mask2[0]) * z + mask2[0] * latents
414
+ # latents = latents.unsqueeze(0)
415
+
416
+ # save trajectory latents if needed
417
+ if batch.return_trajectory_latents:
418
+ trajectory_timesteps.append(t)
419
+ trajectory_latents.append(latents)
420
+
421
+ # Update progress bar
422
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
423
+ (i + 1) % self.scheduler.order == 0 and progress_bar is not None):
424
+ progress_bar.update()
425
+
426
+ trajectory_tensor: torch.Tensor | None = None
427
+ if trajectory_latents:
428
+ trajectory_tensor = torch.stack(trajectory_latents, dim=1)
429
+ trajectory_timesteps_tensor = torch.stack(trajectory_timesteps, dim=0)
430
+ else:
431
+ trajectory_tensor = None
432
+ trajectory_timesteps_tensor = None
433
+
434
+ if trajectory_tensor is not None and trajectory_timesteps_tensor is not None:
435
+ batch.trajectory_timesteps = trajectory_timesteps_tensor.cpu()
436
+ batch.trajectory_latents = trajectory_tensor.cpu()
437
+
438
+ # Update batch with final latents
439
+ batch.latents = latents
440
+
441
+ if fastvideo_args.dit_layerwise_offload:
442
+ mgr = getattr(self.transformer, "_layerwise_offload_manager", None)
443
+ if mgr is not None and getattr(mgr, "enabled", False):
444
+ mgr.release_all()
445
+ if self.transformer_2 is not None:
446
+ mgr2 = getattr(self.transformer_2, "_layerwise_offload_manager", None)
447
+ if mgr2 is not None and getattr(mgr2, "enabled", False):
448
+ mgr2.release_all()
449
+
450
+ # deallocate transformer if on mps
451
+ if torch.backends.mps.is_available():
452
+ logger.info("Memory before deallocating transformer: %s", torch.mps.current_allocated_memory())
453
+ del self.transformer
454
+ if pipeline is not None and "transformer" in pipeline.modules:
455
+ del pipeline.modules["transformer"]
456
+ fastvideo_args.model_loaded["transformer"] = False
457
+ logger.info("Memory after deallocating transformer: %s", torch.mps.current_allocated_memory())
458
+
459
+ return batch
460
+
461
+ def prepare_extra_func_kwargs(self, func, kwargs) -> dict[str, Any]:
462
+ """
463
+ Prepare extra kwargs for the scheduler step / denoise step.
464
+
465
+ Args:
466
+ func: The function to prepare kwargs for.
467
+ kwargs: The kwargs to prepare.
468
+
469
+ Returns:
470
+ The prepared kwargs.
471
+ """
472
+ extra_step_kwargs = {}
473
+ # Keep only entries that func's signature actually accepts.
+ # NOTE(review): inspect.signature(func) is re-parsed for every key; hoisting it above the loop would avoid repeated parsing.
+ for k, v in kwargs.items():
474
+ accepts = k in set(inspect.signature(func).parameters.keys())
475
+ if accepts:
476
+ extra_step_kwargs[k] = v
477
+ return extra_step_kwargs
478
+
479
+ def progress_bar(self, iterable: Iterable | None = None, total: int | None = None) -> tqdm:
480
+ """
481
+ Create a progress bar for the denoising process.
482
+
483
+ Args:
484
+ iterable: The iterable to iterate over.
485
+ total: The total number of items.
486
+
487
+ Returns:
488
+ A tqdm progress bar.
489
+ """
490
+ # Only local rank 0 renders the bar; all other ranks get a disabled tqdm so multi-GPU logs stay clean.
+ local_rank = get_world_group().local_rank
491
+ if local_rank == 0:
492
+ return tqdm(iterable=iterable, total=total)
493
+ else:
494
+ return tqdm(iterable=iterable, total=total, disable=True)
495
+
496
+ def rescale_noise_cfg(self, noise_cfg, noise_pred_text, guidance_rescale=0.0) -> torch.Tensor:
497
+ """
498
+ Rescale noise prediction according to guidance_rescale.
499
+
500
+ Based on findings of "Common Diffusion Noise Schedules and Sample Steps are Flawed"
501
+ (https://arxiv.org/pdf/2305.08891.pdf), Section 3.4.
502
+
503
+ Args:
504
+ noise_cfg: The noise prediction with guidance.
505
+ noise_pred_text: The text-conditioned noise prediction.
506
+ guidance_rescale: The guidance rescale factor.
507
+
508
+ Returns:
509
+ The rescaled noise prediction.
510
+ """
511
+ # Per-sample std over all non-batch dims (dims 1..ndim-1), kept for broadcasting.
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
512
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
513
+ # Rescale the results from guidance (fixes overexposure)
514
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
515
+ # Mix with the original results from guidance by factor guidance_rescale
516
+ noise_cfg = (guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg)
517
+ return noise_cfg
518
+
519
+ def verify_input(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult:
520
+ """Verify denoising stage inputs."""
521
+ result = VerificationResult()
522
+ result.add_check("timesteps", batch.timesteps, [V.is_tensor, V.min_dims(1)])
523
+ result.add_check("latents", batch.latents, [V.is_tensor, V.with_dims(5)])
524
+ result.add_check("prompt_embeds", batch.prompt_embeds, V.list_not_empty)
525
+ result.add_check("image_embeds", batch.image_embeds, V.is_list)
526
+ result.add_check("image_latent", batch.image_latent, V.none_or_tensor_with_dims(5))
527
+ result.add_check("num_inference_steps", batch.num_inference_steps, V.positive_int)
528
+ result.add_check("guidance_scale", batch.guidance_scale, V.positive_float)
529
+ result.add_check("eta", batch.eta, V.non_negative_float)
530
+ result.add_check("generator", batch.generator, V.generator_or_list_generators)
531
+ result.add_check("do_classifier_free_guidance", batch.do_classifier_free_guidance, V.bool_value)
532
+ # negative_prompt_embeds is only required when classifier-free guidance is enabled.
+ result.add_check("negative_prompt_embeds", batch.negative_prompt_embeds,
533
+ lambda x: not batch.do_classifier_free_guidance or V.list_not_empty(x))
534
+ return result
535
+
536
+ def verify_output(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult:
537
+ """Verify denoising stage outputs."""
538
+ result = VerificationResult()
539
+ # Final latents must remain a 5-D tensor (presumably B, C, T, H, W — consistent with the channel-dim cats in forward).
+ result.add_check("latents", batch.latents, [V.is_tensor, V.with_dims(5)])
540
+ return result
541
+
542
+
543
+ class CosmosDenoisingStage(DenoisingStage):
544
+ """
545
+ Denoising stage for Cosmos models using FlowMatchEulerDiscreteScheduler.
546
+ """
547
+
548
+ def __init__(self, transformer, scheduler, pipeline=None) -> None:
+ # Reuse DenoisingStage setup; Cosmos passes no transformer_2/vae, so the parent defaults (None) apply.
549
+ super().__init__(transformer, scheduler, pipeline)
550
+
551
+ def forward(
552
+ self,
553
+ batch: ForwardBatch,
554
+ fastvideo_args: FastVideoArgs,
555
+ ) -> ForwardBatch:
556
+ pipeline = self.pipeline() if self.pipeline else None
557
+ if not fastvideo_args.model_loaded["transformer"]:
558
+ loader = TransformerLoader()
559
+ self.transformer = loader.load(fastvideo_args.model_paths["transformer"], fastvideo_args)
560
+ if pipeline:
561
+ pipeline.add_module("transformer", self.transformer)
562
+ fastvideo_args.model_loaded["transformer"] = True
563
+
564
+ extra_step_kwargs = self.prepare_extra_func_kwargs(
565
+ self.scheduler.step,
566
+ {
567
+ "generator": batch.generator,
568
+ "eta": batch.eta
569
+ },
570
+ )
571
+
572
+ if hasattr(self.transformer, 'module'):
573
+ transformer_dtype = next(self.transformer.module.parameters()).dtype
574
+ else:
575
+ transformer_dtype = next(self.transformer.parameters()).dtype
576
+ target_dtype = transformer_dtype
577
+ autocast_enabled = (target_dtype != torch.float32) and not fastvideo_args.disable_autocast
578
+
579
+ latents = batch.latents
580
+ num_inference_steps = batch.num_inference_steps
581
+ guidance_scale = batch.guidance_scale
582
+
583
+ sigma_max = 80.0
584
+ sigma_min = 0.002
585
+ sigma_data = 1.0
586
+ final_sigmas_type = "sigma_min"
587
+
588
+ if self.scheduler is not None:
589
+ self.scheduler.register_to_config(
590
+ sigma_max=sigma_max,
591
+ sigma_min=sigma_min,
592
+ sigma_data=sigma_data,
593
+ final_sigmas_type=final_sigmas_type,
594
+ )
595
+
596
+ self.scheduler.set_timesteps(num_inference_steps, device=latents.device)
597
+ timesteps = self.scheduler.timesteps
598
+
599
+ if (hasattr(self.scheduler.config, 'final_sigmas_type')
600
+ and self.scheduler.config.final_sigmas_type == "sigma_min" and len(self.scheduler.sigmas) > 1):
601
+ self.scheduler.sigmas[-1] = self.scheduler.sigmas[-2]
602
+
603
+ conditioning_latents = getattr(batch, 'conditioning_latents', None)
604
+ unconditioning_latents = conditioning_latents
605
+
606
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
607
+ for i, t in enumerate(timesteps):
608
+ if hasattr(self, 'interrupt') and self.interrupt:
609
+ continue
610
+
611
+ current_sigma = self.scheduler.sigmas[i]
612
+ current_t = current_sigma / (current_sigma + 1)
613
+ c_in = 1 - current_t
614
+ c_skip = 1 - current_t
615
+ c_out = -current_t
616
+
617
+ timestep = current_t.view(1, 1, 1, 1, 1).expand(latents.size(0), -1, latents.size(2), -1,
618
+ -1) # [B, 1, T, 1, 1]
619
+
620
+ with torch.autocast(device_type="cuda", dtype=target_dtype, enabled=autocast_enabled):
621
+
622
+ cond_latent = latents * c_in
623
+
624
+ if hasattr(
625
+ batch,
626
+ 'cond_indicator') and batch.cond_indicator is not None and conditioning_latents is not None:
627
+ cond_latent = batch.cond_indicator * conditioning_latents + (1 -
628
+ batch.cond_indicator) * cond_latent
629
+ else:
630
+ logger.warning(
631
+ "Step %s: Missing conditioning data - cond_indicator: %s, conditioning_latents: %s", i,
632
+ hasattr(batch, 'cond_indicator'), conditioning_latents is not None)
633
+
634
+ cond_latent = cond_latent.to(target_dtype)
635
+
636
+ cond_timestep = timestep
637
+ if hasattr(batch, 'cond_indicator') and batch.cond_indicator is not None:
638
+ sigma_conditioning = 0.0001
639
+ t_conditioning = sigma_conditioning / (sigma_conditioning + 1)
640
+ cond_timestep = batch.cond_indicator * t_conditioning + (1 - batch.cond_indicator) * timestep
641
+ cond_timestep = cond_timestep.to(target_dtype)
642
+
643
+ with set_forward_context(
644
+ current_timestep=i,
645
+ attn_metadata=None,
646
+ forward_batch=batch,
647
+ ):
648
+ # Use conditioning masks from CosmosLatentPreparationStage
649
+ condition_mask = batch.cond_mask.to(target_dtype) if hasattr(batch, 'cond_mask') else None
650
+ padding_mask = torch.zeros(1,
651
+ 1,
652
+ batch.height,
653
+ batch.width,
654
+ device=cond_latent.device,
655
+ dtype=target_dtype)
656
+
657
+ # Fallback if masks not available
658
+ if condition_mask is None:
659
+ batch_size, num_channels, num_frames, height, width = cond_latent.shape
660
+ condition_mask = torch.zeros(batch_size,
661
+ 1,
662
+ num_frames,
663
+ height,
664
+ width,
665
+ device=cond_latent.device,
666
+ dtype=target_dtype)
667
+
668
+ noise_pred = self.transformer(
669
+ hidden_states=cond_latent,
670
+ timestep=cond_timestep.to(target_dtype),
671
+ encoder_hidden_states=batch.prompt_embeds[0].to(target_dtype),
672
+ fps=24, # TODO: get fps from batch or config
673
+ condition_mask=condition_mask,
674
+ padding_mask=padding_mask,
675
+ return_dict=False,
676
+ )[0]
677
+
678
+ cond_pred = (c_skip * latents + c_out * noise_pred.float()).to(target_dtype)
679
+
680
+ if hasattr(
681
+ batch,
682
+ 'cond_indicator') and batch.cond_indicator is not None and conditioning_latents is not None:
683
+ cond_pred = batch.cond_indicator * conditioning_latents + (1 - batch.cond_indicator) * cond_pred
684
+
685
+ if batch.do_classifier_free_guidance and batch.negative_prompt_embeds is not None:
686
+ uncond_latent = latents * c_in
687
+
688
+ if hasattr(batch, 'uncond_indicator'
689
+ ) and batch.uncond_indicator is not None and unconditioning_latents is not None:
690
+ uncond_latent = batch.uncond_indicator * unconditioning_latents + (
691
+ 1 - batch.uncond_indicator) * uncond_latent
692
+
693
+ with set_forward_context(
694
+ current_timestep=i,
695
+ attn_metadata=None,
696
+ forward_batch=batch,
697
+ ):
698
+ uncond_condition_mask = batch.uncond_mask.to(target_dtype) if hasattr(
699
+ batch, 'uncond_mask') and batch.uncond_mask is not None else condition_mask
700
+
701
+ uncond_timestep = timestep
702
+ if hasattr(batch, 'uncond_indicator') and batch.uncond_indicator is not None:
703
+ sigma_conditioning = 0.0001
704
+ t_conditioning = sigma_conditioning / (sigma_conditioning + 1)
705
+ uncond_timestep = batch.uncond_indicator * t_conditioning + (
706
+ 1 - batch.uncond_indicator) * timestep
707
+ uncond_timestep = uncond_timestep.to(target_dtype)
708
+
709
+ noise_pred_uncond = self.transformer(
710
+ hidden_states=uncond_latent.to(target_dtype),
711
+ timestep=uncond_timestep.to(target_dtype),
712
+ encoder_hidden_states=batch.negative_prompt_embeds[0].to(target_dtype),
713
+ fps=24, # TODO: get fps from batch or config
714
+ condition_mask=uncond_condition_mask,
715
+ padding_mask=padding_mask,
716
+ return_dict=False,
717
+ )[0]
718
+
719
+ uncond_pred = (c_skip * latents + c_out * noise_pred_uncond.float()).to(target_dtype)
720
+
721
+ if hasattr(batch, 'uncond_indicator'
722
+ ) and batch.uncond_indicator is not None and unconditioning_latents is not None:
723
+ uncond_pred = batch.uncond_indicator * unconditioning_latents + (
724
+ 1 - batch.uncond_indicator) * uncond_pred
725
+
726
+ guidance_diff = cond_pred - uncond_pred
727
+ final_pred = cond_pred + guidance_scale * guidance_diff
728
+ else:
729
+ final_pred = cond_pred
730
+
731
+ # Convert to noise for scheduler step
732
+ if current_sigma > 1e-8:
733
+ noise_for_scheduler = (latents - final_pred) / current_sigma
734
+ else:
735
+ logger.warning("Step %s: current_sigma too small (%s), using final_pred directly", i, current_sigma)
736
+ noise_for_scheduler = final_pred
737
+
738
+ if torch.isnan(noise_for_scheduler).sum() > 0:
739
+ logger.error("Step %s: NaN detected in noise_for_scheduler, sum: %s", i,
740
+ noise_for_scheduler.float().sum().item())
741
+ logger.error("Step %s: latents sum: %s, final_pred sum: %s, current_sigma: %s", i,
742
+ latents.float().sum().item(),
743
+ final_pred.float().sum().item(), current_sigma)
744
+
745
+ latents = self.scheduler.step(noise_for_scheduler, t, latents, **extra_step_kwargs,
746
+ return_dict=False)[0]
747
+
748
+ progress_bar.update()
749
+
750
+ batch.latents = latents
751
+
752
+ return batch
753
+
754
+ def verify_input(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult:
755
+ """Verify Cosmos denoising stage inputs."""
756
+ result = VerificationResult()
757
+ result.add_check("latents", batch.latents, [V.is_tensor, V.with_dims(5)])
758
+ result.add_check("prompt_embeds", batch.prompt_embeds, V.list_not_empty)
759
+ result.add_check("num_inference_steps", batch.num_inference_steps, V.positive_int)
760
+ result.add_check("guidance_scale", batch.guidance_scale, V.positive_float)
761
+ result.add_check("do_classifier_free_guidance", batch.do_classifier_free_guidance, V.bool_value)
762
+ result.add_check("negative_prompt_embeds", batch.negative_prompt_embeds,
763
+ lambda x: not batch.do_classifier_free_guidance or V.list_not_empty(x))
764
+ return result
765
+
766
+ def verify_output(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult:
767
+ """Verify Cosmos denoising stage outputs."""
768
+ result = VerificationResult()
769
+ result.add_check("latents", batch.latents, [V.is_tensor, V.with_dims(5)])
770
+ return result
771
+
772
+
773
class Cosmos25DenoisingStage(CosmosDenoisingStage):
    """Denoising stage for Cosmos 2.5 DiT (expects 1D/2D timestep, not 5D).

    Runs a flow-matching denoising loop over ``batch.latents``. Whether the
    run is conditioned (V2W/I2W) is inferred purely from the presence of
    ``batch.conditioning_latents``; no explicit mode flag is consulted.
    """

    def forward(
        self,
        batch: ForwardBatch,
        fastvideo_args: FastVideoArgs,
    ) -> ForwardBatch:
        """Denoise ``batch.latents`` in place and return the batch.

        Args:
            batch: Forward batch carrying latents, prompt embeddings and
                optional conditioning fields attached by earlier stages.
            fastvideo_args: Global inference configuration.

        Returns:
            The same batch with ``batch.latents`` replaced by the denoised
            5-D latent tensor.
        """
        # Lazily load the transformer on first use and register it with the
        # owning pipeline (if any) so later stages can reuse it.
        pipeline = self.pipeline() if self.pipeline else None
        if not fastvideo_args.model_loaded["transformer"]:
            loader = TransformerLoader()
            self.transformer = loader.load(fastvideo_args.model_paths["transformer"], fastvideo_args)
            if pipeline:
                pipeline.add_module("transformer", self.transformer)
            fastvideo_args.model_loaded["transformer"] = True

        # Only pass generator/eta if the scheduler's step() accepts them.
        extra_step_kwargs = self.prepare_extra_func_kwargs(
            self.scheduler.step,
            {
                "generator": batch.generator,
                "eta": batch.eta
            },
        )

        # Model inputs are cast to the transformer's parameter dtype;
        # `module` is present when the model is wrapped (e.g. DDP/FSDP).
        if hasattr(self.transformer, 'module'):
            transformer_dtype = next(self.transformer.module.parameters()).dtype
        else:
            transformer_dtype = next(self.transformer.parameters()).dtype
        target_dtype = transformer_dtype
        autocast_enabled = (target_dtype != torch.float32) and not fastvideo_args.disable_autocast

        latents = batch.latents
        if latents is None:
            raise ValueError("latents must be provided for Cosmos25DenoisingStage")
        guidance_scale = batch.guidance_scale

        # Use scheduler-derived timesteps unless the batch provides its own.
        if batch.timesteps is None:
            self.scheduler.set_timesteps(batch.num_inference_steps, device=latents.device)
            timesteps = self.scheduler.timesteps
        else:
            timesteps = batch.timesteps.to(latents.device)

        cfg = fastvideo_args.pipeline_config

        # FPS conditioning: sample uniformly from [16, 32) when not given.
        if batch.fps is None:
            gen = batch.generator
            if isinstance(gen, list) and len(gen) > 0:
                gen = gen[0]
            fps_tensor = torch.randint(
                16,
                32,
                (1, ),
                generator=gen if isinstance(gen, torch.Generator) else None,
                device=latents.device,
            ).float().to(dtype=target_dtype)
        else:
            fps_val = batch.fps
            fps_tensor = torch.tensor(
                [fps_val],
                device=latents.device,
                dtype=target_dtype,
            )

        # Work on the first (and assumed only) batch element as a 4-D tensor;
        # re-batched via unsqueeze(0) when calling the model and at the end.
        latents_4d = latents[0]

        # Masks are optional for T2W.
        cond_mask = getattr(batch, "cond_mask", None)
        condition_mask = cond_mask.to(target_dtype) if isinstance(cond_mask, torch.Tensor) else None
        pad_mask = getattr(batch, "padding_mask", None)
        padding_mask = pad_mask.to(target_dtype) if isinstance(pad_mask, torch.Tensor) else None

        # Conditioning fields are attached by latent preparation stage.
        conditioning_latents = getattr(batch, "conditioning_latents", None)
        cond_indicator = getattr(batch, "cond_indicator", None)
        # Infer whether this is a conditioned run (V2W/I2W) purely from the presence
        # of conditioning latents. Avoid carrying explicit mode flags on the batch.
        is_conditioned = (conditioning_latents is not None)

        # Keep the initial noise around: conditioned runs derive the ground
        # truth velocity from it (gt_v = noise - gt_x0).
        init_noise_4d = latents_4d.clone()
        if condition_mask is None:
            _, t, h, w = latents_4d.shape
            condition_mask = torch.zeros(1, 1, t, h, w, device=latents.device, dtype=target_dtype)
        if padding_mask is None:
            _, _, h, w = latents_4d.shape
            # NOTE(review): default padding differs between conditioned (0.0)
            # and unconditioned (1.0) runs — presumably matching the values
            # the model was trained with; confirm against the model config.
            padding_default = 0.0 if is_conditioned else 1.0
            padding_mask = torch.full(
                (1, 1, h, w),
                float(padding_default),
                device=latents.device,
                dtype=target_dtype,
            )

        # Scheduler timesteps (~0..1000) are rescaled to the model's 0..1 range.
        timestep_scale = 0.001

        # The denoising state is kept in fp32 for numerical stability.
        state_dtype = torch.float32

        # Fixed small timestep fed to the model for already-clean conditioned frames.
        conditional_frame_timestep = 0.1
        latents_4d = latents_4d.to(state_dtype)
        init_noise_4d = init_noise_4d.to(state_dtype)

        # Whether to re-clamp conditioned frames to ground truth every step
        # (config-controlled) or only on the first step.
        clamp_every_step = bool(getattr(cfg, "cosmos25_clamp_every_step", True)) if is_conditioned else False

        with self.progress_bar(total=len(timesteps)) as progress_bar:
            for i, t in enumerate(timesteps):
                t_val = float(t)
                if is_conditioned:
                    # Per-frame timestep vector: conditioned frames get the
                    # fixed small timestep, others the current scheduler time.
                    t_frames = int(latents_4d.shape[1])
                    timestep = torch.full(
                        (1, t_frames),
                        float(t_val * timestep_scale),
                        device=latents.device,
                        dtype=torch.float32,
                    )
                    if cond_indicator is not None and t_frames > 0:
                        cond_t = cond_indicator[0, 0, :t_frames, 0, 0]
                        cond_mask_t = (cond_t > 0.5)
                        if bool(cond_mask_t.any().item()):
                            timestep[0, cond_mask_t] = float(conditional_frame_timestep)
                else:
                    # Unconditioned: a single scalar timestep for all frames.
                    timestep_val = t_val * timestep_scale
                    timestep = torch.tensor(
                        [[float(timestep_val)]],
                        device=latents.device,
                        dtype=target_dtype,
                    )

                # Conditioned runs: replace x_t with GT x0 on the conditioned frames.
                if (is_conditioned and cond_indicator is not None and conditioning_latents is not None
                        and (clamp_every_step or i == 0)):
                    cond_ind_4d = cond_indicator[0].to(state_dtype)
                    gt_x0 = conditioning_latents[0].to(state_dtype)
                    latents_4d = gt_x0 * cond_ind_4d + latents_4d * (1 - cond_ind_4d)

                model_hidden_states = latents_4d.unsqueeze(0)

                with (
                        set_forward_context(current_timestep=int(t_val), attn_metadata=None, forward_batch=batch),
                        torch.autocast(device_type="cuda", dtype=target_dtype, enabled=autocast_enabled),
                ):
                    # Conditional velocity prediction.
                    cond_v = self.transformer(
                        hidden_states=model_hidden_states.to(target_dtype),
                        encoder_hidden_states=batch.prompt_embeds[0].to(target_dtype),
                        timestep=timestep,
                        fps=fps_tensor,
                        condition_mask=condition_mask,
                        padding_mask=padding_mask,
                        return_dict=False,
                    )[0]

                    if batch.do_classifier_free_guidance and batch.negative_prompt_embeds:
                        # Unconditional (negative-prompt) velocity prediction.
                        uncond_v = self.transformer(
                            hidden_states=model_hidden_states.to(target_dtype),
                            encoder_hidden_states=batch.negative_prompt_embeds[0].to(target_dtype),
                            timestep=timestep,
                            fps=fps_tensor,
                            condition_mask=condition_mask,
                            padding_mask=padding_mask,
                            return_dict=False,
                        )[0]
                        # NOTE(review): CFG anchor differs by mode — conditioned
                        # runs extrapolate from cond_v, unconditioned from
                        # uncond_v (standard CFG). Presumably intentional;
                        # confirm against the reference Cosmos 2.5 sampler.
                        if is_conditioned:
                            v = cond_v + guidance_scale * (cond_v - uncond_v)
                        else:
                            v = uncond_v + guidance_scale * (cond_v - uncond_v)
                    else:
                        v = cond_v

                # Conditioned runs: replace velocity on conditioned frames with GT velocity.
                if (is_conditioned and cond_indicator is not None and conditioning_latents is not None):
                    cond_ind_4d = cond_indicator[0].to(state_dtype)
                    gt_x0 = conditioning_latents[0].to(state_dtype)
                    gt_v = init_noise_4d.to(state_dtype) - gt_x0
                    v = cond_ind_4d * gt_v + (1 - cond_ind_4d) * v.to(state_dtype)

                # Scheduler step expects batched tensors; un-batch the result.
                prev = self.scheduler.step(v.unsqueeze(0),
                                           t,
                                           latents_4d.unsqueeze(0),
                                           **extra_step_kwargs,
                                           return_dict=False)[0]
                latents_4d = prev.squeeze(0)

                progress_bar.update()

        batch.latents = latents_4d.to(target_dtype).unsqueeze(0)
        return batch
957
+
958
+
959
class Cosmos25T2WDenoisingStage(Cosmos25DenoisingStage):
    """Cosmos 2.5 Text2World denoising stage.

    Text2World is unconditioned: any conditioning attributes left on the
    batch by an earlier stage are cleared before the shared denoising loop
    runs, so the base class treats the run as pure T2W.
    """

    _CONDITIONING_FIELDS = (
        "conditioning_latents",
        "cond_indicator",
        "uncond_indicator",
    )

    def forward(
        self,
        batch: ForwardBatch,
        fastvideo_args: FastVideoArgs,
    ) -> ForwardBatch:
        # Null out stale conditioning state so the base stage infers T2W.
        for field in self._CONDITIONING_FIELDS:
            if hasattr(batch, field):
                setattr(batch, field, None)
        return super().forward(batch, fastvideo_args)
977
+
978
+
979
class Cosmos25V2WDenoisingStage(Cosmos25DenoisingStage):
    """Cosmos 2.5 Video2World denoising stage.

    Behaviour is identical to the base Cosmos 2.5 stage; the subclass exists
    so pipelines can name the V2W path explicitly.
    """

    def forward(
        self,
        batch: ForwardBatch,
        fastvideo_args: FastVideoArgs,
    ) -> ForwardBatch:
        # The base class detects conditioning from the batch contents itself.
        return super().forward(batch, fastvideo_args)
988
+
989
+
990
class Cosmos25AutoDenoisingStage(PipelineStage):
    """Dispatch Cosmos 2.5 denoising to the T2W or V2W/I2W stage.

    The choice is made per call: a batch carrying conditioning latents is
    handled by the Video2World stage, anything else by Text2World.
    """

    def __init__(self, transformer, scheduler) -> None:
        super().__init__()
        self._t2w = Cosmos25T2WDenoisingStage(transformer=transformer, scheduler=scheduler)
        self._v2w = Cosmos25V2WDenoisingStage(transformer=transformer, scheduler=scheduler)

    def pipeline(self):
        return self._v2w.pipeline() if self._v2w.pipeline else None

    def _route(self, batch: ForwardBatch) -> Cosmos25DenoisingStage:
        # Conditioned batches (V2W/I2W) carry conditioning latents.
        has_conditioning = getattr(batch, "conditioning_latents", None) is not None
        return self._v2w if has_conditioning else self._t2w

    def forward(
        self,
        batch: ForwardBatch,
        fastvideo_args: FastVideoArgs,
    ) -> ForwardBatch:
        return self._route(batch).forward(batch, fastvideo_args)

    def verify_input(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult:
        return self._route(batch).verify_input(batch, fastvideo_args)

    def verify_output(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult:
        return self._route(batch).verify_output(batch, fastvideo_args)
1022
+
1023
+
1024
class DmdDenoisingStage(DenoisingStage):
    """
    Denoising stage for DMD (Distribution Matching Distillation).

    Uses a fixed set of distilled denoising steps from the pipeline config
    and a FlowMatchEulerDiscreteScheduler with shift=8.0, regardless of the
    scheduler passed in.
    """

    def __init__(self, transformer, scheduler) -> None:
        super().__init__(transformer, scheduler)
        # DMD always uses this scheduler; the constructor argument is
        # intentionally overridden.
        self.scheduler = FlowMatchEulerDiscreteScheduler(shift=8.0)

    def forward(
        self,
        batch: ForwardBatch,
        fastvideo_args: FastVideoArgs,
    ) -> ForwardBatch:
        """
        Run the denoising loop.

        Args:
            batch: The current batch information.
            fastvideo_args: The inference arguments.

        Returns:
            The batch with denoised latents.
        """
        # Setup precision and autocast settings
        # TODO(will): make the precision configurable for inference
        # target_dtype = PRECISION_TO_TYPE[fastvideo_args.precision]
        target_dtype = torch.bfloat16
        autocast_enabled = (target_dtype != torch.float32) and not fastvideo_args.disable_autocast

        # Get timesteps and calculate warmup steps
        timesteps = batch.timesteps

        # TODO(will): remove this once we add input/output validation for stages
        if timesteps is None:
            raise ValueError("Timesteps must be provided")
        num_inference_steps = batch.num_inference_steps
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order

        # Prepare image latents and embeddings for I2V generation
        image_embeds = batch.image_embeds
        if len(image_embeds) > 0:
            assert torch.isnan(image_embeds[0]).sum() == 0
            image_embeds = [image_embed.to(target_dtype) for image_embed in image_embeds]

        # Only forward kwargs the transformer's forward() actually accepts.
        image_kwargs = self.prepare_extra_func_kwargs(
            self.transformer.forward,
            {
                "encoder_hidden_states_image": image_embeds,
                "mask_strategy": dict_to_3d_list(None, t_max=50, l_max=60, h_max=24)
            },
        )

        pos_cond_kwargs = self.prepare_extra_func_kwargs(
            self.transformer.forward,
            {
                "encoder_hidden_states_2": batch.clip_embedding_pos,
                "encoder_attention_mask": batch.prompt_attention_mask,
            },
        )

        # Get latents and embeddings
        assert batch.latents is not None, "latents must be provided"
        latents = batch.latents

        video_raw_latent_shape = latents.shape
        prompt_embeds = batch.prompt_embeds
        assert not torch.isnan(prompt_embeds[0]).any(), "prompt_embeds contains nan"
        # DMD uses the distilled step schedule from config, replacing the
        # batch-provided timesteps read above (those only size the warmup calc).
        timesteps = torch.tensor(fastvideo_args.pipeline_config.dmd_denoising_steps,
                                 dtype=torch.long,
                                 device=get_local_torch_device())

        # Run denoising loop
        with self.progress_bar(total=len(timesteps)) as progress_bar:
            for i, t in enumerate(timesteps):
                # Skip if interrupted
                if hasattr(self, 'interrupt') and self.interrupt:
                    continue
                # Expand latents for I2V
                noise_latents = latents.clone()
                latent_model_input = latents.to(target_dtype)

                if batch.image_latent is not None:
                    # Concatenate image conditioning latents along the channel dim.
                    latent_model_input = torch.cat(
                        [latent_model_input, batch.image_latent.permute(0, 2, 1, 3, 4)], dim=2).to(target_dtype)
                assert not torch.isnan(latent_model_input).any(), "latent_model_input contains nan"

                # Prepare inputs for transformer
                t_expand = t.repeat(latent_model_input.shape[0])
                # Embedded CFG scale is multiplied by 1000 as the model expects.
                guidance_expand = (torch.tensor(
                    [fastvideo_args.pipeline_config.embedded_cfg_scale] * latent_model_input.shape[0],
                    dtype=torch.float32,
                    device=get_local_torch_device(),
                ).to(target_dtype) * 1000.0 if fastvideo_args.pipeline_config.embedded_cfg_scale is not None else None)

                # Predict noise residual
                with torch.autocast(device_type="cuda", dtype=target_dtype, enabled=autocast_enabled):
                    # Build sparse-attention metadata only for the backends
                    # that need it (VSA or the sparse FP4 family).
                    if (vsa_available and self.attn_backend == VideoSparseAttentionBackend) or \
                       (sparse_fp4_available and self.attn_backend in sparse_fp4_backends):
                        self.attn_metadata_builder_cls = self.attn_backend.get_builder_cls()

                        if self.attn_metadata_builder_cls is not None:
                            self.attn_metadata_builder = self.attn_metadata_builder_cls()
                            # TODO(will): clean this up
                            attn_metadata = self.attn_metadata_builder.build(  # type: ignore
                                current_timestep=i,  # type: ignore
                                raw_latent_shape=batch.raw_latent_shape[2:5],  # type: ignore
                                patch_size=fastvideo_args.pipeline_config.  # type: ignore
                                dit_config.patch_size,  # type: ignore
                                VSA_sparsity=fastvideo_args.VSA_sparsity,  # type: ignore
                                device=get_local_torch_device(),  # type: ignore
                            )  # type: ignore
                            assert attn_metadata is not None, "attn_metadata cannot be None"
                        else:
                            attn_metadata = None
                    else:
                        attn_metadata = None

                    batch.is_cfg_negative = False
                    with set_forward_context(
                            current_timestep=i,
                            attn_metadata=attn_metadata,
                            forward_batch=batch,
                            # fastvideo_args=fastvideo_args
                    ):
                        # Run transformer (model works in [B, C, T, H, W] order,
                        # hence the permutes around the call).
                        pred_noise = self.transformer(
                            latent_model_input.permute(0, 2, 1, 3, 4),
                            prompt_embeds,
                            t_expand,
                            guidance=guidance_expand,
                            **image_kwargs,
                            **pos_cond_kwargs,
                        ).permute(0, 2, 1, 3, 4)

                    # Convert predicted noise to the clean-video estimate.
                    pred_video = pred_noise_to_pred_video(pred_noise=pred_noise.flatten(0, 1),
                                                          noise_input_latent=noise_latents.flatten(0, 1),
                                                          timestep=t_expand,
                                                          scheduler=self.scheduler).unflatten(0, pred_noise.shape[:2])

                    if i < len(timesteps) - 1:
                        # Re-noise the clean estimate to the next distilled step.
                        next_timestep = timesteps[i + 1] * torch.ones([1], dtype=torch.long, device=pred_video.device)
                        noise_generator = batch.generator[0] if isinstance(batch.generator, list) else batch.generator
                        noise = torch.randn(video_raw_latent_shape, dtype=pred_video.dtype,
                                            generator=noise_generator).to(self.device)
                        latents = self.scheduler.add_noise(pred_video.flatten(0, 1), noise.flatten(0, 1),
                                                           next_timestep).unflatten(0, pred_video.shape[:2])
                    else:
                        latents = pred_video

                # Update progress bar
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
                                               (i + 1) % self.scheduler.order == 0 and progress_bar is not None):
                    progress_bar.update()

        # Gather results if using sequence parallelism
        latents = latents.permute(0, 2, 1, 3, 4)
        # Update batch with final latents
        batch.latents = latents

        return batch
standalone_inference/overlay_files/fastvideo/platforms/cuda.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # Adapted from vllm: https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/platforms/cuda.py
3
+ """Code inside this file can safely assume cuda platform, e.g. importing
4
+ pynvml. However, it should not initialize cuda context.
5
+ """
6
+
7
+ import os
8
+ from collections.abc import Callable
9
+ from functools import lru_cache, wraps
10
+ from typing import TypeVar
11
+
12
+ import torch
13
+ from typing_extensions import ParamSpec
14
+
15
+ import fastvideo.envs as envs
16
+ from fastvideo.logger import init_logger
17
+ from fastvideo.platforms.interface import (AttentionBackendEnum, DeviceCapability, Platform, PlatformEnum)
18
+ from fastvideo.utils import import_pynvml
19
+
20
+ logger = init_logger(__name__)
21
+
22
+ _P = ParamSpec("_P")
23
+ _R = TypeVar("_R")
24
+
25
+ pynvml = import_pynvml() # type: ignore[no-untyped-call]
26
+
27
+ # pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models
28
+ # see https://github.com/huggingface/diffusers/issues/9704 for details
29
+ torch.backends.cuda.enable_cudnn_sdp(False)
30
+
31
+
32
+ def device_id_to_physical_device_id(device_id: int) -> int:
33
+ if "CUDA_VISIBLE_DEVICES" in os.environ:
34
+ device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
35
+ if device_ids == [""]:
36
+ msg = ("CUDA_VISIBLE_DEVICES is set to empty string, which means"
37
+ " GPU support is disabled. If you are using ray, please unset"
38
+ " the environment variable `CUDA_VISIBLE_DEVICES` inside the"
39
+ " worker/actor. "
40
+ "Check https://github.com/vllm-project/vllm/issues/8402 for"
41
+ " more information.")
42
+ raise RuntimeError(msg)
43
+ physical_device_id = device_ids[device_id]
44
+ return int(physical_device_id)
45
+ else:
46
+ return device_id
47
+
48
+
49
+ def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
50
+
51
+ @wraps(fn)
52
+ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
53
+ pynvml.nvmlInit()
54
+ try:
55
+ return fn(*args, **kwargs)
56
+ finally:
57
+ pynvml.nvmlShutdown()
58
+
59
+ return wrapper
60
+
61
+
62
+ class CudaPlatformBase(Platform):
63
+ _enum = PlatformEnum.CUDA
64
+ device_name: str = "cuda"
65
+ device_type: str = "cuda"
66
+ dispatch_key: str = "CUDA"
67
+ ray_device_key: str = "GPU"
68
+ device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
69
+
70
+ @classmethod
71
+ def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
72
+ raise NotImplementedError
73
+
74
+ @classmethod
75
+ def get_device_name(cls, device_id: int = 0) -> str:
76
+ raise NotImplementedError
77
+
78
+ @classmethod
79
+ def get_device_total_memory(cls, device_id: int = 0) -> int:
80
+ raise NotImplementedError
81
+
82
+ @classmethod
83
+ def is_async_output_supported(cls, enforce_eager: bool | None) -> bool:
84
+ if enforce_eager:
85
+ logger.warning("To see benefits of async output processing, enable CUDA "
86
+ "graph. Since, enforce-eager is enabled, async output "
87
+ "processor cannot be used")
88
+ return False
89
+ return True
90
+
91
+ @classmethod
92
+ def is_full_nvlink(cls, device_ids: list[int]) -> bool:
93
+ raise NotImplementedError
94
+
95
+ @classmethod
96
+ def log_warnings(cls) -> None:
97
+ pass
98
+
99
+ @classmethod
100
+ def get_current_memory_usage(cls, device: torch.types.Device | None = None) -> float:
101
+ torch.cuda.reset_peak_memory_stats(device)
102
+ return float(torch.cuda.max_memory_allocated(device))
103
+
104
+ @classmethod
105
+ def get_torch_device(cls) -> object:
106
+ """
107
+ Return torch.cuda
108
+ """
109
+ return torch.cuda
110
+
111
+ @classmethod
112
+ def get_attn_backend_cls(cls, selected_backend: AttentionBackendEnum | None, head_size: int,
113
+ dtype: torch.dtype) -> str:
114
+ # TODO(will): maybe come up with a more general interface for local attention
115
+ # if distributed is False, we always try to use Flash attn
116
+
117
+ logger.info("Trying FASTVIDEO_ATTENTION_BACKEND=%s", envs.FASTVIDEO_ATTENTION_BACKEND)
118
+ logger.info("Selected backend: %s", selected_backend)
119
+ if selected_backend == AttentionBackendEnum.SAGE_ATTN:
120
+ try:
121
+ from sageattention import sageattn # noqa: F401
122
+
123
+ from fastvideo.attention.backends.sage_attn import ( # noqa: F401
124
+ SageAttentionBackend)
125
+ logger.info("Using Sage Attention backend.")
126
+
127
+ return "fastvideo.attention.backends.sage_attn.SageAttentionBackend"
128
+ except ImportError as e:
129
+ logger.info(e)
130
+ logger.info("Sage Attention backend is not installed. Fall back to Flash Attention.")
131
+ elif selected_backend == AttentionBackendEnum.SAGE_ATTN_THREE:
132
+ try:
133
+ from sageattn3 import sageattn3_blackwell # noqa: F401
134
+
135
+ from fastvideo.attention.backends.sage_attn3 import ( # noqa: F401
136
+ SageAttention3Backend)
137
+ logger.info("Using Sage Attention 3 backend.")
138
+
139
+ return "fastvideo.attention.backends.sage_attn3.SageAttention3Backend"
140
+ except ImportError as e:
141
+ logger.info(e)
142
+ logger.info("Sage Attention 3 backend is not installed. Fall back to Flash Attention.")
143
+ elif selected_backend == AttentionBackendEnum.ATTN_QAT_INFER:
144
+ try:
145
+ from fastvideo.attention.backends.attn_qat_infer import ( # noqa: F401
146
+ AttnQatInferBackend, is_attn_qat_infer_available,
147
+ )
148
+ if not is_attn_qat_infer_available():
149
+ raise ImportError("attn_qat_infer could not be imported.")
150
+ logger.info("Using attn_qat_infer backend.")
151
+
152
+ return "fastvideo.attention.backends.attn_qat_infer.AttnQatInferBackend"
153
+ except ImportError as e:
154
+ logger.info(e)
155
+ logger.info("attn_qat_infer backend is not installed. Fall back to Flash Attention.")
156
+ elif selected_backend == AttentionBackendEnum.ATTN_QAT_TRAIN:
157
+ try:
158
+ from fastvideo_kernel.triton_kernels.attn_qat_train import attention # noqa: F401
159
+
160
+ from fastvideo.attention.backends.attn_qat_train import ( # noqa: F401
161
+ AttnQatTrainBackend)
162
+ logger.info("Using attn_qat_train backend.")
163
+
164
+ return "fastvideo.attention.backends.attn_qat_train.AttnQatTrainBackend"
165
+ except ImportError as e:
166
+ logger.info(e)
167
+ logger.info("attn_qat_train backend is not installed. Fall back to Flash Attention.")
168
+ elif selected_backend == AttentionBackendEnum.VIDEO_SPARSE_ATTN:
169
+ try:
170
+ from fastvideo_kernel import video_sparse_attn # noqa: F401
171
+
172
+ from fastvideo.attention.backends.video_sparse_attn import ( # noqa: F401
173
+ VideoSparseAttentionBackend)
174
+ logger.info("Using Video Sparse Attention backend.")
175
+
176
+ return "fastvideo.attention.backends.video_sparse_attn.VideoSparseAttentionBackend"
177
+ except ImportError as e:
178
+ logger.error("Failed to import Video Sparse Attention backend: %s", str(e))
179
+ raise ImportError("The Video Sparse Attention backend is not installed. "
180
+ "To install it, please follow the instructions at: "
181
+ "https://hao-ai-lab.github.io/FastVideo/video_sparse_attention/installation ") from e
182
+ elif selected_backend == AttentionBackendEnum.SPARSE_FP4_ATTN:
183
+ try:
184
+ from fastvideo.attention.backends.sparse_fp4_attn import ( # noqa: F401
185
+ SparseFP4AttentionBackend)
186
+ logger.info("Using Sparse FP4 Attention backend (FP4 quant + VSA).")
187
+ return "fastvideo.attention.backends.sparse_fp4_attn.SparseFP4AttentionBackend"
188
+ except ImportError as e:
189
+ logger.error("Failed to import Sparse FP4 Attention backend: %s", str(e))
190
+ raise ImportError("Sparse FP4 Attention backend is not available.") from e
191
+ elif selected_backend == AttentionBackendEnum.SPARSE_FP4_OURS_P_ATTN:
192
+ try:
193
+ from fastvideo.attention.backends.sparse_fp4_ours_p_attn import ( # noqa: F401
194
+ SparseFP4OursPAttentionBackend)
195
+ logger.info(
196
+ "Using Sparse FP4 Ours-P Attention backend (group-local P quant + VSA)."
197
+ )
198
+ return "fastvideo.attention.backends.sparse_fp4_ours_p_attn.SparseFP4OursPAttentionBackend"
199
+ except ImportError as e:
200
+ logger.error("Failed to import Sparse FP4 Ours-P Attention backend: %s", str(e))
201
+ raise ImportError("Sparse FP4 Ours-P Attention backend is not available.") from e
202
+ elif selected_backend == AttentionBackendEnum.BSA_ATTN:
203
+ try:
204
+ from fastvideo.attention.backends.bsa_attn import ( # noqa: F401
205
+ BSAAttentionBackend)
206
+ logger.info("Using BSA Attention backend.")
207
+
208
+ return "fastvideo.attention.backends.bsa_attn.BSAAttentionBackend"
209
+ except ImportError as e:
210
+ logger.error("Failed to import BSA Attention backend: %s", str(e))
211
+ raise ImportError("The BSA Attention backend failed to import.") from e
212
+ elif selected_backend == AttentionBackendEnum.VMOBA_ATTN:
213
+ try:
214
+ from fastvideo_kernel import moba_attn_varlen # noqa: F401
215
+ from fastvideo.attention.backends.vmoba import ( # noqa: F401
216
+ VMOBAAttentionBackend)
217
+ logger.info("Using Video MOBA Attention backend.")
218
+
219
+ return "fastvideo.attention.backends.vmoba.VMOBAAttentionBackend"
220
+ except ImportError as e:
221
+ logger.error("Failed to import Video MoBA Attention backend: %s", str(e))
222
+ raise ImportError("Video MoBA Attention backend is not installed. ") from e
223
+ elif selected_backend == AttentionBackendEnum.SLA_ATTN:
224
+ try:
225
+ from fastvideo.attention.backends.sla import ( # noqa: F401
226
+ SLAAttentionBackend)
227
+ logger.info("Using SLA (Sparse-Linear Attention) backend.")
228
+
229
+ return "fastvideo.attention.backends.sla.SLAAttentionBackend"
230
+ except ImportError as e:
231
+ logger.error("Failed to import SLA Attention backend: %s", str(e))
232
+ raise ImportError("SLA Attention backend is not available. ") from e
233
+ elif selected_backend == AttentionBackendEnum.SAGE_SLA_ATTN:
234
+ try:
235
+ from fastvideo.attention.backends.sla import ( # noqa: F401
236
+ SageSLAAttentionBackend)
237
+ logger.info("Using SageSLA (Quantized Sparse-Linear Attention) backend.")
238
+
239
+ return "fastvideo.attention.backends.sla.SageSLAAttentionBackend"
240
+ except ImportError as e:
241
+ logger.error("Failed to import SageSLA Attention backend: %s", str(e))
242
+ raise ImportError("SageSLA Attention backend requires spas_sage_attn. "
243
+ "Install with: pip install git+https://github.com/thu-ml/SpargeAttn.git") from e
244
+ elif selected_backend == AttentionBackendEnum.TORCH_SDPA:
245
+ logger.info("Using Torch SDPA backend.")
246
+ return "fastvideo.attention.backends.sdpa.SDPABackend"
247
+ elif selected_backend == AttentionBackendEnum.FLASH_ATTN or selected_backend is None:
248
+ pass
249
+ elif selected_backend:
250
+ raise ValueError(f"Invalid attention backend for {cls.device_name}")
251
+
252
+ target_backend = AttentionBackendEnum.FLASH_ATTN
253
+ if not cls.has_device_capability(80):
254
+ logger.info("Cannot use FlashAttention-2 backend for Volta and Turing "
255
+ "GPUs.")
256
+ target_backend = AttentionBackendEnum.TORCH_SDPA
257
+ elif dtype not in (torch.float16, torch.bfloat16):
258
+ logger.info("Cannot use FlashAttention-2 backend for dtype other than "
259
+ "torch.float16 or torch.bfloat16.")
260
+ target_backend = AttentionBackendEnum.TORCH_SDPA
261
+
262
+ # FlashAttn is valid for the model, checking if the package is
263
+ # installed.
264
+ if target_backend == AttentionBackendEnum.FLASH_ATTN:
265
+ try:
266
+ import flash_attn # noqa: F401
267
+
268
+ from fastvideo.attention.backends.flash_attn import ( # noqa: F401
269
+ FlashAttentionBackend)
270
+
271
+ supported_sizes = \
272
+ FlashAttentionBackend.get_supported_head_sizes()
273
+ if head_size not in supported_sizes:
274
+ logger.info("Cannot use FlashAttention-2 backend for head size %d.", head_size)
275
+ target_backend = AttentionBackendEnum.TORCH_SDPA
276
+ except ImportError:
277
+ logger.info("Cannot use FlashAttention-2 backend because the "
278
+ "flash_attn package is not found. "
279
+ "Make sure that flash_attn was built and installed "
280
+ "(on by default).")
281
+ target_backend = AttentionBackendEnum.TORCH_SDPA
282
+
283
+ if target_backend == AttentionBackendEnum.TORCH_SDPA:
284
+ logger.info("Using Torch SDPA backend.")
285
+
286
+ return "fastvideo.attention.backends.sdpa.SDPABackend"
287
+
288
+ logger.info("Using Flash Attention backend.")
289
+
290
+ return "fastvideo.attention.backends.flash_attn.FlashAttentionBackend"
291
+
292
+ @classmethod
293
+ def get_device_communicator_cls(cls) -> str:
294
+ return "fastvideo.distributed.device_communicators.cuda_communicator.CudaCommunicator" # noqa
295
+
296
+
297
# NVML utils
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
# all the related functions work on real physical device ids.
# the major benefit of using NVML is that it will not initialize CUDA
class NvmlCudaPlatform(CudaPlatformBase):
    """CUDA platform whose device queries go through NVML (pynvml).

    Every query decorates with ``with_nvml_context`` so NVML is
    initialized/shut down around the call, and caches results with
    ``lru_cache`` keyed on (cls, device_id).  Because of the decorator
    order, cached hits skip NVML entirely.
    """

    @classmethod
    @lru_cache(maxsize=8)
    @with_nvml_context
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
        """Return the compute capability of ``device_id``, or None on failure."""
        try:
            # Map logical id (subject to CUDA_VISIBLE_DEVICES) to physical id,
            # since NVML only understands physical ids.
            physical_device_id = device_id_to_physical_device_id(device_id)
            handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
            major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
            return DeviceCapability(major=major, minor=minor)
        except RuntimeError:
            return None

    @classmethod
    @lru_cache(maxsize=8)
    @with_nvml_context
    def has_device_capability(
        cls,
        capability: tuple[int, int] | int,
        device_id: int = 0,
    ) -> bool:
        """Like the base implementation, but never raises on NVML errors."""
        try:
            return bool(super().has_device_capability(capability, device_id))
        except RuntimeError:
            return False

    @classmethod
    @lru_cache(maxsize=8)
    @with_nvml_context
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the device name reported by NVML for ``device_id``."""
        physical_device_id = device_id_to_physical_device_id(device_id)
        return cls._get_physical_device_name(physical_device_id)

    @classmethod
    @lru_cache(maxsize=8)
    @with_nvml_context
    def get_device_uuid(cls, device_id: int = 0) -> str:
        """Return the NVML UUID string of ``device_id``."""
        physical_device_id = device_id_to_physical_device_id(device_id)
        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
        return str(pynvml.nvmlDeviceGetUUID(handle))

    @classmethod
    @lru_cache(maxsize=8)
    @with_nvml_context
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return total device memory in bytes for ``device_id``."""
        physical_device_id = device_id_to_physical_device_id(device_id)
        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
        return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)

    @classmethod
    @with_nvml_context
    def is_full_nvlink(cls, physical_device_ids: list[int]) -> bool:
        """
        query if the set of gpus are fully connected by nvlink (1 hop)
        """
        handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids]
        # Check every unordered pair; any missing NVLink (or query failure)
        # means the set is not fully connected.
        for i, handle in enumerate(handles):
            for j, peer_handle in enumerate(handles):
                if i < j:
                    try:
                        p2p_status = pynvml.nvmlDeviceGetP2PStatus(
                            handle,
                            peer_handle,
                            pynvml.NVML_P2P_CAPS_INDEX_NVLINK,
                        )
                        if p2p_status != pynvml.NVML_P2P_STATUS_OK:
                            return False
                    except pynvml.NVMLError:
                        logger.exception("NVLink detection failed. This is normal if"
                                         " your machine has no NVLink equipped.")
                        return False
        return True

    @classmethod
    def _get_physical_device_name(cls, device_id: int = 0) -> str:
        # NOTE: takes a *physical* NVML index, not a logical CUDA ordinal.
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        return str(pynvml.nvmlDeviceGetName(handle))

    @classmethod
    @with_nvml_context
    def log_warnings(cls) -> None:
        """Warn when heterogeneous GPUs are visible without PCI_BUS_ID ordering."""
        device_ids: int = pynvml.nvmlDeviceGetCount()
        if device_ids > 1:
            device_names = [cls._get_physical_device_name(i) for i in range(device_ids)]
            if (len(set(device_names)) > 1 and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
                logger.warning(
                    "Detected different devices in the system: %s. Please"
                    " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
                    "avoid unexpected behavior.",
                    ", ".join(device_names),
                )
393
+
394
+
395
class NonNvmlCudaPlatform(CudaPlatformBase):
    """CUDA platform backed by ``torch.cuda`` queries.

    Used when NVML is unavailable (e.g. on Jetson, where pynvml.nvmlInit
    fails); see the autodetection logic at the bottom of this module.
    """

    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        """Return the CUDA compute capability of ``device_id``."""
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the device name of ``device_id``."""
        return str(torch.cuda.get_device_name(device_id))

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return the total memory of ``device_id`` in bytes."""
        device_props = torch.cuda.get_device_properties(device_id)
        return int(device_props.total_memory)

    @classmethod
    def is_full_nvlink(cls, physical_device_ids: list[int]) -> bool:
        """Without NVML, NVLink topology cannot be queried; assume absent.

        Fix: the original used ``logger.exception`` here even though no
        exception is active, which appends a spurious ``NoneType: None``
        traceback to the log; ``logger.warning`` carries the same message
        without the bogus traceback.
        """
        logger.warning("NVLink detection not possible, as context support was"
                       " not found. Assuming no NVLink available.")
        return False
416
+
417
+
418
# Autodetect either NVML-enabled or non-NVML platform
# based on whether NVML is available.
nvml_available = False
try:
    try:
        # Probe NVML by initializing it once; success means the NVML-backed
        # platform can be used.
        pynvml.nvmlInit()
        nvml_available = True
    except Exception:
        # On Jetson, NVML is not supported.
        nvml_available = False
finally:
    # Shut down the probe session; platform methods re-enter NVML via
    # the with_nvml_context decorator as needed.
    if nvml_available:
        pynvml.nvmlShutdown()

CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform

try:
    # Under a Sphinx docs build, pynvml may be a mock module; skip the
    # device-warning pass in that case to avoid touching fake handles.
    from sphinx.ext.autodoc.mock import _MockModule

    if not isinstance(pynvml, _MockModule):
        CudaPlatform.log_warnings()
except ModuleNotFoundError:
    # Sphinx not installed: normal runtime, emit warnings as usual.
    CudaPlatform.log_warnings()
standalone_inference/overlay_files/fastvideo/platforms/interface.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import enum
2
+ import random
3
+ from typing import Any, NamedTuple
4
+
5
+ import numpy as np
6
+ import torch
7
+
8
+ from fastvideo.logger import init_logger
9
+
10
+ logger = init_logger(__name__)
11
+
12
+
13
class AttentionBackendEnum(enum.Enum):
    """Selectable attention backend implementations.

    Values are written out explicitly but reproduce exactly what
    ``enum.auto()`` would assign (consecutive integers starting at 1),
    so serialized values remain stable.
    """

    FLASH_ATTN = 1
    TORCH_SDPA = 2
    SAGE_ATTN = 3
    SAGE_ATTN_THREE = 4
    ATTN_QAT_INFER = 5
    ATTN_QAT_TRAIN = 6
    VIDEO_SPARSE_ATTN = 7
    BSA_ATTN = 8
    VMOBA_ATTN = 9
    SLA_ATTN = 10
    SAGE_SLA_ATTN = 11
    SPARSE_FP4_ATTN = 12
    SPARSE_FP4_OURS_P_ATTN = 13
    NO_ATTENTION = 14
28
+
29
+
30
class PlatformEnum(enum.Enum):
    """Hardware platform families recognized by FastVideo.

    Explicit values mirror the original ``enum.auto()`` numbering
    (1-based, in declaration order).
    """

    CUDA = 1
    ROCM = 2
    TPU = 3
    XPU = 4
    CPU = 5
    MPS = 6
    OOT = 7
    UNSPECIFIED = 8
    NPU = 9
40
+
41
+
42
class CpuArchEnum(enum.Enum):
    """CPU instruction-set families (explicit values match ``enum.auto()``)."""

    X86 = 1
    ARM = 2
    UNSPECIFIED = 3
46
+
47
+
48
class DeviceCapability(NamedTuple):
    """A device compute capability as a (major, minor) version pair."""

    major: int
    minor: int

    def as_version_str(self) -> str:
        """Render the capability as ``"<major>.<minor>"``."""
        return ".".join((str(self.major), str(self.minor)))

    def to_int(self) -> int:
        """
        Express device capability as an integer ``<major><minor>``.

        It is assumed that the minor version is always a single digit.
        """
        assert 0 <= self.minor < 10
        return 10 * self.major + self.minor
63
+
64
+
65
class Platform:
    """Abstract base describing a hardware platform (CUDA, ROCm, TPU, ...).

    Concrete subclasses set ``_enum`` / ``device_name`` / ``device_type``
    and override the device-query classmethods; most base implementations
    are stubs that return a neutral value or raise ``NotImplementedError``.
    """

    _enum: PlatformEnum
    device_name: str
    device_type: str

    dispatch_key: str = "CPU"

    # platform-agnostic way to specify the device control environment variable,
    # .e.g. CUDA_VISIBLE_DEVICES for CUDA.
    # hint: search for "get_visible_accelerator_ids_env_var" in
    # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators # noqa
    device_control_env_var: str = "FASTVIDEO_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"

    # available ray device keys:
    # https://github.com/ray-project/ray/blob/10ba5adadcc49c60af2c358a33bb943fb491a171/python/ray/_private/ray_constants.py#L438 # noqa
    # empty string means the device does not support ray
    ray_device_key: str = ""
    # The torch.compile backend for compiling simple and
    # standalone functions. The default value is "inductor" to keep
    # the same behavior as PyTorch.
    # NOTE: for the forward part of the model, vLLM has another separate
    # compilation strategy.
    simple_compile_backend: str = "inductor"

    # NOTE(review): mutable class-level defaults are shared across all
    # subclasses that do not override them — intentional here since they
    # are treated as read-only registries.
    supported_quantization: list[str] = []

    additional_env_vars: list[str] = []

    def is_cuda(self) -> bool:
        """True iff this platform is NVIDIA CUDA."""
        return self._enum == PlatformEnum.CUDA

    def is_rocm(self) -> bool:
        """True iff this platform is AMD ROCm."""
        return self._enum == PlatformEnum.ROCM

    def is_tpu(self) -> bool:
        """True iff this platform is a TPU."""
        return self._enum == PlatformEnum.TPU

    def is_xpu(self) -> bool:
        """True iff this platform is an Intel XPU."""
        return self._enum == PlatformEnum.XPU

    def is_cpu(self) -> bool:
        """True iff this platform is plain CPU."""
        return self._enum == PlatformEnum.CPU

    def is_out_of_tree(self) -> bool:
        """True iff this platform is an out-of-tree (plugin) platform."""
        return self._enum == PlatformEnum.OOT

    def is_cuda_alike(self) -> bool:
        """Stateless version of :func:`torch.cuda.is_available`."""
        return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)

    def is_mps(self) -> bool:
        """True iff this platform is Apple MPS."""
        return self._enum == PlatformEnum.MPS

    def is_npu(self) -> bool:
        """True iff this platform is an NPU."""
        return self._enum == PlatformEnum.NPU

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: AttentionBackendEnum | None, head_size: int,
                             dtype: torch.dtype) -> str:
        """Get the attention backend class of a device."""
        # Base class has no opinion; subclasses return a dotted class path.
        return ""

    @classmethod
    def get_device_capability(
        cls,
        device_id: int = 0,
    ) -> DeviceCapability | None:
        """Stateless version of :func:`torch.cuda.get_device_capability`."""
        return None

    @classmethod
    def has_device_capability(
        cls,
        capability: tuple[int, int] | int,
        device_id: int = 0,
    ) -> bool:
        """
        Test whether this platform is compatible with a device capability.

        The ``capability`` argument can either be:

        - A tuple ``(major, minor)``.
        - An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)
        """
        current_capability = cls.get_device_capability(device_id=device_id)
        if current_capability is None:
            return False

        # Tuple comparison uses NamedTuple's lexicographic ordering;
        # integer comparison uses the <major><minor> encoding.
        if isinstance(capability, tuple):
            return current_capability >= capability

        return current_capability.to_int() >= capability

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Get the name of a device."""
        raise NotImplementedError

    @classmethod
    def get_device_uuid(cls, device_id: int = 0) -> str:
        """Get the uuid of a device, e.g. the PCI bus ID."""
        raise NotImplementedError

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Get the total memory of a device in bytes."""
        raise NotImplementedError

    @classmethod
    def is_async_output_supported(cls, enforce_eager: bool | None) -> bool:
        """
        Check if the current platform supports async output.
        """
        raise NotImplementedError

    @classmethod
    def get_torch_device(cls) -> Any:
        """
        Check if the current platform supports torch device.
        """
        raise NotImplementedError

    @classmethod
    def inference_mode(cls):
        """A device-specific wrapper of `torch.inference_mode`.

        This wrapper is recommended because some hardware backends such as TPU
        do not support `torch.inference_mode`. In such a case, they will fall
        back to `torch.no_grad` by overriding this method.
        """
        return torch.inference_mode(mode=True)

    @classmethod
    def seed_everything(cls, seed: int | None = None) -> None:
        """
        Set the seed of each random module.
        `torch.manual_seed` will set seed on all devices.

        Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20
        """
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)

    @classmethod
    def verify_model_arch(cls, model_arch: str) -> None:
        """
        Verify whether the current platform supports the specified model
        architecture.

        - This will raise an Error or Warning based on the model support on
        the current platform.
        - By default all models are considered supported.
        """
        pass

    @classmethod
    def verify_quantization(cls, quant: str) -> None:
        """
        Verify whether the quantization is supported by the current platform.
        """
        if cls.supported_quantization and \
            quant not in cls.supported_quantization:
            raise ValueError(f"{quant} quantization is currently not supported in "
                             f"{cls.device_name}.")

    @classmethod
    def get_current_memory_usage(cls, device: torch.types.Device | None = None) -> float:
        """
        Return the memory usage in bytes.
        """
        raise NotImplementedError

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """
        Get device specific communicator class for distributed communication.
        """
        return "fastvideo.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase"  # noqa

    @classmethod
    def get_cpu_architecture(cls) -> CpuArchEnum:
        """Get the CPU architecture of the current platform."""
        return CpuArchEnum.UNSPECIFIED
251
+
252
+
253
class UnspecifiedPlatform(Platform):
    """Fallback platform used before (or without) hardware detection."""

    _enum = PlatformEnum.UNSPECIFIED
    # No concrete torch device type is associated with this placeholder.
    device_type = ""
standalone_inference/overlay_files/fastvideo/train/models/wan/wan.py ADDED
@@ -0,0 +1,680 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Wan model plugin (per-role instance)."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import copy
7
+ import gc
8
+ from typing import Any, Literal, TYPE_CHECKING
9
+
10
+ import torch
11
+
12
+ import fastvideo.envs as envs
13
+ from fastvideo.configs.sample import SamplingParam
14
+ from fastvideo.distributed import (
15
+ get_sp_group,
16
+ get_world_group,
17
+ )
18
+ from fastvideo.forward_context import set_forward_context
19
+ from fastvideo.models.schedulers.scheduling_flow_match_euler_discrete import (
20
+ FlowMatchEulerDiscreteScheduler, )
21
+ from fastvideo.pipelines import TrainingBatch
22
+ from fastvideo.pipelines.basic.wan.wan_pipeline import (
23
+ WanPipeline, )
24
+ from fastvideo.pipelines.pipeline_batch_info import (
25
+ ForwardBatch, )
26
+ from fastvideo.training.activation_checkpoint import (
27
+ apply_activation_checkpointing, )
28
+ from fastvideo.training.training_utils import (
29
+ compute_density_for_timestep_sampling,
30
+ get_sigmas,
31
+ normalize_dit_input,
32
+ shift_timestep,
33
+ )
34
+ from fastvideo.utils import (
35
+ is_vmoba_available,
36
+ is_vsa_available,
37
+ )
38
+
39
+ from fastvideo.train.models.base import ModelBase
40
+ from fastvideo.train.utils.module_state import (
41
+ apply_trainable, )
42
+ from fastvideo.train.utils.moduleloader import (
43
+ load_module_from_path, )
44
+
45
+ if TYPE_CHECKING:
46
+ from fastvideo.train.utils.training_config import (
47
+ TrainingConfig, )
48
+
49
+ VideoSparseAttentionMetadataBuilder: type[Any] | None
50
+ VideoMobaAttentionMetadataBuilder: type[Any] | None
51
+
52
+ try:
53
+ from fastvideo.attention.backends.video_sparse_attn import (
54
+ VideoSparseAttentionMetadataBuilder as _VideoSparseAttentionMetadataBuilder, )
55
+ from fastvideo.attention.backends.vmoba import (
56
+ VideoMobaAttentionMetadataBuilder as _VideoMobaAttentionMetadataBuilder, )
57
+ VideoSparseAttentionMetadataBuilder = _VideoSparseAttentionMetadataBuilder
58
+ VideoMobaAttentionMetadataBuilder = _VideoMobaAttentionMetadataBuilder
59
+ except Exception:
60
+ VideoSparseAttentionMetadataBuilder = None
61
+ VideoMobaAttentionMetadataBuilder = None
62
+
63
+
64
+ class WanModel(ModelBase):
65
+ """Wan per-role model: owns transformer + noise_scheduler."""
66
+
67
+ _transformer_cls_name: str = "WanTransformer3DModel"
68
+
69
    def __init__(
        self,
        *,
        init_from: str,
        training_config: TrainingConfig,
        trainable: bool = True,
        disable_custom_init_weights: bool = False,
        flow_shift: float = 3.0,
        enable_gradient_checkpointing_type: str
        | None = None,
        transformer_override_safetensor: str
        | None = None,
    ) -> None:
        """Build the Wan role model: transformer + flow-match scheduler.

        Args:
            init_from: path to load the transformer weights from.
            training_config: global training configuration object.
            trainable: whether the transformer's parameters require grad.
            disable_custom_init_weights: forwarded to the module loader.
            flow_shift: shift for the flow-match scheduler; also seeds
                ``timestep_shift`` (may be overridden later by
                ``_init_timestep_mechanics``).
            enable_gradient_checkpointing_type: checkpointing flavor, or
                None to fall back to ``training_config.model``.
            transformer_override_safetensor: optional weight override file.
        """
        self._init_from = str(init_from)
        self._trainable = bool(trainable)

        self.transformer = self._load_transformer(
            init_from=self._init_from,
            trainable=self._trainable,
            disable_custom_init_weights=(disable_custom_init_weights),
            enable_gradient_checkpointing_type=(enable_gradient_checkpointing_type),
            training_config=training_config,
            transformer_override_safetensor=(transformer_override_safetensor),
        )

        self.noise_scheduler = (FlowMatchEulerDiscreteScheduler(shift=float(flow_shift)))

        # Filled by init_preprocessors (student only).
        self.vae: Any = None
        self.training_config: TrainingConfig = training_config
        self.dataloader: Any = None
        self.validator: Any = None
        self.start_step: int = 0

        # Process groups; populated in init_preprocessors.
        self.world_group: Any = None
        self.sp_group: Any = None

        # CFG negative conditioning; populated lazily by
        # ensure_negative_conditioning.
        self.negative_prompt_embeds: (torch.Tensor | None) = None
        self.negative_prompt_attention_mask: (torch.Tensor | None) = None

        # Timestep mechanics.
        self.timestep_shift: float = float(flow_shift)
        self.num_train_timestep: int = int(self.noise_scheduler.num_train_timesteps)
        self.min_timestep: int = 0
        self.max_timestep: int = self.num_train_timestep
114
+
115
    def _load_transformer(
        self,
        *,
        init_from: str,
        trainable: bool,
        disable_custom_init_weights: bool,
        enable_gradient_checkpointing_type: str | None,
        training_config: TrainingConfig,
        transformer_override_safetensor: str | None = None,
    ) -> torch.nn.Module:
        """Load the transformer, set trainability, and optionally wrap it
        with activation checkpointing.

        Checkpointing is applied only when the module is trainable AND a
        checkpointing type is configured (explicitly or via
        ``training_config.model``).
        """
        transformer = load_module_from_path(
            model_path=init_from,
            module_type="transformer",
            training_config=training_config,
            disable_custom_init_weights=(disable_custom_init_weights),
            override_transformer_cls_name=(self._transformer_cls_name),
            transformer_override_safetensor=(transformer_override_safetensor),
        )
        transformer = apply_trainable(transformer, trainable=trainable)
        # Fall back to training_config.model if not set on the
        # model YAML section directly.
        ckpt_type = (enable_gradient_checkpointing_type or getattr(
            getattr(training_config, "model", None),
            "enable_gradient_checkpointing_type",
            None,
        ))
        if trainable and ckpt_type:
            transformer = apply_activation_checkpointing(
                transformer,
                checkpointing_type=ckpt_type,
            )
        return transformer
147
+
148
+ # ------------------------------------------------------------------
149
+ # Lifecycle
150
+ # ------------------------------------------------------------------
151
+
152
    def init_preprocessors(self, training_config: TrainingConfig) -> None:
        """Initialize VAE, process groups, timestep mechanics and the
        parquet T2V dataloader (called on the student role only, per the
        comment in ``__init__``)."""
        self.vae = load_module_from_path(
            model_path=str(training_config.model_path),
            module_type="vae",
            training_config=training_config,
        )

        self.world_group = get_world_group()
        self.sp_group = get_sp_group()

        self._init_timestep_mechanics()

        # Imported locally to avoid import cycles / optional deps at
        # module import time.
        from fastvideo.dataset.dataloader.schema import (
            pyarrow_schema_t2v, )
        from fastvideo.train.utils.dataloader import (
            build_parquet_t2v_train_dataloader, )

        # Text sequence length comes from the first text-encoder config.
        text_len = (
            training_config.pipeline_config.text_encoder_configs[  # type: ignore[union-attr]
                0].arch_config.text_len)
        self.dataloader = build_parquet_t2v_train_dataloader(
            training_config.data,
            text_len=int(text_len),
            parquet_schema=pyarrow_schema_t2v,
        )
        self.start_step = 0
178
+
179
    @property
    def num_train_timesteps(self) -> int:
        """Total number of scheduler training timesteps, as a plain int."""
        return int(self.num_train_timestep)
182
+
183
+ def shift_and_clamp_timestep(self, timestep: torch.Tensor) -> torch.Tensor:
184
+ timestep = shift_timestep(
185
+ timestep,
186
+ self.timestep_shift,
187
+ self.num_train_timestep,
188
+ )
189
+ return timestep.clamp(self.min_timestep, self.max_timestep)
190
+
191
    def on_train_start(self) -> None:
        """Training-start hook: materialize the negative-prompt
        conditioning so CFG is available from the first step."""
        self.ensure_negative_conditioning()
193
+
194
+ # ------------------------------------------------------------------
195
+ # Runtime primitives
196
+ # ------------------------------------------------------------------
197
+
198
    def prepare_batch(
        self,
        raw_batch: dict[str, Any],
        *,
        generator: torch.Generator,
        latents_source: Literal["data", "zeros"] = "data",
    ) -> TrainingBatch:
        """Turn a raw dataloader batch into a fully-prepared TrainingBatch.

        Args:
            raw_batch: dict with at least ``text_embedding`` and
                ``text_attention_mask``; ``vae_latent`` is required when
                ``latents_source == "data"``.
            generator: RNG used for noise/timestep sampling downstream.
            latents_source: take latents from the data, or build an
                all-zeros latent tensor sized from the config.

        Returns:
            A TrainingBatch with latents, text conditioning, noise/timesteps
            (via ``_prepare_dit_inputs``) and attention metadata. A deep copy
            of the metadata is kept in ``attn_metadata_vsa`` while the main
            ``attn_metadata`` gets sparsity 0.0 (dense pass).
        """
        self.ensure_negative_conditioning()
        assert self.training_config is not None
        tc = self.training_config

        dtype = self._get_training_dtype()
        device = self.device

        training_batch = TrainingBatch()
        encoder_hidden_states = raw_batch["text_embedding"]
        encoder_attention_mask = raw_batch["text_attention_mask"]
        infos = raw_batch.get("info_list")

        if latents_source == "zeros":
            # Derive the latent grid from the VAE compression config.
            batch_size = encoder_hidden_states.shape[0]
            vae_config = (
                tc.pipeline_config.vae_config.arch_config  # type: ignore[union-attr]
            )
            num_channels = vae_config.z_dim
            spatial_compression_ratio = (vae_config.spatial_compression_ratio)
            latent_height = (tc.data.num_height // spatial_compression_ratio)
            latent_width = (tc.data.num_width // spatial_compression_ratio)
            latents = torch.zeros(
                batch_size,
                num_channels,
                tc.data.num_latent_t,
                latent_height,
                latent_width,
                device=device,
                dtype=dtype,
            )
        elif latents_source == "data":
            if "vae_latent" not in raw_batch:
                raise ValueError("vae_latent not found in batch "
                                 "and latents_source='data'")
            latents = raw_batch["vae_latent"]
            # Truncate the temporal axis to the configured latent length.
            latents = latents[:, :, :tc.data.num_latent_t]
            latents = latents.to(device, dtype=dtype)
        else:
            raise ValueError(f"Unknown latents_source: "
                             f"{latents_source!r}")

        training_batch.latents = latents
        # NOTE(review): the attention mask is cast to the training float
        # dtype here rather than kept integral — presumably consumed as a
        # float mask downstream; confirm against the pipeline.
        training_batch.encoder_hidden_states = (encoder_hidden_states.to(device, dtype=dtype))
        training_batch.encoder_attention_mask = (encoder_attention_mask.to(device, dtype=dtype))
        training_batch.infos = infos

        training_batch.latents = normalize_dit_input("wan", training_batch.latents, self.vae)
        training_batch = self._prepare_dit_inputs(training_batch, generator)
        training_batch = self._build_attention_metadata(training_batch)

        # Keep the sparse (VSA) metadata aside, then force the primary
        # metadata to dense (sparsity 0.0) for the regular forward pass.
        training_batch.attn_metadata_vsa = copy.deepcopy(training_batch.attn_metadata)
        if training_batch.attn_metadata is not None:
            training_batch.attn_metadata.VSA_sparsity = 0.0  # type: ignore[attr-defined]

        return training_batch
260
+
261
+ def add_noise(
262
+ self,
263
+ clean_latents: torch.Tensor,
264
+ noise: torch.Tensor,
265
+ timestep: torch.Tensor,
266
+ ) -> torch.Tensor:
267
+ b, t = clean_latents.shape[:2]
268
+ noisy = self.noise_scheduler.add_noise(
269
+ clean_latents.flatten(0, 1),
270
+ noise.flatten(0, 1),
271
+ timestep,
272
+ ).unflatten(0, (b, t))
273
+ return noisy
274
+
275
    def predict_noise(
        self,
        noisy_latents: torch.Tensor,
        timestep: torch.Tensor,
        batch: TrainingBatch,
        *,
        conditional: bool,
        cfg_uncond: dict[str, Any] | None = None,
        attn_kind: Literal["dense", "vsa", "sparse_fp4"] = "dense",
        force_dense: bool = False,
    ) -> torch.Tensor:
        """Run the transformer to predict noise for ``noisy_latents``.

        Args:
            conditional: use ``batch.conditional_dict`` (raises if absent)
                or the unconditional/CFG text dict.
            attn_kind: selects which attention metadata is installed in the
                forward context; "vsa" and "sparse_fp4" both use the sparse
                metadata copy. (Annotation widened to include "sparse_fp4",
                which the body already accepts.)
            force_dense: forwarded to the forward context.

        Returns:
            Transformer output permuted to (B, C, T, H, W) layout —
            presumably from (B, T, C, H, W); confirm against the
            transformer's output convention.
        """
        device_type = self.device.type
        dtype = noisy_latents.dtype
        if conditional:
            text_dict = batch.conditional_dict
            if text_dict is None:
                raise RuntimeError("Missing conditional_dict in "
                                   "TrainingBatch")
        else:
            text_dict = self._get_uncond_text_dict(batch, cfg_uncond=cfg_uncond)

        if attn_kind == "dense":
            attn_metadata = batch.attn_metadata
        elif attn_kind in ("vsa", "sparse_fp4"):
            attn_metadata = batch.attn_metadata_vsa
        else:
            raise ValueError(f"Unknown attn_kind: {attn_kind!r}")

        with torch.autocast(device_type, dtype=dtype), set_forward_context(
                current_timestep=batch.timesteps,
                attn_metadata=attn_metadata,
                force_dense=force_dense,
        ):
            input_kwargs = (self._build_distill_input_kwargs(noisy_latents, timestep, text_dict))
            transformer = self._get_transformer(timestep)
            pred_noise = transformer(**input_kwargs).permute(0, 2, 1, 3, 4)
        return pred_noise
312
+
313
+ def backward(
314
+ self,
315
+ loss: torch.Tensor,
316
+ ctx: Any,
317
+ *,
318
+ grad_accum_rounds: int,
319
+ ) -> None:
320
+ timesteps, attn_metadata = ctx
321
+ with set_forward_context(
322
+ current_timestep=timesteps,
323
+ attn_metadata=attn_metadata,
324
+ ):
325
+ (loss / max(1, int(grad_accum_rounds))).backward()
326
+
327
+ # ------------------------------------------------------------------
328
+ # Internal helpers
329
+ # ------------------------------------------------------------------
330
+
331
    def _get_training_dtype(self) -> torch.dtype:
        """Compute dtype used throughout training (bfloat16)."""
        return torch.bfloat16
333
+
334
+ def _init_timestep_mechanics(self) -> None:
335
+ assert self.training_config is not None
336
+ tc = self.training_config
337
+ flow_shift = tc.pipeline_config.flow_shift
338
+ self.timestep_shift = float(0.0 if flow_shift is None else flow_shift)
339
+ self.num_train_timestep = int(self.noise_scheduler.num_train_timesteps)
340
+ # min/max timestep ratios now come from method_config;
341
+ # default to full range.
342
+ self.min_timestep = 0
343
+ self.max_timestep = self.num_train_timestep
344
+
345
    def ensure_negative_conditioning(self) -> None:
        """Lazily compute and cache the negative-prompt embeddings.

        Rank 0 of the world group encodes the negative prompt through a
        throwaway WanPipeline's text-encoding stage; the resulting tensors
        are then broadcast to all ranks in three phases (ndim, shape,
        data) so every rank ends up with identical
        ``negative_prompt_embeds`` / ``negative_prompt_attention_mask``.
        Idempotent: returns immediately once populated.
        """
        if self.negative_prompt_embeds is not None:
            return

        assert self.training_config is not None
        tc = self.training_config
        world_group = self.world_group
        device = self.device
        dtype = self._get_training_dtype()

        from fastvideo.train.utils.moduleloader import (
            make_inference_args, )

        neg_embeds: torch.Tensor | None = None
        neg_mask: torch.Tensor | None = None

        if world_group.rank_in_group == 0:
            sampling_param = SamplingParam.from_pretrained(tc.model_path)
            negative_prompt = sampling_param.negative_prompt

            inference_args = make_inference_args(tc, model_path=tc.model_path)

            # Reuse the already-loaded transformer so only the text
            # encoder/tokenizer need to be materialized here.
            prompt_pipeline = WanPipeline.from_pretrained(
                tc.model_path,
                args=inference_args,
                inference_mode=True,
                loaded_modules={"transformer": self.transformer},
                tp_size=tc.distributed.tp_size,
                sp_size=tc.distributed.sp_size,
                num_gpus=tc.distributed.num_gpus,
                pin_cpu_memory=(tc.distributed.pin_cpu_memory),
                dit_cpu_offload=True,
            )

            batch_negative = ForwardBatch(
                data_type="video",
                prompt=negative_prompt,
                prompt_embeds=[],
                prompt_attention_mask=[],
            )
            result_batch = prompt_pipeline.prompt_encoding_stage(  # type: ignore[attr-defined]
                batch_negative,
                inference_args,
            )

            neg_embeds = result_batch.prompt_embeds[0].to(device=device, dtype=dtype)
            neg_mask = (result_batch.prompt_attention_mask[0].to(device=device, dtype=dtype))

            # Free the temporary pipeline before the broadcasts.
            del prompt_pipeline
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Phase 1: broadcast tensor ranks so other ranks can size buffers.
        meta = torch.zeros((2, ), device=device, dtype=torch.int64)
        if world_group.rank_in_group == 0:
            assert neg_embeds is not None
            assert neg_mask is not None
            meta[0] = neg_embeds.ndim
            meta[1] = neg_mask.ndim
        world_group.broadcast(meta, src=0)
        embed_ndim, mask_ndim = (
            int(meta[0].item()),
            int(meta[1].item()),
        )

        # Phase 2: broadcast shapes (padded to a fixed max rank of 8).
        max_ndim = 8
        embed_shape = torch.full((max_ndim, ), -1, device=device, dtype=torch.int64)
        mask_shape = torch.full((max_ndim, ), -1, device=device, dtype=torch.int64)
        if world_group.rank_in_group == 0:
            assert neg_embeds is not None
            assert neg_mask is not None
            embed_shape[:embed_ndim] = torch.tensor(
                list(neg_embeds.shape),
                device=device,
                dtype=torch.int64,
            )
            mask_shape[:mask_ndim] = torch.tensor(
                list(neg_mask.shape),
                device=device,
                dtype=torch.int64,
            )
        world_group.broadcast(embed_shape, src=0)
        world_group.broadcast(mask_shape, src=0)

        embed_sizes = tuple(int(x) for x in embed_shape[:embed_ndim].tolist())
        mask_sizes = tuple(int(x) for x in mask_shape[:mask_ndim].tolist())

        # Phase 3: non-root ranks allocate receive buffers, then broadcast
        # the actual tensor payloads.
        if world_group.rank_in_group != 0:
            neg_embeds = torch.empty(embed_sizes, device=device, dtype=dtype)
            neg_mask = torch.empty(mask_sizes, device=device, dtype=dtype)
        assert neg_embeds is not None
        assert neg_mask is not None

        world_group.broadcast(neg_embeds, src=0)
        world_group.broadcast(neg_mask, src=0)

        self.negative_prompt_embeds = neg_embeds
        self.negative_prompt_attention_mask = neg_mask
443
+
444
    def _sample_timesteps(
        self,
        batch_size: int,
        device: torch.device,
        generator: torch.Generator,
    ) -> torch.Tensor:
        """Sample one training timestep per batch element.

        Draws u ~ density (per the configured weighting scheme), maps it to
        a scheduler index, and returns the corresponding scheduler
        timesteps on ``device``.
        """
        assert self.training_config is not None
        tc = self.training_config

        u = compute_density_for_timestep_sampling(
            weighting_scheme=tc.model.weighting_scheme,
            batch_size=batch_size,
            generator=generator,
            device=device,
            logit_mean=tc.model.logit_mean,
            logit_std=tc.model.logit_std,
            mode_scale=tc.model.mode_scale,
        )
        indices = (u * self.noise_scheduler.config.num_train_timesteps).long()
        # Scheduler timesteps live on CPU; index there, then move to device.
        return self.noise_scheduler.timesteps[indices.cpu()].to(device=device)
464
+
465
    def _build_attention_metadata(self, training_batch: TrainingBatch) -> TrainingBatch:
        """Attach attention metadata matching FASTVIDEO_ATTENTION_BACKEND.

        VSA-family backends (VIDEO_SPARSE_ATTN / SPARSE_FP4_ATTN /
        SPARSE_FP4_OURS_P_ATTN) all use the VideoSparseAttention builder;
        VMOBA_ATTN uses the VMoBA builder; any other backend leaves
        ``attn_metadata`` as None (dense attention). Raises ImportError when
        the selected backend's kernels are not installed.
        """
        assert self.training_config is not None
        tc = self.training_config
        latents_shape = training_batch.raw_latent_shape
        patch_size = (
            tc.pipeline_config.dit_config.patch_size  # type: ignore[union-attr]
        )
        assert latents_shape is not None
        assert training_batch.timesteps is not None

        if envs.FASTVIDEO_ATTENTION_BACKEND in (
                "VIDEO_SPARSE_ATTN", "SPARSE_FP4_ATTN", "SPARSE_FP4_OURS_P_ATTN",
        ):
            if (not is_vsa_available() or VideoSparseAttentionMetadataBuilder is None):
                raise ImportError(
                    f"FASTVIDEO_ATTENTION_BACKEND is "
                    f"{envs.FASTVIDEO_ATTENTION_BACKEND}, but "
                    f"fastvideo_kernel is not correctly "
                    f"installed or detected.")
            # latents_shape[2:5] is the (T, H, W) latent grid.
            training_batch.attn_metadata = VideoSparseAttentionMetadataBuilder().build(  # type: ignore[misc]
                raw_latent_shape=latents_shape[2:5],
                current_timestep=(training_batch.timesteps),
                patch_size=patch_size,
                VSA_sparsity=tc.vsa_sparsity,
                device=self.device,
            )
        elif (envs.FASTVIDEO_ATTENTION_BACKEND == "VMOBA_ATTN"):
            if (not is_vmoba_available() or VideoMobaAttentionMetadataBuilder is None):
                raise ImportError("FASTVIDEO_ATTENTION_BACKEND is "
                                  "VMOBA_ATTN, but fastvideo_kernel "
                                  "(or flash_attn>=2.7.4) is not "
                                  "correctly installed.")
            # Copy so the shared config dict is not mutated in place.
            moba_params = tc.model.moba_config.copy()
            assert training_batch.raw_latent_shape is not None
            moba_params.update({
                "current_timestep": (training_batch.timesteps),
                "raw_latent_shape": (training_batch.raw_latent_shape[2:5]),
                "patch_size": patch_size,
                "device": self.device,
            })
            training_batch.attn_metadata = VideoMobaAttentionMetadataBuilder().build(**
                                                                                     moba_params)  # type: ignore[misc]
        else:
            training_batch.attn_metadata = None

        return training_batch
511
+
512
    def _prepare_dit_inputs(
        self,
        training_batch: TrainingBatch,
        generator: torch.Generator,
    ) -> TrainingBatch:
        """Build the noisy DiT inputs and conditioning dicts for one step.

        Samples noise and timesteps, mixes latents/noise with the flow-match
        sigmas, and fills the conditional (and, when negative embeddings are
        cached, unconditional) text dicts on ``training_batch``.
        """
        assert self.training_config is not None
        tc = self.training_config
        latents = training_batch.latents
        assert isinstance(latents, torch.Tensor)
        batch_size = latents.shape[0]

        noise = torch.randn(
            latents.shape,
            generator=generator,
            device=latents.device,
            dtype=latents.dtype,
        )
        timesteps = self._sample_timesteps(
            batch_size,
            latents.device,
            generator,
        )
        # Keep timesteps identical across the sequence-parallel group.
        # NOTE(review): only timesteps are broadcast here; the sibling
        # trainer pipeline also broadcasts ``noise`` — confirm whether the
        # asymmetry is intentional.
        if int(tc.distributed.sp_size or 1) > 1:
            self.sp_group.broadcast(timesteps, src=0)

        sigmas = get_sigmas(
            self.noise_scheduler,
            latents.device,
            timesteps,
            n_dim=latents.ndim,
            dtype=latents.dtype,
        )
        # Flow-matching interpolation between clean latents and noise.
        noisy_model_input = ((1.0 - sigmas) * latents + sigmas * noise)

        training_batch.noisy_model_input = (noisy_model_input)
        training_batch.timesteps = timesteps
        training_batch.sigmas = sigmas
        training_batch.noise = noise
        training_batch.raw_latent_shape = latents.shape

        training_batch.conditional_dict = {
            "encoder_hidden_states": (training_batch.encoder_hidden_states),
            "encoder_attention_mask": (training_batch.encoder_attention_mask),
        }

        # Broadcast-expand cached (batch 1) negative embeddings to the
        # current batch size for CFG-style unconditional passes.
        if (self.negative_prompt_embeds is not None and self.negative_prompt_attention_mask is not None):
            neg_embeds = self.negative_prompt_embeds
            neg_mask = (self.negative_prompt_attention_mask)
            if (neg_embeds.shape[0] == 1 and batch_size > 1):
                neg_embeds = neg_embeds.expand(batch_size, *neg_embeds.shape[1:]).contiguous()
            if (neg_mask.shape[0] == 1 and batch_size > 1):
                neg_mask = neg_mask.expand(batch_size, *neg_mask.shape[1:]).contiguous()
            training_batch.unconditional_dict = {
                "encoder_hidden_states": neg_embeds,
                "encoder_attention_mask": neg_mask,
            }

        # Swap axes 1 and 2 of the stored latents (layout expected by the
        # transformer; see _build_distill_input_kwargs, which applies the
        # same permute to the noisy input).
        training_batch.latents = (training_batch.latents.permute(0, 2, 1, 3, 4))
        return training_batch
571
+
572
+ def _build_distill_input_kwargs(
573
+ self,
574
+ noise_input: torch.Tensor,
575
+ timestep: torch.Tensor,
576
+ text_dict: dict[str, torch.Tensor] | None,
577
+ ) -> dict[str, Any]:
578
+ if text_dict is None:
579
+ raise ValueError("text_dict cannot be None for "
580
+ "Wan distillation")
581
+ return {
582
+ "hidden_states": noise_input.permute(0, 2, 1, 3, 4),
583
+ "encoder_hidden_states": text_dict["encoder_hidden_states"],
584
+ "encoder_attention_mask": text_dict["encoder_attention_mask"],
585
+ "timestep": timestep,
586
+ "return_dict": False,
587
+ }
588
+
589
+ def _get_transformer(self, timestep: torch.Tensor) -> torch.nn.Module:
590
+ return self.transformer
591
+
592
    def _get_uncond_text_dict(
        self,
        batch: TrainingBatch,
        *,
        cfg_uncond: dict[str, Any] | None,
    ) -> dict[str, torch.Tensor]:
        """Resolve the unconditional text dict for CFG from ``cfg_uncond``.

        With ``cfg_uncond is None`` the cached negative-prompt conditioning
        is used. Otherwise the config is validated: ``on_missing`` controls
        whether unsupported channels raise or are ignored, and ``text``
        selects between {negative_prompt, keep, zero} (``drop`` is rejected).

        Raises:
            RuntimeError: if the required conditioning dict is missing.
            ValueError: on malformed or unsupported config entries.
            TypeError: if text-policy "zero" finds non-tensor conditioning.
        """
        # Fast path: no per-method CFG config -> cached negative prompt.
        if cfg_uncond is None:
            text_dict = getattr(batch, "unconditional_dict", None)
            if text_dict is None:
                raise RuntimeError("Missing unconditional_dict; "
                                   "ensure_negative_conditioning() "
                                   "may have failed")
            return text_dict

        on_missing_raw = cfg_uncond.get("on_missing", "error")
        if not isinstance(on_missing_raw, str):
            raise ValueError("method_config.cfg_uncond.on_missing "
                             "must be a string, got "
                             f"{type(on_missing_raw).__name__}")
        on_missing = on_missing_raw.strip().lower()
        if on_missing not in {"error", "ignore"}:
            raise ValueError("method_config.cfg_uncond.on_missing "
                             "must be one of {error, ignore}, got "
                             f"{on_missing_raw!r}")

        # Validate every non-text channel: only "keep" (or None) is
        # supported by this model; anything else is an error unless
        # on_missing=ignore.
        for channel, policy_raw in cfg_uncond.items():
            if channel in {"on_missing", "text"}:
                continue
            if policy_raw is None:
                continue
            if not isinstance(policy_raw, str):
                raise ValueError("method_config.cfg_uncond values "
                                 "must be strings, got "
                                 f"{channel}="
                                 f"{type(policy_raw).__name__}")
            policy = policy_raw.strip().lower()
            if policy == "keep":
                continue
            if on_missing == "ignore":
                continue
            raise ValueError("WanModel does not support "
                             "cfg_uncond channel "
                             f"{channel!r} (policy={policy!r}). "
                             "Set cfg_uncond.on_missing=ignore or "
                             "remove the channel.")

        # Resolve the text policy; default is the cached negative prompt.
        text_policy_raw = cfg_uncond.get("text", None)
        if text_policy_raw is None:
            text_policy = "negative_prompt"
        elif not isinstance(text_policy_raw, str):
            raise ValueError("method_config.cfg_uncond.text must be "
                             "a string, got "
                             f"{type(text_policy_raw).__name__}")
        else:
            text_policy = (text_policy_raw.strip().lower())

        if text_policy in {"negative_prompt"}:
            text_dict = getattr(batch, "unconditional_dict", None)
            if text_dict is None:
                raise RuntimeError("Missing unconditional_dict; "
                                   "ensure_negative_conditioning() "
                                   "may have failed")
            return text_dict
        if text_policy == "keep":
            # "keep": reuse the conditional text as the unconditional input.
            if batch.conditional_dict is None:
                raise RuntimeError("Missing conditional_dict in "
                                   "TrainingBatch")
            return batch.conditional_dict
        if text_policy == "zero":
            # "zero": zero out the conditional text tensors.
            if batch.conditional_dict is None:
                raise RuntimeError("Missing conditional_dict in "
                                   "TrainingBatch")
            cond = batch.conditional_dict
            enc = cond["encoder_hidden_states"]
            mask = cond["encoder_attention_mask"]
            if not torch.is_tensor(enc) or not torch.is_tensor(mask):
                raise TypeError("conditional_dict must contain "
                                "tensor text inputs")
            return {
                "encoder_hidden_states": (torch.zeros_like(enc)),
                "encoder_attention_mask": (torch.zeros_like(mask)),
            }
        if text_policy == "drop":
            raise ValueError("cfg_uncond.text=drop is not supported "
                             "for Wan. Use "
                             "{negative_prompt, keep, zero}.")
        raise ValueError("cfg_uncond.text must be one of "
                         "{negative_prompt, keep, zero, drop}, got "
                         f"{text_policy_raw!r}")
standalone_inference/overlay_files/fastvideo/training/training_pipeline.py ADDED
@@ -0,0 +1,1044 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ from dataclasses import asdict
3
+ from contextlib import AbstractContextManager, nullcontext
4
+ import math
5
+ import os
6
+ import shutil
7
+ import tempfile
8
+ import time
9
+ from abc import ABC, abstractmethod
10
+ from collections import deque
11
+ from collections.abc import Iterator
12
+ from typing import Any
13
+ from fastvideo.profiler import profile_region
14
+ import imageio
15
+ import numpy as np
16
+ import torch
17
+ import torch.distributed as dist
18
+ import torchvision
19
+ from einops import rearrange
20
+ from torch.utils.data import DataLoader
21
+ from torchdata.stateful_dataloader import StatefulDataLoader
22
+ from tqdm.auto import tqdm
23
+ from diffusers import FlowMatchEulerDiscreteScheduler
24
+
25
+ import fastvideo.envs as envs
26
+ try:
27
+ from fastvideo.attention.backends.video_sparse_attn import (VideoSparseAttentionMetadataBuilder)
28
+ from fastvideo.attention.backends.vmoba import VideoMobaAttentionMetadataBuilder
29
+ except Exception:
30
+ pass
31
+ from fastvideo.configs.sample import SamplingParam
32
+ from fastvideo.dataset import build_parquet_map_style_dataloader
33
+ from fastvideo.dataset.dataloader.schema import pyarrow_schema_t2v
34
+ from fastvideo.dataset.validation_dataset import ValidationDataset
35
+ from fastvideo.distributed import (cleanup_dist_env_and_memory, get_local_torch_device, get_sp_group, get_world_group)
36
+ from fastvideo.fastvideo_args import FastVideoArgs, TrainingArgs
37
+ from fastvideo.forward_context import set_forward_context
38
+ from fastvideo.logger import init_logger
39
+ from fastvideo.attention.selector import global_force_attn_backend_context_manager
40
+ from fastvideo.pipelines import (ComposedPipelineBase, ForwardBatch, LoRAPipeline, TrainingBatch)
41
+ from fastvideo.platforms import AttentionBackendEnum, current_platform
42
+ from fastvideo.training.activation_checkpoint import (apply_activation_checkpointing)
43
+ from fastvideo.training.trackers import (DummyTracker, TrackerType, initialize_trackers, Trackers)
44
+ from fastvideo.training.training_utils import (clip_grad_norm_while_handling_failing_dtensor_cases,
45
+ compute_density_for_timestep_sampling, count_trainable, get_scheduler,
46
+ get_sigmas, load_checkpoint, normalize_dit_input, save_checkpoint,
47
+ swap_fp4_linear, traverse_swap_module)
48
+ from fastvideo.utils import (is_vmoba_available, is_vsa_available, set_random_seed, shallow_asdict)
49
+
50
+ try:
51
+ vsa_available = is_vsa_available()
52
+ vmoba_available = is_vmoba_available()
53
+ except Exception:
54
+ vsa_available = False
55
+ vmoba_available = False
56
+
57
+ logger = init_logger(__name__)
58
+
59
+
60
+ class TrainingPipeline(LoRAPipeline, ABC):
61
+ """
62
+ A pipeline for training a model. All training pipelines should inherit from this class.
63
+ All reusable components and code should be implemented in this class.
64
+ """
65
+ _required_config_modules = ["scheduler", "transformer"]
66
+ validation_pipeline: ComposedPipelineBase
67
+ train_dataloader: StatefulDataLoader
68
+ train_loader_iter: Iterator[dict[str, Any]]
69
+ current_epoch: int = 0
70
+ train_transformer_2: bool = False
71
+ tracker: TrackerType
72
+
73
+ def __init__(self,
74
+ model_path: str,
75
+ fastvideo_args: TrainingArgs,
76
+ required_config_modules: list[str] | None = None,
77
+ loaded_modules: dict[str, torch.nn.Module] | None = None) -> None:
78
+ fastvideo_args.inference_mode = False
79
+ self.lora_training = fastvideo_args.lora_training
80
+ if self.lora_training and fastvideo_args.lora_rank is None:
81
+ raise ValueError("lora rank must be set when using lora training")
82
+
83
+ set_random_seed(fastvideo_args.seed) # for lora param init
84
+ super().__init__(model_path, fastvideo_args, required_config_modules, loaded_modules) # type: ignore
85
+ self.tracker = DummyTracker()
86
+
87
+ def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
88
+ raise RuntimeError("create_pipeline_stages should not be called for training pipeline")
89
+
90
+ @staticmethod
91
+ def _should_force_generator_attn_qat_train(fastvideo_args: FastVideoArgs) -> bool:
92
+ if not isinstance(fastvideo_args, TrainingArgs):
93
+ return False
94
+ return (fastvideo_args.generator_4bit_attn or envs.FASTVIDEO_ATTENTION_BACKEND == "ATTN_QAT_TRAIN")
95
+
96
+ def load_modules(self,
97
+ fastvideo_args: FastVideoArgs,
98
+ loaded_modules: dict[str, torch.nn.Module] | None = None) -> dict[str, Any]:
99
+ force_generator_qat = self._should_force_generator_attn_qat_train(fastvideo_args)
100
+ load_context: AbstractContextManager[None] = nullcontext()
101
+ if force_generator_qat:
102
+ logger.info("Forcing generator attention backend to ATTN_QAT_TRAIN during module loading")
103
+ load_context = global_force_attn_backend_context_manager(AttentionBackendEnum.ATTN_QAT_TRAIN)
104
+
105
+ with load_context:
106
+ return super().load_modules(fastvideo_args, loaded_modules)
107
+
108
    def set_schemas(self) -> None:
        """Select the parquet schema for the training dataloader.

        The base pipeline uses the text-to-video schema; override in
        subclasses to train on a different data layout.
        """
        self.train_dataset_schema = pyarrow_schema_t2v
110
+
111
    def initialize_training_pipeline(self, training_args: TrainingArgs):
        """One-time setup for training: distributed state, models,
        optimizers, LR schedulers, dataloader, scheduler, and trackers.

        Must be called before the first training step. Mutates many
        attributes on ``self`` (device, ranks, transformer(s), optimizer(s),
        dataloader, tracker, ...).
        """
        logger.info("Initializing training pipeline...")
        self.device = get_local_torch_device()
        self.training_args = training_args
        world_group = get_world_group()
        self.world_size = world_group.world_size
        self.global_rank = world_group.rank
        self.sp_group = get_sp_group()
        self.rank_in_sp_group = self.sp_group.rank_in_group
        self.sp_world_size = self.sp_group.world_size
        self.local_rank = world_group.local_rank
        self.transformer = self.get_module("transformer")
        # Optional second transformer (MoE-style boundary training).
        self.transformer_2 = self.get_module("transformer_2", None)
        self.seed = training_args.seed
        self.set_schemas()

        # Set random seeds for deterministic training; offset by rank so
        # each process draws independent noise.
        assert self.seed is not None, "seed must be set"
        set_random_seed(self.seed + self.global_rank)
        self.transformer.train()
        if training_args.enable_gradient_checkpointing_type is not None:
            self.transformer = apply_activation_checkpointing(
                self.transformer, checkpointing_type=training_args.enable_gradient_checkpointing_type)
            if self.transformer_2 is not None:
                self.transformer_2 = apply_activation_checkpointing(
                    self.transformer_2, checkpointing_type=training_args.enable_gradient_checkpointing_type)

        # Optionally swap nn.Linear layers for the FP4 quantized forward path.
        if training_args.generator_4bit_linear:
            num_swaps = traverse_swap_module(self.transformer, swap_fn=swap_fp4_linear)
            logger.info("Swapped %s linear layers to the FP4 forward path in self.transformer", num_swaps)
        noise_scheduler = self.modules["scheduler"]
        self.set_trainable()
        params_to_optimize = self.transformer.parameters()
        params_to_optimize = list(filter(lambda p: p.requires_grad, params_to_optimize))
        # Parse betas from string format "beta1,beta2"
        betas_str = training_args.betas
        betas = tuple(float(x.strip()) for x in betas_str.split(","))

        self.optimizer = torch.optim.AdamW(
            params_to_optimize,
            lr=training_args.learning_rate,
            betas=betas,
            weight_decay=training_args.weight_decay,
            eps=1e-8,
        )

        self.init_steps = 0
        logger.info("optimizer: %s", self.optimizer)

        self.lr_scheduler = get_scheduler(
            training_args.lr_scheduler,
            optimizer=self.optimizer,
            num_warmup_steps=training_args.lr_warmup_steps,
            num_training_steps=training_args.max_train_steps,
            num_cycles=training_args.lr_num_cycles,
            power=training_args.lr_power,
            min_lr_ratio=training_args.min_lr_ratio,
            last_epoch=self.init_steps - 1,
        )
        if self.transformer_2 is not None:
            # Ensure transformer_2 has trainable parameters before creating optimizer
            # NOTE(review): optimizer_2 hard-codes betas=(0.9, 0.999) instead
            # of the parsed ``betas`` above — confirm this is intentional.
            params_to_optimize_2 = self.transformer_2.parameters()
            params_to_optimize_2 = list(filter(lambda p: p.requires_grad, params_to_optimize_2))
            self.optimizer_2 = torch.optim.AdamW(
                params_to_optimize_2,
                lr=training_args.learning_rate,
                betas=(0.9, 0.999),
                weight_decay=training_args.weight_decay,
                eps=1e-8,
            )
            self.lr_scheduler_2 = get_scheduler(
                training_args.lr_scheduler,
                optimizer=self.optimizer_2,
                num_warmup_steps=training_args.lr_warmup_steps,
                num_training_steps=training_args.max_train_steps,
                num_cycles=training_args.lr_num_cycles,
                power=training_args.lr_power,
                min_lr_ratio=training_args.min_lr_ratio,
                last_epoch=self.init_steps - 1,
            )

        self.train_dataset, self.train_dataloader = build_parquet_map_style_dataloader(
            training_args.data_path,
            training_args.train_batch_size,
            parquet_schema=self.train_dataset_schema,
            num_data_workers=training_args.dataloader_num_workers,
            cfg_rate=training_args.training_cfg_rate,
            drop_last=True,
            text_padding_length=training_args.pipeline_config.text_encoder_configs[0].arch_config.
            text_len,  # type: ignore[attr-defined]
            seed=self.seed)

        self.noise_scheduler = noise_scheduler
        # Boundary timestep splits the denoising range between the two
        # transformers; None disables the split.
        if self.training_args.boundary_ratio is not None:
            self.boundary_timestep = self.training_args.boundary_ratio * self.noise_scheduler.num_train_timesteps
        else:
            self.boundary_timestep = None

        logger.info("train_dataloader length: %s", len(self.train_dataloader))
        logger.info("train_sp_batch_size: %s", training_args.train_sp_batch_size)
        logger.info("gradient_accumulation_steps: %s", training_args.gradient_accumulation_steps)
        logger.info("sp_size: %s", training_args.sp_size)

        self.num_update_steps_per_epoch = math.ceil(
            len(self.train_dataloader) / training_args.gradient_accumulation_steps * training_args.sp_size /
            training_args.train_sp_batch_size)
        self.num_train_epochs = math.ceil(training_args.max_train_steps / self.num_update_steps_per_epoch)

        # TODO(will): is there a cleaner way to track epochs?
        self.current_epoch = 0

        # Trackers run only on the global rank-0 process.
        trackers = list(training_args.trackers)
        if not trackers and training_args.tracker_project_name:
            trackers.append(Trackers.WANDB.value)
        if self.global_rank != 0:
            trackers = []

        tracker_log_dir = training_args.output_dir or os.getcwd()
        if trackers:
            tracker_log_dir = os.path.join(tracker_log_dir, "tracker")

        tracker_config = asdict(training_args) if trackers else None
        tracker_run_name = training_args.wandb_run_name or None
        project = training_args.tracker_project_name or "fastvideo"
        self.tracker = initialize_trackers(
            trackers,
            experiment_name=project,
            config=tracker_config,
            log_dir=tracker_log_dir,
            run_name=tracker_run_name,
        )
242
+
243
    @abstractmethod
    def initialize_validation_pipeline(self, training_args: TrainingArgs):
        """Build the pipeline used for validation runs.

        Abstract: concrete training pipelines must implement this.
        """
        raise NotImplementedError("Training pipelines must implement this method")
246
+
247
+ def _prepare_training(self, training_batch: TrainingBatch) -> TrainingBatch:
248
+ self.optimizer.zero_grad()
249
+ if self.transformer_2 is not None:
250
+ self.optimizer_2.zero_grad()
251
+ training_batch.total_loss = 0.0
252
+ return training_batch
253
+
254
    def _get_next_batch(self, training_batch: TrainingBatch) -> TrainingBatch:
        """Pull the next dataloader batch onto the local device (bf16).

        When the iterator is exhausted, advances the epoch counter, resets
        the iterator, and takes the first batch of the new epoch. Latents
        are truncated to ``num_latent_t`` along dim 2.
        """
        with self.tracker.timed("timing/get_next_batch"):
            batch = next(self.train_loader_iter, None)  # type: ignore
            if batch is None:
                self.current_epoch += 1
                logger.info("Starting epoch %s", self.current_epoch)
                # Reset iterator for next epoch
                self.train_loader_iter = iter(self.train_dataloader)
                # Get first batch of new epoch
                batch = next(self.train_loader_iter)

            latents = batch['vae_latent']
            # Keep only the first num_latent_t latent frames.
            latents = latents[:, :, :self.training_args.num_latent_t]
            encoder_hidden_states = batch['text_embedding']
            encoder_attention_mask = batch['text_attention_mask']
            infos = batch['info_list']

            # non_blocking=True overlaps H2D copies with compute.
            training_batch.latents = latents.to(
                get_local_torch_device(),
                dtype=torch.bfloat16,
                non_blocking=True,
            )
            training_batch.encoder_hidden_states = (encoder_hidden_states.to(
                get_local_torch_device(),
                dtype=torch.bfloat16,
                non_blocking=True,
            ))
            training_batch.encoder_attention_mask = (encoder_attention_mask.to(
                get_local_torch_device(),
                dtype=torch.bfloat16,
                non_blocking=True,
            ))
            training_batch.infos = infos

        return training_batch
289
+
290
+ def _normalize_dit_input(self, training_batch: TrainingBatch) -> TrainingBatch:
291
+ # TODO(will): support other models
292
+ with self.tracker.timed("timing/normalize_input"):
293
+ training_batch.latents = normalize_dit_input(
294
+ 'wan',
295
+ training_batch.latents,
296
+ self.get_module("vae"),
297
+ )
298
+ return training_batch
299
+
300
    def _prepare_dit_inputs(self, training_batch: TrainingBatch) -> TrainingBatch:
        """Sample noise/timesteps and build the noisy model input.

        Uses the flow-matching interpolation
        ``(1 - sigma) * latents + sigma * noise`` and records noise,
        timesteps, sigmas, and the raw latent shape on the batch.
        """
        assert self.training_args is not None, "training_args must be set"
        with self.tracker.timed("timing/prepare_dit_inputs"):
            latents = training_batch.latents
            batch_size = latents.shape[0]
            noise = torch.randn(latents.shape,
                                generator=self.noise_gen_cuda,
                                device=latents.device,
                                dtype=latents.dtype)
            timesteps = self._sample_timesteps(batch_size, latents.device)

            if self.training_args.sp_size > 1:
                # Make sure that the timesteps are the same across all sp processes.
                sp_group = get_sp_group()
                sp_group.broadcast(timesteps, src=0)
                sp_group.broadcast(noise, src=0)
            sigmas = get_sigmas(
                self.noise_scheduler,
                latents.device,
                timesteps,
                n_dim=latents.ndim,
                dtype=latents.dtype,
            )
            # Flow-matching mix of clean latents and noise.
            noisy_model_input = (1.0 - sigmas) * training_batch.latents + sigmas * noise

            training_batch.noisy_model_input = noisy_model_input
            training_batch.timesteps = timesteps
            training_batch.sigmas = sigmas
            training_batch.noise = noise
            training_batch.raw_latent_shape = training_batch.latents.shape

        return training_batch
332
+
333
    def _sample_timesteps(self, batch_size: int, device: torch.device) -> torch.Tensor:
        """Sample training timesteps, choosing which transformer to train.

        When a second transformer and a boundary are configured, a coin flip
        (broadcast from rank 0) decides whether this step trains
        transformer_2; in that case the sampled density is remapped into the
        high-timestep band [1 - boundary_ratio, 1].
        """
        # Determine which model to train based on the boundary timestep.
        # NOTE(review): every rank draws this random number but only rank 0's
        # decision survives the broadcast below — the other draws are wasted
        # (and advance the generator); confirm this is intentional.
        if (self.transformer_2 is not None and self.boundary_timestep is not None
                and torch.rand(1, generator=self.noise_random_generator).item() <= self.training_args.boundary_ratio):
            self.train_transformer_2 = True
        else:
            self.train_transformer_2 = False

        # Broadcast the decision to all processes
        decision = torch.tensor(1.0 if self.train_transformer_2 else 0.0, device=self.device)
        dist.broadcast(decision, src=0)
        self.train_transformer_2 = decision.item() == 1.0

        # Sample u from the appropriate range
        u = compute_density_for_timestep_sampling(
            weighting_scheme=self.training_args.weighting_scheme,
            batch_size=batch_size,
            generator=self.noise_random_generator,
            logit_mean=self.training_args.logit_mean,
            logit_std=self.training_args.logit_std,
            mode_scale=self.training_args.mode_scale,
        )

        # boundary_ratio is guaranteed non-None here when train_transformer_2
        # is True (it requires boundary_timestep, set only from a ratio).
        boundary_ratio = self.training_args.boundary_ratio
        if self.train_transformer_2:
            u = (1 - boundary_ratio) + u * boundary_ratio  # min: 1 - boundary_ratio, max: 1
        # elif self.transformer_2 is not None:
        #     u = u * (1 - boundary_ratio)  # min: 0, max: 1 - boundary_ratio
        # else:  # patch for now to align with non-MoE timestep logic
        #     pass

        indices = (u * self.noise_scheduler.config.num_train_timesteps).long()
        return self.noise_scheduler.timesteps[indices].to(device=device)
366
+
367
    def _build_attention_metadata(self, training_batch: TrainingBatch) -> TrainingBatch:
        """Attach backend-specific attention metadata to the batch.

        VSA-family backends get VideoSparseAttention metadata built with the
        batch's current sparsity; VMOBA gets VideoMoba metadata; any other
        backend leaves ``attn_metadata`` as None.

        Raises:
            ImportError: if the selected backend's kernel package is missing.
        """
        latents_shape = training_batch.raw_latent_shape
        patch_size = self.training_args.pipeline_config.dit_config.patch_size
        current_vsa_sparsity = training_batch.current_vsa_sparsity
        assert latents_shape is not None
        assert isinstance(patch_size, tuple), f"Expected tuple patch_size, got {patch_size!r}"
        assert training_batch.timesteps is not None
        # All three FP4/VSA variants share the VideoSparseAttention metadata.
        if envs.FASTVIDEO_ATTENTION_BACKEND in (
                "VIDEO_SPARSE_ATTN",
                "SPARSE_FP4_ATTN",
                "SPARSE_FP4_OURS_P_ATTN",
        ):
            if not vsa_available:
                raise ImportError("FASTVIDEO_ATTENTION_BACKEND is set to VIDEO_SPARSE_ATTN, "
                                  "but fastvideo_kernel is not correctly installed or detected. "
                                  "Please ensure fastvideo-kernel is installed.")
            # latents_shape[2:5] drops batch/channel dims, leaving (T, H, W).
            training_batch.attn_metadata = VideoSparseAttentionMetadataBuilder(  # type: ignore
            ).build(  # type: ignore
                raw_latent_shape=latents_shape[2:5],
                current_timestep=training_batch.timesteps,
                patch_size=patch_size,
                VSA_sparsity=current_vsa_sparsity,
                device=get_local_torch_device())
        elif envs.FASTVIDEO_ATTENTION_BACKEND == "VMOBA_ATTN":
            if not vmoba_available:
                raise ImportError("FASTVIDEO_ATTENTION_BACKEND is set to VMOBA_ATTN, "
                                  "but fastvideo_kernel (or flash_attn>=2.7.4) is not correctly installed.")
            # Copy so the per-step additions don't mutate the config dict.
            moba_params = self.training_args.moba_config.copy()
            moba_params.update({
                "current_timestep": training_batch.timesteps,
                "raw_latent_shape": latents_shape[2:5],
                "patch_size": self.training_args.pipeline_config.dit_config.patch_size,
                "device": get_local_torch_device(),
            })
            training_batch.attn_metadata = VideoMobaAttentionMetadataBuilder().build(**moba_params)
        else:
            # Dense attention backends need no extra metadata.
            training_batch.attn_metadata = None

        return training_batch
406
+
407
+ def _build_input_kwargs(self, training_batch: TrainingBatch) -> TrainingBatch:
408
+ training_batch.input_kwargs = {
409
+ "hidden_states": training_batch.noisy_model_input,
410
+ "encoder_hidden_states": training_batch.encoder_hidden_states,
411
+ "timestep": training_batch.timesteps.to(get_local_torch_device(), dtype=torch.bfloat16),
412
+ "encoder_attention_mask": training_batch.encoder_attention_mask,
413
+ "return_dict": False,
414
+ }
415
+ return training_batch
416
+
417
    def _transformer_forward_and_compute_loss(self, training_batch: TrainingBatch) -> TrainingBatch:
        """Run the forward pass, compute the MSE flow loss, and backprop.

        The loss is scaled by 1/gradient_accumulation_steps, averaged across
        ranks, and accumulated into ``training_batch.total_loss`` (kept on
        GPU to avoid a CPU sync per micro-step).
        """
        # Sanity-check that metadata presence matches the active backend.
        if vsa_available and envs.FASTVIDEO_ATTENTION_BACKEND in (
                "VIDEO_SPARSE_ATTN",
                "SPARSE_FP4_ATTN",
                "SPARSE_FP4_OURS_P_ATTN",
        ) or vmoba_available and envs.FASTVIDEO_ATTENTION_BACKEND == "VMOBA_ATTN":
            assert training_batch.attn_metadata is not None
        else:
            assert training_batch.attn_metadata is None
        input_kwargs = training_batch.input_kwargs

        # if 'hunyuan' in self.training_args.model_type:
        #     input_kwargs["guidance"] = torch.tensor(
        #         [1000.0],
        #         device=training_batch.noisy_model_input.device,
        #         dtype=torch.bfloat16)
        # Route to the second transformer when boundary training selected it.
        current_model = self.transformer_2 if self.train_transformer_2 else self.transformer

        with self.tracker.timed("timing/forward_backward"), set_forward_context(
                current_timestep=training_batch.current_timestep, attn_metadata=training_batch.attn_metadata):
            model_pred = current_model(**input_kwargs)
            if self.training_args.precondition_outputs:
                assert training_batch.sigmas is not None
                # Preconditioning: model predicts velocity; convert to x0.
                model_pred = training_batch.noisy_model_input - model_pred * training_batch.sigmas
            assert training_batch.latents is not None
            assert training_batch.noise is not None
            # Target is x0 when preconditioned, otherwise the flow velocity.
            target = training_batch.latents if self.training_args.precondition_outputs else training_batch.noise - training_batch.latents

            # make sure no implicit broadcasting happens
            assert model_pred.shape == target.shape, f"model_pred.shape: {model_pred.shape}, target.shape: {target.shape}"

            loss = (torch.mean(
                (model_pred.float() - target.float())**2) / self.training_args.gradient_accumulation_steps)

            loss.backward()

        avg_loss = loss.detach().clone()

        # Reduce across ranks without forcing a CPU sync
        with self.tracker.timed("timing/reduce_loss"):
            world_group = get_world_group()
            avg_loss = world_group.all_reduce(avg_loss, op=dist.ReduceOp.AVG)
        # Accumulate on GPU; materialize to CPU only once after
        # all gradient-accumulation iterations (see train_one_step).
        training_batch.total_loss += avg_loss

        return training_batch
464
+
465
+ def _clip_grad_norm(self, training_batch: TrainingBatch) -> TrainingBatch:
466
+ max_grad_norm = self.training_args.max_grad_norm
467
+
468
+ # TODO(will): perhaps move this into transformer api so that we can do
469
+ # the following:
470
+ # grad_norm = transformer.clip_grad_norm_(max_grad_norm)
471
+ if max_grad_norm is not None:
472
+ with self.tracker.timed("timing/clip_grad_norm"):
473
+ # Only clip gradients for the model that is currently training
474
+ if self.train_transformer_2 and self.transformer_2 is not None:
475
+ model_parts = [self.transformer_2]
476
+ else:
477
+ model_parts = [self.transformer]
478
+
479
+ grad_norm = clip_grad_norm_while_handling_failing_dtensor_cases(
480
+ [p for m in model_parts for p in m.parameters()],
481
+ max_grad_norm,
482
+ foreach=None,
483
+ )
484
+ assert grad_norm is not float('nan') or grad_norm is not float('inf')
485
+ grad_norm = grad_norm.item() if grad_norm is not None else 0.0
486
+ else:
487
+ grad_norm = 0.0
488
+ training_batch.grad_norm = grad_norm
489
+ return training_batch
490
+
491
    @profile_region("profiler_region_training_train_one_step")
    def train_one_step(self, training_batch: TrainingBatch) -> TrainingBatch:
        """Run one optimizer step: accumulate gradients over
        ``gradient_accumulation_steps`` micro-batches, clip, and step the
        optimizer/scheduler of whichever transformer is being trained.
        """
        training_batch = self._prepare_training(training_batch)

        for _ in range(self.training_args.gradient_accumulation_steps):
            training_batch = self._get_next_batch(training_batch)

            # Normalize DIT input
            training_batch = self._normalize_dit_input(training_batch)
            # Create noisy model input
            training_batch = self._prepare_dit_inputs(training_batch)
            assert training_batch.latents is not None
            assert training_batch.noisy_model_input is not None
            assert training_batch.noise is not None

            # old sharding code, need to shard latents and noise but not input
            # Shard latents across sp groups
            training_batch.latents = training_batch.latents[:, :, :self.training_args.num_latent_t]
            # shard noisy_model_input to match
            training_batch.noisy_model_input = training_batch.noisy_model_input[:, :, :self.training_args.num_latent_t]
            # shard noise to match latents
            training_batch.noise = training_batch.noise[:, :, :self.training_args.num_latent_t]

            training_batch = self._build_attention_metadata(training_batch)
            training_batch = self._build_input_kwargs(training_batch)

            # Forward + backward; loss is accumulated on training_batch.
            training_batch = self._transformer_forward_and_compute_loss(training_batch)

        training_batch = self._clip_grad_norm(training_batch)

        # Only step the optimizer and scheduler for the model that is currently training
        with self.tracker.timed("timing/optimizer_step"):
            if self.train_transformer_2 and self.transformer_2 is not None:
                self.optimizer_2.step()
                self.lr_scheduler_2.step()
            else:
                self.optimizer.step()
                self.lr_scheduler.step()

        return training_batch
531
+
532
+ def _compute_current_sparsity(self, step: int) -> float:
533
+ """Compute the VSA sparsity for a given step using the decay schedule."""
534
+ vsa_sparsity = self.training_args.VSA_sparsity
535
+ vsa_decay_rate = self.training_args.VSA_decay_rate
536
+ vsa_decay_interval = self.training_args.VSA_decay_interval_steps
537
+ vsa_init = getattr(self.training_args, 'VSA_init_sparsity', 0.0)
538
+ vsa_warmup = getattr(self.training_args, 'VSA_warmup_steps', 0)
539
+ if step <= vsa_warmup:
540
+ return vsa_init
541
+ ramp_step = step - vsa_warmup
542
+ max_times = int((vsa_sparsity - vsa_init) / vsa_decay_rate) if vsa_decay_rate > 0 else 0
543
+ times = min(ramp_step // vsa_decay_interval, max_times)
544
+ return vsa_init + times * vsa_decay_rate
545
+
546
+ def _resolve_checkpoint_path(self, path: str) -> str | None:
547
+ """Resolve 'latest' to the most recent checkpoint in output_dir."""
548
+ import glob
549
+ if path == "latest":
550
+ output_dir = self.training_args.output_dir
551
+ ckpt_dirs = sorted(
552
+ glob.glob(os.path.join(output_dir, "checkpoint-*")),
553
+ key=lambda d: int(d.split("-")[-1]) if d.split("-")[-1].isdigit() else 0,
554
+ )
555
+ if ckpt_dirs:
556
+ latest = ckpt_dirs[-1]
557
+ logger.info("Auto-resolved 'latest' to %s", latest)
558
+ return latest
559
+ logger.info("No checkpoints found in %s, starting from scratch", output_dir)
560
+ return None
561
+ return path
562
+
563
def _resume_from_checkpoint(self) -> None:
    """Restore training state from ``training_args.resume_from_checkpoint``.

    Resolution order:
      1. Try a full resume (model + optimizer + dataloader + scheduler +
         RNG) via ``load_checkpoint``.
      2. If that fails but plain safetensors weights exist in the checkpoint
         directory, fast-forward ``init_steps`` to the checkpoint's step
         number (assumes the safetensors weights were applied elsewhere,
         e.g. at model init — TODO confirm).
      3. Otherwise start from step 0.
    """
    ckpt_path = self._resolve_checkpoint_path(self.training_args.resume_from_checkpoint)
    if ckpt_path is None:
        logger.info("No checkpoint to resume from, starting from step 0")
        return

    safetensors_path = os.path.join(ckpt_path, "transformer", "diffusion_pytorch_model.safetensors")
    # Checkpoint directories are named "checkpoint-<step>".
    step = int(os.path.basename(os.path.normpath(ckpt_path)).split('-')[-1])

    resumed_step = load_checkpoint(self.transformer, self.global_rank, ckpt_path,
                                   self.optimizer, self.train_dataloader,
                                   self.lr_scheduler, self.noise_random_generator)
    # resumed_step > 0 means the full state loaded; step == 0 means there is
    # nothing to fast-forward to anyway.
    if resumed_step > 0 or step == 0:
        self.init_steps = resumed_step
        logger.info("Successfully resumed full training state from step %s", resumed_step)
        return

    if os.path.exists(safetensors_path):
        self.init_steps = step
        logger.warning("Distributed checkpoint resume failed; falling back to safetensors weights at step %s",
                       step)
        return

    logger.warning("No usable checkpoint state found at %s; starting from step 0", ckpt_path)
    self.init_steps = 0
588
+
589
@profile_region("profiler_region_training_train")
def train(self) -> None:
    """Main training loop.

    Seeds RNGs, optionally resumes from a checkpoint, then iterates
    training steps with per-step VSA sparsity scheduling, periodic
    checkpointing, visualization, and validation, and saves a final
    checkpoint at the end.
    """
    assert self.seed is not None, "seed must be set"
    assert self.training_args is not None, "training_args must be set"
    set_random_seed(self.seed + self.global_rank)
    logger.info('rank: %s: start training', self.global_rank, local_main_process_only=False)
    if not self.post_init_called:
        self.post_init()
    num_trainable_params = count_trainable(self.transformer)
    logger.info("Starting training with %s B trainable parameters", round(num_trainable_params / 1e9, 3))

    if getattr(self, "transformer_2", None) is not None:
        num_trainable_params = count_trainable(self.transformer_2)
        logger.info("Transformer 2: Starting training with %s B trainable parameters",
                    round(num_trainable_params / 1e9, 3))

    # Set random seeds for deterministic training
    self.noise_random_generator = torch.Generator(device="cpu").manual_seed(self.seed + self.global_rank)
    self.noise_gen_cuda = torch.Generator(device=current_platform.device_name).manual_seed(self.seed +
                                                                                          self.global_rank)
    self.validation_random_generator = torch.Generator(device="cpu").manual_seed(self.seed + self.global_rank)
    logger.info("Initialized random seeds with seed: %s", self.seed + self.global_rank)
    self.noise_scheduler = FlowMatchEulerDiscreteScheduler()

    if self.training_args.resume_from_checkpoint:
        self._resume_from_checkpoint()

    self.train_loader_iter = iter(self.train_dataloader)

    step_times: deque[float] = deque(maxlen=100)

    self._log_training_info()

    # Validation at init uses the sparsity corresponding to init_steps
    saved_sparsity = self.training_args.VSA_sparsity
    self.training_args.VSA_sparsity = self._compute_current_sparsity(self.init_steps)
    self._log_validation(self.transformer, self.training_args, self.init_steps)
    self.training_args.VSA_sparsity = saved_sparsity

    # Train!
    progress_bar = tqdm(
        range(0, self.training_args.max_train_steps),
        initial=self.init_steps,
        desc="Steps",
        # Only show the progress bar once on each machine.
        disable=self.local_rank > 0,
    )
    for step in range(self.init_steps + 1, self.training_args.max_train_steps + 1):
        start_time = time.perf_counter()
        if vsa_available:
            # CONSOLIDATED: delegate to _compute_current_sparsity so the
            # warmup/decay schedule lives in one place. The previous inline
            # copy of this logic divided by VSA_decay_rate without the
            # helper's `rate > 0` guard (ZeroDivisionError for a zero rate).
            current_vsa_sparsity = self._compute_current_sparsity(step)
        elif vmoba_available:
            #TODO: add vmoba sparsity scheduling here
            current_vsa_sparsity = 0.0
        else:
            current_vsa_sparsity = 0.0

        training_batch = TrainingBatch()
        training_batch.current_timestep = step
        training_batch.current_vsa_sparsity = current_vsa_sparsity
        training_batch = self.train_one_step(training_batch)

        loss = float(training_batch.total_loss)
        grad_norm = training_batch.grad_norm

        step_time = time.perf_counter() - start_time
        step_times.append(step_time)
        avg_step_time = sum(step_times) / len(step_times)

        progress_bar.set_postfix({
            "loss": f"{loss:.4f}",
            "step_time": f"{step_time:.2f}s",
            "grad_norm": grad_norm,
        })
        progress_bar.update(1)
        if self.global_rank == 0:
            metrics = {
                "train_loss": loss,
                "learning_rate": self.lr_scheduler.get_last_lr()[0],
                "step_time": step_time,
                "avg_step_time": avg_step_time,
                "grad_norm": grad_norm,
                "vsa_sparsity": current_vsa_sparsity,
            }
            # Shape/arch metrics are best-effort: any missing attribute
            # simply drops them from this step's log.
            try:
                assert training_batch.raw_latent_shape is not None
                metrics["batch_size"] = int(training_batch.raw_latent_shape[0])

                patch_size = self.training_args.pipeline_config.dit_config.patch_size
                assert isinstance(patch_size, tuple), f"Expected tuple patch_size, got {patch_size!r}"
                patch_t, patch_h, patch_w = patch_size
                seq_len = (training_batch.raw_latent_shape[2] // patch_t) * (
                    training_batch.raw_latent_shape[3] // patch_h) * (training_batch.raw_latent_shape[4] // patch_w)
                if training_batch.encoder_hidden_states is not None:
                    context_len = int(training_batch.encoder_hidden_states.shape[1])
                else:
                    context_len = 0

                metrics["dit_seq_len"] = int(seq_len)
                metrics["context_len"] = context_len

                arch_config = self.training_args.pipeline_config.dit_config.arch_config

                metrics["hidden_dim"] = arch_config.hidden_size
                metrics["num_layers"] = arch_config.num_layers
                metrics["ffn_dim"] = arch_config.ffn_dim
            except Exception:
                pass

            self.tracker.log(metrics, step)
        if step % self.training_args.training_state_checkpointing_steps == 0:
            with self.profiler_controller.region("profiler_region_training_save_checkpoint"):
                save_checkpoint(self.transformer, self.global_rank, self.training_args.output_dir, step,
                                self.optimizer, self.train_dataloader, self.lr_scheduler,
                                self.noise_random_generator,
                                self.training_args.checkpoints_total_limit)
                self.transformer.train()
            self.sp_group.barrier()

        if self.training_args.log_visualization and step % self.training_args.visualization_steps == 0:
            self.visualize_intermediate_latents(training_batch, self.training_args, step)

        if self.training_args.log_validation and step % self.training_args.validation_steps == 0:
            with self.profiler_controller.region("profiler_region_training_validation"):
                # Validate at the sparsity currently in effect, then restore.
                saved_sparsity = self.training_args.VSA_sparsity
                self.training_args.VSA_sparsity = current_vsa_sparsity
                self._log_validation(self.transformer, self.training_args, step)
                self.training_args.VSA_sparsity = saved_sparsity
            gpu_memory_usage = current_platform.get_torch_device().memory_allocated() / 1024**2
            trainable_params = round(count_trainable(self.transformer) / 1e9, 3)
            logger.info("GPU memory usage after validation: %s MB, trainable params: %sB", gpu_memory_usage,
                        trainable_params)

    self.tracker.finish()
    # Final checkpoint at max_train_steps.
    save_checkpoint(self.transformer, self.global_rank, self.training_args.output_dir,
                    self.training_args.max_train_steps, self.optimizer, self.train_dataloader, self.lr_scheduler,
                    self.noise_random_generator, self.training_args.checkpoints_total_limit)

    if envs.FASTVIDEO_TORCH_PROFILER_DIR:
        logger.info("Stopping profiler...")
        self.profiler_controller.stop()
        logger.info("Profiler stopped.")

    if get_sp_group():
        cleanup_dist_env_and_memory()
745
+
746
def _log_training_info(self) -> None:
    """Log a one-time summary of the run configuration before training."""
    assert self.training_args is not None, "training_args must be set"
    args = self.training_args
    total_batch_size = (self.world_size * args.gradient_accumulation_steps /
                        args.sp_size * args.train_sp_batch_size)
    logger.info("***** Running training *****")
    logger.info(" Num examples = %s", len(self.train_dataset))
    logger.info(" Dataloader size = %s", len(self.train_dataloader))
    logger.info(" Num Epochs = %s", self.num_train_epochs)
    logger.info(" Resume training from step %s", self.init_steps)  # type: ignore
    logger.info(" Instantaneous batch size per device = %s", args.train_batch_size)
    logger.info(" Total train batch size (w. data & sequence parallel, accumulation) = %s", total_batch_size)
    logger.info(" Gradient Accumulation steps = %s", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %s", args.max_train_steps)
    logger.info(" Total training parameters per FSDP shard = %s B",
                round(count_trainable(self.transformer) / 1e9, 3))
    # Report the master weight dtype via the first parameter.
    logger.info(" Master weight dtype: %s", next(self.transformer.parameters()).dtype)

    gpu_memory_usage = current_platform.get_torch_device().memory_allocated() / 1024**2
    logger.info("GPU memory usage before train_one_step: %s MB", gpu_memory_usage)
    logger.info("VSA validation sparsity: %s", args.VSA_sparsity)
767
+
768
def _prepare_validation_batch(self, sampling_param: SamplingParam, training_args: TrainingArgs,
                              validation_batch: dict[str, Any], num_inference_steps: int) -> ForwardBatch:
    """Populate ``sampling_param`` from the validation sample + training args
    and wrap everything into a ForwardBatch for the validation pipeline.

    Note: ``sampling_param`` is mutated in place and reused across prompts.
    """
    sampling_param.prompt = validation_batch['prompt']
    sampling_param.height = training_args.num_height
    sampling_param.width = training_args.num_width
    sampling_param.num_inference_steps = num_inference_steps
    sampling_param.data_type = "video"
    if training_args.validation_guidance_scale:
        sampling_param.guidance_scale = float(training_args.validation_guidance_scale)
    assert self.seed is not None
    sampling_param.seed = self.seed

    # NOTE(review): n_tokens is computed from the *current*
    # sampling_param.num_frames, but num_frames is overwritten from
    # num_latent_t just below — confirm the two are meant to differ.
    latents_size = [(sampling_param.num_frames - 1) // 4 + 1, sampling_param.height // 8, sampling_param.width // 8]
    n_tokens = latents_size[0] * latents_size[1] * latents_size[2]
    temporal_compression_factor = training_args.pipeline_config.vae_config.arch_config.temporal_compression_ratio
    num_frames = (training_args.num_latent_t - 1) * temporal_compression_factor + 1
    sampling_param.num_frames = num_frames
    batch = ForwardBatch(
        **shallow_asdict(sampling_param),
        latents=None,
        generator=self.validation_random_generator,
        n_tokens=n_tokens,
        eta=0.0,
        VSA_sparsity=training_args.VSA_sparsity,
    )

    return batch
795
+
796
@torch.no_grad()
def _log_validation(self, transformer, training_args, global_step) -> None:
    """
    Generate a validation video and log it to the configured tracker to check the quality during training.

    Each SP-group leader renders its prompts; leaders then forward their
    videos/captions/audio to global rank 0, which writes mp4 files (muxing
    audio when present) and logs them as tracker artifacts.
    """
    training_args.inference_mode = True
    training_args.dit_cpu_offload = False
    if not training_args.log_validation:
        return
    if self.validation_pipeline is None:
        raise ValueError("Validation pipeline is not set")

    logger.info("Starting validation")

    # Create sampling parameters if not provided
    sampling_param = SamplingParam.from_pretrained(training_args.model_path)

    # Prepare validation prompts
    logger.info('rank: %s: fastvideo_args.validation_dataset_file: %s',
                self.global_rank,
                training_args.validation_dataset_file,
                local_main_process_only=False)
    validation_dataset = ValidationDataset(training_args.validation_dataset_file)
    validation_dataloader = DataLoader(validation_dataset, batch_size=None, num_workers=0)

    self.transformer.eval()
    if getattr(self, "transformer_2", None) is not None:
        self.transformer_2.eval()

    # "validation_sampling_steps" is a comma-separated list; keep positives.
    validation_steps = training_args.validation_sampling_steps.split(",")
    validation_steps = [int(step) for step in validation_steps]
    validation_steps = [step for step in validation_steps if step > 0]
    # Log validation results for this step
    world_group = get_world_group()
    num_sp_groups = world_group.world_size // self.sp_group.world_size
    # Opt-in env toggle: each rank renders only its first prompt.
    one_prompt_per_rank = os.environ.get(
        "FASTVIDEO_VALIDATION_ONE_PROMPT_PER_RANK",
        "",
    ).lower() in {"1", "true", "yes", "on"}

    # Process each validation prompt for each validation step
    for num_inference_steps in validation_steps:
        logger.info("rank: %s: num_inference_steps: %s",
                    self.global_rank,
                    num_inference_steps,
                    local_main_process_only=False)
        step_videos: list[np.ndarray] = []
        step_captions: list[str] = []

        step_audio: list[np.ndarray | None] = []
        step_sample_rates: list[int | None] = []

        for prompt_idx, validation_batch in enumerate(validation_dataloader):
            if one_prompt_per_rank and prompt_idx > 0:
                continue

            batch = self._prepare_validation_batch(sampling_param, training_args, validation_batch,
                                                   num_inference_steps)
            logger.info("rank: %s: rank_in_sp_group: %s, batch.prompt: %s",
                        self.global_rank,
                        self.rank_in_sp_group,
                        batch.prompt,
                        local_main_process_only=False)

            assert batch.prompt is not None and isinstance(batch.prompt, str)
            step_captions.append(batch.prompt)

            # Run validation inference
            output_batch = self.validation_pipeline.forward(batch, training_args)
            samples = output_batch.output.cpu()

            # Capture audio if available
            audio = output_batch.extra.get("audio")
            sample_rate = output_batch.extra.get("audio_sample_rate")

            if audio is not None and torch.is_tensor(audio):
                audio = audio.detach().cpu().float().numpy()

            step_audio.append(audio)
            step_sample_rates.append(sample_rate)

            # Only SP-group leaders materialize frames.
            if self.rank_in_sp_group != 0:
                continue

            # Process outputs
            video = rearrange(samples, "b c t h w -> t b c h w")
            frames = []
            for x in video:
                x = torchvision.utils.make_grid(x, nrow=6)
                x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
                frames.append((x * 255).numpy().astype(np.uint8))
            step_videos.append(frames)

        # Only sp_group leaders (rank_in_sp_group == 0) need to send their
        # results to global rank 0
        if self.rank_in_sp_group == 0 and self.global_rank == 0:
            # Global rank 0 collects results from all sp_group leaders
            all_videos = step_videos  # Start with own results
            all_captions = step_captions
            all_audios = step_audio
            all_sample_rates = step_sample_rates

            # Receive from other sp_group leaders
            for sp_group_idx in range(1, num_sp_groups):
                src_rank = sp_group_idx * self.sp_world_size  # Global rank of other sp_group leaders
                # Recv order must mirror the send order in the elif below.
                recv_videos = world_group.recv_object(src=src_rank)
                recv_captions = world_group.recv_object(src=src_rank)
                recv_audios = world_group.recv_object(src=src_rank)
                recv_sample_rates = world_group.recv_object(src=src_rank)

                all_videos.extend(recv_videos)
                all_captions.extend(recv_captions)
                all_audios.extend(recv_audios)
                all_sample_rates.extend(recv_sample_rates)

            video_filenames = []
            for i, (video, caption, audio, sample_rate) in enumerate(
                    zip(all_videos, all_captions, all_audios, all_sample_rates, strict=True)):
                os.makedirs(training_args.output_dir, exist_ok=True)
                filename = os.path.join(
                    training_args.output_dir,
                    f"validation_step_{global_step}_inference_steps_{num_inference_steps}_video_{i}.mp4")
                imageio.mimsave(filename, video, fps=sampling_param.fps)
                # Mux audio if available
                if (audio is not None and sample_rate is not None and not self._mux_audio(
                        filename,
                        audio,
                        sample_rate,
                )):
                    logger.warning("Audio mux failed for validation video %s; saved video without audio.", filename)
                video_filenames.append(filename)

            artifacts = []
            for filename, caption in zip(video_filenames, all_captions, strict=True):
                video_artifact = self.tracker.video(filename, caption=caption)
                if video_artifact is not None:
                    artifacts.append(video_artifact)
            if artifacts:
                logs = {f"validation_videos_{num_inference_steps}_steps": artifacts}
                self.tracker.log_artifacts(logs, global_step)
        elif self.rank_in_sp_group == 0:
            # Other sp_group leaders send their results to global rank 0
            world_group.send_object(step_videos, dst=0)
            world_group.send_object(step_captions, dst=0)
            world_group.send_object(step_audio, dst=0)
            world_group.send_object(step_sample_rates, dst=0)

        world_group.barrier()

    # Re-enable gradients for training
    training_args.inference_mode = False
    self.transformer.train()
    if getattr(self, "transformer_2", None) is not None:
        self.transformer_2.train()
950
+
951
@staticmethod
def _mux_audio(
    video_path: str,
    audio: torch.Tensor | np.ndarray,
    sample_rate: int,
) -> bool:
    """Mux audio into video using PyAV.

    Args:
        video_path: Existing video file; replaced in place on success.
        audio: Float waveform in [-1, 1], either 1-D (samples,) or 2-D.
            A 2-D array with a short first axis (<= 8) is treated as
            (channels, samples), otherwise as (samples, channels).
        sample_rate: Audio sample rate in Hz.

    Returns:
        True when audio was muxed into the video, False otherwise (PyAV
        missing, unexpected audio shape, or any encoding error).
    """
    try:
        import av
    except ImportError:
        logger.warning("PyAV not installed; cannot mux audio. "
                       "Install with: pip install av")
        return False

    if torch.is_tensor(audio):
        audio_np = audio.detach().cpu().float().numpy()
    else:
        audio_np = np.asarray(audio, dtype=np.float32)

    # Normalize layout to (samples, channels).
    if audio_np.ndim == 1:
        audio_np = audio_np[:, None]
    elif audio_np.ndim == 2:
        if audio_np.shape[0] <= 8 and audio_np.shape[1] > audio_np.shape[0]:
            audio_np = audio_np.T
    else:
        logger.warning("Unexpected audio shape %s; skipping mux.", audio_np.shape)
        return False

    audio_np = np.clip(audio_np, -1.0, 1.0)
    audio_int16 = (audio_np * 32767.0).astype(np.int16)
    num_channels = audio_int16.shape[1]
    layout = "stereo" if num_channels == 2 else "mono"

    try:
        import wave
        with tempfile.TemporaryDirectory() as tmpdir:
            out_path = os.path.join(tmpdir, "muxed.mp4")
            wav_path = os.path.join(tmpdir, "audio.wav")

            # Write audio to WAV file
            with wave.open(wav_path, "wb") as wav_file:
                wav_file.setnchannels(num_channels)
                wav_file.setsampwidth(2)
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(audio_int16.tobytes())

            # BUGFIX: open all containers as context managers so they are
            # closed even when encoding raises part-way; the previous
            # version leaked the open handles on the error path.
            with av.open(video_path) as input_video, \
                    av.open(wav_path) as input_audio, \
                    av.open(out_path, mode="w") as output:
                # Add video stream (copy codec from input)
                in_video_stream = input_video.streams.video[0]
                out_video_stream = output.add_stream(
                    codec_name=in_video_stream.codec_context.name,
                    rate=in_video_stream.average_rate,
                )
                out_video_stream.width = in_video_stream.width
                out_video_stream.height = in_video_stream.height
                out_video_stream.pix_fmt = in_video_stream.pix_fmt

                # Add audio stream (AAC)
                out_audio_stream = output.add_stream("aac", rate=sample_rate)
                out_audio_stream.layout = layout

                # Remux video (decode and re-encode to be safe)
                for frame in input_video.decode(video=0):
                    for packet in out_video_stream.encode(frame):
                        output.mux(packet)
                for packet in out_video_stream.encode():
                    output.mux(packet)

                # Encode audio
                for frame in input_audio.decode(audio=0):
                    frame.pts = None  # Let encoder assign PTS
                    for packet in out_audio_stream.encode(frame):
                        output.mux(packet)
                for packet in out_audio_stream.encode():
                    output.mux(packet)

            # Containers are flushed/closed here; replace the original file.
            shutil.move(out_path, video_path)
            return True
    except Exception as e:
        logger.warning("Audio mux failed: %s", e)
        return False
1041
+
1042
def visualize_intermediate_latents(self, training_batch: TrainingBatch, training_args: TrainingArgs, step: int):
    """Add visualization data to tracker logging and save frames to disk."""
    # Intentionally unimplemented here: pipelines that enable
    # log_visualization must override this hook in a subclass.
    raise NotImplementedError("Visualize intermediate latents is not implemented for training pipeline")
standalone_inference/overlay_files/fastvideo/training/wan_training_pipeline.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ import sys
3
+ from copy import deepcopy
4
+
5
+ from fastvideo.fastvideo_args import FastVideoArgs, TrainingArgs
6
+ from fastvideo.logger import init_logger
7
+ from fastvideo.models.schedulers.scheduling_flow_unipc_multistep import (FlowUniPCMultistepScheduler)
8
+ from fastvideo.pipelines.basic.wan.wan_pipeline import WanPipeline
9
+ from fastvideo.training.training_pipeline import TrainingPipeline
10
+ from fastvideo.utils import is_vsa_available
11
+
12
# Probing for the optional VSA kernels can raise on machines without the
# dependency; treat any failure as "not available".
try:
    vsa_available = is_vsa_available()
except Exception:
    vsa_available = False

logger = init_logger(__name__)
18
+
19
+
20
class WanTrainingPipeline(TrainingPipeline):
    """
    A training pipeline for Wan.
    """
    _required_config_modules = ["scheduler", "transformer", "vae"]

    def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
        """Install the UniPC flow-matching scheduler with the configured shift."""
        shift = fastvideo_args.pipeline_config.flow_shift
        self.modules["scheduler"] = FlowUniPCMultistepScheduler(shift=shift)

    def create_training_stages(self, training_args: TrainingArgs):
        """
        May be used in future refactors.
        """
        pass

    def initialize_validation_pipeline(self, training_args: TrainingArgs):
        """Build an inference-mode WanPipeline that reuses this pipeline's transformer."""
        logger.info("Initializing validation pipeline...")

        # Run validation on a copy of the args so training settings stay intact.
        inference_args = deepcopy(training_args)
        inference_args.inference_mode = True

        self.validation_pipeline = WanPipeline.from_pretrained(
            training_args.model_path,
            args=inference_args,  # type: ignore
            inference_mode=True,
            loaded_modules={
                "transformer": self.get_module("transformer"),
            },
            tp_size=training_args.tp_size,
            sp_size=training_args.sp_size,
            num_gpus=training_args.num_gpus,
            pin_cpu_memory=training_args.pin_cpu_memory,
            dit_cpu_offload=True)
54
+
55
+
56
def main(args) -> None:
    """Build the Wan training pipeline from parsed CLI args and run training.

    Args:
        args: Parsed argument namespace; must provide
            ``pretrained_model_name_or_path`` plus the TrainingArgs fields.
    """
    logger.info("Starting training pipeline...")

    pipeline = WanTrainingPipeline.from_pretrained(args.pretrained_model_name_or_path, args=args)
    # Removed dead code: the previous `args = pipeline.training_args`
    # reassignment was never read afterwards.
    pipeline.train()
    logger.info("Training pipeline done")
63
+
64
+
65
+ if __name__ == "__main__":
66
+ argv = sys.argv
67
+ from fastvideo.fastvideo_args import TrainingArgs
68
+ from fastvideo.utils import FlexibleArgumentParser
69
+ parser = FlexibleArgumentParser()
70
+ parser = TrainingArgs.add_cli_args(parser)
71
+ parser = FastVideoArgs.add_cli_args(parser)
72
+ args = parser.parse_args()
73
+ args.dit_cpu_offload = False
74
+ main(args)
standalone_inference/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Install FastVideo itself from the upstream project or from your local checkout.
2
+ # This file only lists the extra Python packages directly used by the helper.
3
+ huggingface_hub
4
+ safetensors
5
+ triton
standalone_inference/run.sh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Launch the standalone sparse-FP4 inference helper against a FastVideo checkout.

set -euo pipefail

# Directory containing this script (the standalone_inference bundle).
BUNDLE_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
FASTVIDEO_ROOT="${FASTVIDEO_ROOT:-}"

if [[ -z "${FASTVIDEO_ROOT}" ]]; then
    echo "FASTVIDEO_ROOT is not set."
    echo "Set it to a FastVideo source checkout or installed package root, for example:"
    echo "  FASTVIDEO_ROOT=/path/to/FastVideo bash standalone_inference/run.sh"
    exit 1
fi

# Copy the bundled overlay files into the FastVideo tree.
python "${BUNDLE_ROOT}/install_overlay.py" --fastvideo-root "${FASTVIDEO_ROOT}"

# Make the bundled kernels importable and select the sparse-FP4 backend.
export PYTHONPATH="${FASTVIDEO_ROOT}/fastvideo-kernel/python:${FASTVIDEO_ROOT}/fastvideo-kernel:${PYTHONPATH:-}"
export FASTVIDEO_ATTENTION_BACKEND=SPARSE_FP4_OURS_P_ATTN
export FASTVIDEO_SPARSE_FP4_USE_HIGH_PREC_O=1

# Run from the checkout root so relative paths resolve; forward all CLI args.
cd "${FASTVIDEO_ROOT}"
python "${BUNDLE_ROOT}/run_inference.py" "$@"
standalone_inference/run_inference.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Run Wan T2V inference with the sparse FP4 checkpoint-700 transformer."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import os
8
+ from pathlib import Path
9
+
10
+
11
# Default prompt/negative prompt used when the CLI does not override them.
DEFAULT_PROMPT = (
    "In the video, a woman is elegantly showcasing her earrings, bringing "
    "attention to their intricate design with a gentle touch of her fingers. "
    "She is bathed in ambient purple and pink lighting, which casts a soft "
    "glow on her delicate features and enhances the vivid tones of her lipstick "
    "and eye makeup. Her hair is styled to frame her face smoothly, emphasizing "
    "the contours of her jawline and cheekbones. The background features a "
    "blurred neon light, adding an artistic and modern touch to the overall "
    "aesthetic."
)

DEFAULT_NEGATIVE_PROMPT = (
    "Bright tones, overexposed, static, blurred details, subtitles, style, "
    "works, paintings, images, static, overall gray, worst quality, low quality, "
    "JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn "
    "hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused "
    "fingers, still picture, messy background, three legs, many people in the "
    "background, walking backwards"
)
30
+
31
+
32
+ def _resolve_weights(repo_id: str, weights: str | None, local_dir: str) -> str:
33
+ if weights:
34
+ path = Path(weights).expanduser()
35
+ if path.exists():
36
+ return str(path.resolve())
37
+ raise FileNotFoundError(f"--weights does not exist: {path}")
38
+
39
+ from huggingface_hub import hf_hub_download
40
+
41
+ path = hf_hub_download(
42
+ repo_id=repo_id,
43
+ filename="transformer/diffusion_pytorch_model.safetensors",
44
+ local_dir=local_dir,
45
+ repo_type="model",
46
+ )
47
+ return str(Path(path).resolve())
48
+
49
+
50
def main() -> int:
    """CLI entry point: resolve checkpoint weights, build a VideoGenerator,
    and render a single video with the sparse-FP4 attention backend.

    Returns:
        Process exit code (0 on success).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo-id", default="yitongl/sparse_quant_exp")
    parser.add_argument(
        "--model-path",
        default="Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
        help="Base Wan Diffusers model repo/path.",
    )
    parser.add_argument("--weights", default=None)
    parser.add_argument(
        "--local-dir",
        default="checkpoints/hf_download/sparse_quant_exp",
        help="Local Hugging Face download directory for the uploaded weights.",
    )
    parser.add_argument("--prompt", default=DEFAULT_PROMPT)
    parser.add_argument("--negative-prompt", default=DEFAULT_NEGATIVE_PROMPT)
    parser.add_argument("--output-path", default="outputs/sfp4_checkpoint_700")
    parser.add_argument("--height", type=int, default=448)
    parser.add_argument("--width", type=int, default=832)
    parser.add_argument("--num-frames", type=int, default=77)
    parser.add_argument("--num-inference-steps", type=int, default=50)
    parser.add_argument("--fps", type=int, default=16)
    parser.add_argument("--guidance-scale", type=float, default=5.0)
    parser.add_argument("--flow-shift", type=float, default=1.0)
    parser.add_argument("--seed", type=int, default=1000)
    parser.add_argument("--vsa-sparsity", type=float, default=0.9)
    parser.add_argument("--num-gpus", type=int, default=1)
    parser.add_argument("--sp-size", type=int, default=1)
    parser.add_argument("--tp-size", type=int, default=1)
    # BUGFIX: the old `action="store_true", default=True` made this flag a
    # no-op — it was always True and could never be disabled from the CLI.
    # BooleanOptionalAction keeps `--text-encoder-cpu-offload` working and
    # adds `--no-text-encoder-cpu-offload` to turn offload off.
    parser.add_argument("--text-encoder-cpu-offload",
                        action=argparse.BooleanOptionalAction,
                        default=True)
    parser.add_argument("--pin-cpu-memory", action="store_true", default=False)
    args = parser.parse_args()

    # Select the sparse-FP4 backend unless the caller already chose one.
    os.environ.setdefault("FASTVIDEO_ATTENTION_BACKEND", "SPARSE_FP4_OURS_P_ATTN")
    os.environ.setdefault("FASTVIDEO_SPARSE_FP4_USE_HIGH_PREC_O", "1")

    weights_path = _resolve_weights(args.repo_id, args.weights, args.local_dir)

    # Import after the environment is configured so the backend choice sticks.
    from fastvideo import VideoGenerator

    generator = VideoGenerator.from_pretrained(
        model_path=args.model_path,
        num_gpus=args.num_gpus,
        sp_size=args.sp_size,
        tp_size=args.tp_size,
        init_weights_from_safetensors=weights_path,
        dit_cpu_offload=False,
        vae_cpu_offload=False,
        text_encoder_cpu_offload=args.text_encoder_cpu_offload,
        pin_cpu_memory=args.pin_cpu_memory,
        flow_shift=args.flow_shift,
        VSA_sparsity=args.vsa_sparsity,
    )

    result = generator.generate_video(
        prompt=args.prompt,
        negative_prompt=args.negative_prompt,
        output_path=args.output_path,
        save_video=True,
        return_frames=False,
        height=args.height,
        width=args.width,
        num_frames=args.num_frames,
        num_inference_steps=args.num_inference_steps,
        fps=args.fps,
        guidance_scale=args.guidance_scale,
        seed=args.seed,
    )
    print(result)
    return 0
120
+
121
+
122
+ if __name__ == "__main__":
123
+ raise SystemExit(main())
standalone_inference/training_attention_settings.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_name": "sfp4_v4_sparse09_hpo_on_ours_p_init2050_1n_interactive",
3
+ "checkpoint": "checkpoint-700",
4
+ "training_method": "legacy_sft_wan_training_pipeline",
5
+ "model_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
6
+ "init_weights_from_safetensors": "checkpoints/init/sfp4_v4_sparse06_hpo_on_ours_p_1n_interactive_v2_ckpt2050/transformer/diffusion_pytorch_model.safetensors",
7
+ "environment": {
8
+ "FASTVIDEO_ATTENTION_BACKEND": "SPARSE_FP4_OURS_P_ATTN",
9
+ "FASTVIDEO_SPARSE_FP4_USE_HIGH_PREC_O": "1",
10
+ "FASTVIDEO_VALIDATION_ONE_PROMPT_PER_RANK": "1",
11
+ "WANDB_MODE": "online",
12
+ "WANDB_RESUME": "allow"
13
+ },
14
+ "vsa_schedule": {
15
+ "VSA_SPARSITY": 0.9,
16
+ "VSA_INIT_SPARSITY": 0.9,
17
+ "VSA_WARMUP_STEPS": 0,
18
+ "VSA_DECAY_RATE": 0.03,
19
+ "VSA_DECAY_INTERVAL_STEPS": 50,
20
+ "effective_sparsity_from_step_0": 0.9
21
+ },
22
+ "attention_semantics": {
23
+ "selected_backend": "SPARSE_FP4_OURS_P_ATTN",
24
+ "self_attention": {
25
+ "backend_path": "fastvideo/attention/backends/sparse_fp4_ours_p_attn.py",
26
+ "kernel_path": "fastvideo-kernel/python/fastvideo_kernel/triton_kernels/block_sparse_attn_triton_ours_p.py",
27
+ "tile_size_video": [4, 4, 4],
28
+ "tile_tokens": 64,
29
+ "qkv_quantization": "FP4 fake quantization with STE, no q/k mean subtraction in quantization",
30
+ "block_selection": "top-k blocks from q_c @ k_c tile-mean scores",
31
+ "p_quantization": "group-local exp2(qk - group_max) FP4 fake quantization; compensation multiplies exp2(group_max - running_row_m)",
32
+ "dropped_tile_handling": "tile-level q_mean/k_mean score and mean_v compensation"
33
+ },
34
+ "cross_attention": {
35
+ "backend": "dense_sdpa",
36
+ "reason": "sparse_fp4_ours_p_attn.py treats query_length != key_length as cross attention and returns _dense_sdpa_blhd",
37
+ "quantized": false,
38
+ "sparse": false
39
+ },
40
+ "force_dense": {
41
+ "backend": "dense_sdpa",
42
+ "used_for": "teacher or explicitly forced dense paths, not the normal SFT student self-attention path"
43
+ }
44
+ },
45
+ "validation_and_checkpointing": {
46
+ "save_steps": 50,
47
+ "eval_steps": 50,
48
+ "validation_sampling_steps": 50,
49
+ "validation_guidance_scale": 5.0,
50
+ "checkpoints_total_limit": 5,
51
+ "flow_shift": 1.0
52
+ },
53
+ "training_shape": {
54
+ "num_latent_t": 20,
55
+ "num_frames": 77,
56
+ "height": 448,
57
+ "width": 832,
58
+ "batch_size_per_gpu": 1,
59
+ "sp_size": 1,
60
+ "tp_size": 1
61
+ }
62
+ }