[WIP] diffusers integration

#21

by kencwt - opened 26 days ago

base: refs/heads/main

←

from: refs/pr/21

Discussion Files changed

+10337

-3693

Files changed (21) hide show

.gitattributes +10 -0
.gitignore +0 -0
README.md +126 -74
_fm_solvers_unipc.py +0 -759
assets/astronaut.mp4 +3 -0
assets/bird.mp4 +3 -0
assets/fisherman.mp4 +3 -0
assets/i2v_sample.jpg +3 -0
assets/sage_compare_BF16.webp +3 -0
assets/sage_compare_Q4_K_M.webp +3 -0
assets/sage_compare_Q5_K_M.webp +3 -0
assets/sage_compare_Q8_0.webp +3 -0
assets/underwater.mp4 +3 -0
assets/woman.mp4 +3 -0
docs/gguf-sageattention.md +132 -0
docs/memory-efficient-inference.md +88 -0
inference.py +0 -119
model_index.json +1 -1
pipeline_motif_video.py +0 -1388
transformer/config.json +0 -2
transformer/transformer_motif_video.py +0 -1350

.gitattributes CHANGED Viewed

@@ -40,3 +40,13 @@ assets/showcase_t2v.png filter=lfs diff=lfs merge=lfs -text
 tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 assets/i2v_sample.jpg filter=lfs diff=lfs merge=lfs -text
 motif-video-technical-report.pdf filter=lfs diff=lfs merge=lfs -text

 tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 assets/i2v_sample.jpg filter=lfs diff=lfs merge=lfs -text
 motif-video-technical-report.pdf filter=lfs diff=lfs merge=lfs -text
+assets/astronaut.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/bird.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/fisherman.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/underwater.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/vows.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/woman.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/sage_compare_BF16.webp filter=lfs diff=lfs merge=lfs -text
+assets/sage_compare_Q4_K_M.webp filter=lfs diff=lfs merge=lfs -text
+assets/sage_compare_Q5_K_M.webp filter=lfs diff=lfs merge=lfs -text
+assets/sage_compare_Q8_0.webp filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

The diff for this file is too large to render. See raw diff

README.md CHANGED Viewed

@@ -9,6 +9,22 @@ tags:
 - diffusion-transformer
 pipeline_tag: text-to-video
 library_name: diffusers
 ---
 <p align="center">
@@ -31,8 +47,25 @@ library_name: diffusers
 ---
 ## 🔥 News
 - **[2026-04-14]** We release **Motif-Video 2B**, our 2B-parameter text-to-video and image-to-video diffusion transformer, together with the full [technical report](https://arxiv.org/abs/2604.16503).
 ---
@@ -108,41 +141,74 @@ For the full derivation of why Shared Cross-Attention shares K/V but not Q, and
 ### Requirements
 - Python 3.10+
-- CUDA-capable GPU with **30GB+ VRAM** (e.g., A100, H100) — for 24GB GPUs see [Memory-efficient Inference](#-memory-efficient-inference)
 ```bash
-pip install "diffusers>=0.35.2" "transformers>=5.5.4" torch accelerate ftfy einops sentencepiece regex Pillow imageio imageio-ffmpeg
 ```
 ### Text-to-Video (T2V)
 ```python
 import torch
-from diffusers import AdaptiveProjectedGuidance, DiffusionPipeline
 from diffusers.utils import export_to_video
 guider = AdaptiveProjectedGuidance(
     guidance_scale=8.0,
     adaptive_projected_guidance_rescale=12.0,
     adaptive_projected_guidance_momentum=0.1,
     use_original_formulation=True,
 )
-pipe = DiffusionPipeline.from_pretrained(
     "Motif-Technologies/Motif-Video-2B",
-    custom_pipeline="pipeline_motif_video",
-    trust_remote_code=True,
     torch_dtype=torch.bfloat16,
     guider=guider,
 )
 pipe = pipe.to("cuda")
 output = pipe(
-    prompt="A category-five hurricane, viewed from inside the eye, reveals a circular stadium of cloud walls rising to fifty thousand feet with an eerie disk of blue sky directly overhead. Shot from a NOAA reconnaissance aircraft mounted camera, the perspective looks outward toward the eyewall — a near-vertical curtain of rotating cloud and lightning that is simultaneously terrifying and transcendent. The inner surface of the eyewall catches the setting sun, painting it in improbable shades of peach and rose. The camera slowly pans 360 degrees to complete one full revolution, capturing the entire coliseum of the storm. Below, the ocean surface is a white blur of foam and spray. The documentary-style cinematography strips away all artifice to present the storm as an entity of pure elemental power.",
     height=736,
     width=1280,
     num_frames=121,
     num_inference_steps=50,
 )
 export_to_video(output.frames[0], "output.mp4", fps=24)
@@ -152,7 +218,11 @@ export_to_video(output.frames[0], "output.mp4", fps=24)
 ```python
 import torch
-from diffusers import AdaptiveProjectedGuidance, DiffusionPipeline
 from diffusers.utils import export_to_video, load_image
 guider = AdaptiveProjectedGuidance(
@@ -160,26 +230,38 @@ guider = AdaptiveProjectedGuidance(
     adaptive_projected_guidance_rescale=12.0,
     adaptive_projected_guidance_momentum=0.1,
     use_original_formulation=True,
 )
-pipe = DiffusionPipeline.from_pretrained(
     "Motif-Technologies/Motif-Video-2B",
-    custom_pipeline="pipeline_motif_video",
-    trust_remote_code=True,
     torch_dtype=torch.bfloat16,
     guider=guider,
 )
 pipe = pipe.to("cuda")
 image = load_image("https://huggingface.co/Motif-Technologies/Motif-Video-2B/resolve/main/assets/i2v_sample.jpg")
 output = pipe(
-    prompt="Three friends stride through a sun-bleached meadow as a warm breeze ripples the tall dry grass around their legs. The woman on the left turns her head to share a quiet laugh, the woman in the center pushes a loose curl behind her ear, and the man on the right tilts his face toward the sky. The camera drifts gently alongside them at walking pace, handheld, with soft overcast light.",
     image=image,
     height=736,
     width=1280,
     num_frames=121,
     num_inference_steps=50,
 )
 export_to_video(output.frames[0], "output.mp4", fps=24)
@@ -188,96 +270,66 @@ export_to_video(output.frames[0], "output.mp4", fps=24)
 ### CLI Inference
 ```bash
-# Text-to-Video
 python inference.py \
-  --prompt "A time-lapse of a flower blooming in a dark room, dramatic lighting" \
   --output t2v_output.mp4
-# Image-to-Video
 python inference.py \
-  --image assets/i2v_sample.jpg \
-  --prompt "Three friends stride through a meadow as a warm breeze ripples the tall grass" \
-  --output i2v_output.mp4
 ```
-See `inference.py` for all available options (`--help`).
 ### Recommended Settings
 | Parameter | Default | Notes |
 |---|---|---|
-| Resolution | 1280x736 | 720p, best quality |
 | Frames | 121 | ~5 seconds at 24fps |
-| Guidance scale | 8.0 | |
 | Inference steps | 50 | |
 | dtype | bfloat16 | Recommended for H100/A100 |
 ### 🔋 Memory-efficient Inference
-By default, `pipe.to("cuda")` loads all components onto the GPU simultaneously, requiring **~30 GB VRAM**.
-For GPUs with 24 GB or less (e.g. RTX 4090, RTX 3090), use `enable_model_cpu_offload()` with the `expandable_segments` allocator setting:
-```bash
-export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
-```
-```python
-pipe = DiffusionPipeline.from_pretrained(
-    "Motif-Technologies/Motif-Video-2B",
-    custom_pipeline="pipeline_motif_video",
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16,
-    guider=guider,  # see T2V example above
-)
-pipe.enable_model_cpu_offload()  # replaces pipe.to("cuda")
-output = pipe(prompt="...", height=736, width=1280, num_frames=121, num_inference_steps=50)
-export_to_video(output.frames[0], "output.mp4", fps=24)
-```
-This moves each component (text encoder → transformer → VAE) to GPU only when needed. The `expandable_segments` setting allows the CUDA memory allocator to efficiently reuse memory released by earlier components, avoiding fragmentation-related OOM errors.
-| Mode | Peak VRAM | Speed | Recommended GPU |
-|------|-----------|-------|-----------------|
-| `pipe.to("cuda")` | ~30 GB | Fastest | A100, H100, H200 |
-| `enable_model_cpu_offload()` | ~19 GB | Similar | RTX 4090, RTX 3090 |
-#### FP8 Weight Quantization (Optional)
-For further VRAM reduction, you can quantize the transformer weights to FP8 using [torchao](https://github.com/pytorch/ao):
-```bash
-pip install torchao
-```
-```python
-from torchao.quantization import quantize_, Float8WeightOnlyConfig
-pipe = DiffusionPipeline.from_pretrained(
-    "Motif-Technologies/Motif-Video-2B",
-    custom_pipeline="pipeline_motif_video",
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16,
-    guider=guider,  # see T2V example above
-)
-quantize_(pipe.transformer, Float8WeightOnlyConfig())
-pipe.enable_model_cpu_offload()
-output = pipe(prompt="...", height=736, width=1280, num_frames=121, num_inference_steps=50)
-export_to_video(output.frames[0], "output.mp4", fps=24)
-```
-This stores the transformer weights in FP8 (8-bit) instead of BF16 (16-bit), reducing peak VRAM from ~19 GB to ~15 GB while keeping all computation in BF16 precision.
-| Mode | Peak VRAM | Notes |
-|------|-----------|-------|
-| `enable_model_cpu_offload()` | ~19 GB | BF16 baseline |
-| `+ Float8WeightOnlyConfig` | ~15 GB | FP8 weights, BF16 compute |
 ### 🖥️ ComfyUI
-Official ComfyUI custom nodes for Motif-Video 2B are currently in development. Stay tuned for updates.
 ---

 - diffusion-transformer
 pipeline_tag: text-to-video
 library_name: diffusers
+widget:
+  - text: "A vibrant blue jay perches gracefully on a slender branch, its feathers shimmering in the soft morning light. The bird's keen eyes scan the surroundings, capturing the essence of the tranquil forest. It flutters its wings briefly, showcasing the intricate patterns of blue, white, and black on its plumage. The background reveals a lush canopy of green leaves, with rays of sunlight filtering through, creating a dappled effect on the forest floor. The blue jay then tilts its head, emitting a melodious call that echoes through the serene woodland, adding a touch of magic to the peaceful scene."
+    output:
+      url: assets/bird.mp4
+  - text: "Underwater footage of a vibrant coral reef ecosystem with tropical fish swimming through coral formations. Natural sunlight filtering down through clear water creates dancing light patterns on the reef. Smooth underwater camera movement, natural color correction preserving authentic ocean blues and coral colors, documentary marine biology style, peaceful and educational mood."
+    output:
+      url: assets/underwater.mp4
+  - text: "An old fisherman mends his nets on a stone harbor wall, weathered hands moving with practiced speed through the green mesh. Shot on a 50mm lens with a slow dolly-in from his side, the afternoon sun throws warm light across his salt-stained coat and the worn granite beneath him. Behind him, a single wooden boat bobs gently in a turquoise bay. Gulls drift through the distant sky in soft focus. The camera settles on his hands, then racks focus to his weathered, squinting eyes."
+    output:
+      url: assets/fisherman.mp4
+  - text: "A lone astronaut drifts just outside a derelict space station, tethered by a single silver line as Earth's terminator glows blue-white behind her. Shot with a slow wide-to-medium push, the camera floats alongside her in weightless silence, the curvature of the planet filling the lower third of the frame. Sunlight rakes across the hull's scarred panels, casting long hard shadows that stretch and shift as she rotates. Her visor reflects the aurora below, ribbons of green pulling across the glass. She reaches out with a gloved hand and lets her fingertips graze a dented antenna, the gesture small and reverent."
+    output:
+      url: assets/astronaut.mp4
+  - text: "A woman standing in a sunlit field as flower petals swirl around her in slow motion. Each petal floats gently through the golden light, casting tiny shadows. Her hair moves like water, and time seems to stand still."
+    output:
+      url: assets/woman.mp4
 ---
 <p align="center">
 ---
+<!--
+  NOTE: This README is written against the CURRENT state of diffusers PR #13551
+  (pre-merge). The PR currently has issues:
+  - negative_prompt defaults to None (should be built-in)
+  - use_linear_quadratic_schedule defaults to True (should be False)
+  - DPMSolverMultistepScheduler crashes (pipeline always passes sigmas)
+  - No built-in SageAttention support (requires manual patching)
+  Code examples below include workarounds (explicit negative_prompt,
+  use_linear_quadratic_schedule=False, FlowDPMSolver subclass).
+  TODO: Update after PR feedback is applied, and again after merge.
+  Tracking: https://github.com/MotifTechnologies/diffusers/pull/1
+-->
 ## 🔥 News
+- **[2026-04-28]** **ComfyUI custom nodes** released: [ComfyUI-MotifVideo2B](https://github.com/MotifTechnologies/ComfyUI-MotifVideo2B). GGUF workflow support coming soon.
+- **[2026-04-28]** **GGUF quantized weights** now available at [Motif-Video-2B-GGUF](https://huggingface.co/Motif-Technologies/Motif-Video-2B-GGUF) — up to 2.7 GB VRAM savings with no speed penalty. **SageAttention** support for ~1.6× faster inference. See [GGUF + SageAttention](#-gguf--sageattention) below.
 - **[2026-04-14]** We release **Motif-Video 2B**, our 2B-parameter text-to-video and image-to-video diffusion transformer, together with the full [technical report](https://arxiv.org/abs/2604.16503).
 ---
 ### Requirements
 - Python 3.10+
+- CUDA-capable GPU with **30GB+ VRAM** (e.g., A100, H100) — for 24GB GPUs see [Memory-efficient Inference](#-memory-efficient-inference)
 ```bash
+pip install "transformers>=5.5.4" torch accelerate ftfy einops sentencepiece regex Pillow imageio imageio-ffmpeg
+pip install git+https://github.com/waitingcheung/diffusers.git@feat/motif-video
 ```
 ### Text-to-Video (T2V)
 ```python
 import torch
+from diffusers import (
+    AdaptiveProjectedGuidance,
+    DPMSolverMultistepScheduler,
+    MotifVideoPipeline,
+)
 from diffusers.utils import export_to_video
+# DPMSolver++ subclass: ignores pipeline-supplied sigmas and builds its own
+# flow-matching schedule. Will be unnecessary once PR #13551 adds the
+# _is_flow_multistep branch.
+class FlowDPMSolver(DPMSolverMultistepScheduler):
+    def set_timesteps(self, num_inference_steps=None, device=None,
+                      sigmas=None, mu=None, timesteps=None):
+        if sigmas is not None and num_inference_steps is None:
+            num_inference_steps = len(sigmas)
+        super().set_timesteps(
+            num_inference_steps=num_inference_steps,
+            device=device, timesteps=timesteps,
+        )
 guider = AdaptiveProjectedGuidance(
     guidance_scale=8.0,
     adaptive_projected_guidance_rescale=12.0,
     adaptive_projected_guidance_momentum=0.1,
     use_original_formulation=True,
+    normalization_dims="spatial",
 )
+pipe = MotifVideoPipeline.from_pretrained(
     "Motif-Technologies/Motif-Video-2B",
+    revision="diffusers-integration",
     torch_dtype=torch.bfloat16,
     guider=guider,
 )
+# DPMSolver++ for faster convergence
+pipe.scheduler = FlowDPMSolver(
+    num_train_timesteps=pipe.scheduler.config.get("num_train_timesteps", 1000),
+    algorithm_type="dpmsolver++",
+    solver_order=2,
+    prediction_type="flow_prediction",
+    use_flow_sigmas=True,
+    flow_shift=15.0,
+)
 pipe = pipe.to("cuda")
 output = pipe(
+    prompt="A woman standing in a sunlit field as flower petals swirl around her in slow motion. Each petal floats gently through the golden light, casting tiny shadows. Her hair moves like water, and time seems to stand still.",
+    negative_prompt="text overlay, graphic overlay, watermark, logo, subtitles, timestamp, broadcast graphics, UI elements, random letters, frozen pose, rigid, static expression, jerky motion, mechanical motion, discontinuous motion, flat framing, depthless, dull lighting, monotone, crushed shadows, blown-out highlights, shifting background, fading background, poor continuity, identity drift, deformation, flickering, ghosting, smearing, duplication, mutated proportions, inconsistent clothing, flat colors, desaturated, tonally compressed, poor background separation, exposure shift, uneven brightness, color balance shift",
     height=736,
     width=1280,
     num_frames=121,
     num_inference_steps=50,
+    frame_rate=24,
+    use_linear_quadratic_schedule=False,
 )
 export_to_video(output.frames[0], "output.mp4", fps=24)
 ```python
 import torch
+from diffusers import (
+    AdaptiveProjectedGuidance,
+    DPMSolverMultistepScheduler,
+    MotifVideoPipeline,
+)
 from diffusers.utils import export_to_video, load_image
 guider = AdaptiveProjectedGuidance(
     adaptive_projected_guidance_rescale=12.0,
     adaptive_projected_guidance_momentum=0.1,
     use_original_formulation=True,
+    normalization_dims="spatial",
 )
+pipe = MotifVideoPipeline.from_pretrained(
     "Motif-Technologies/Motif-Video-2B",
+    revision="diffusers-integration",
     torch_dtype=torch.bfloat16,
     guider=guider,
 )
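+# FlowDPMSolver is the DPMSolverMultistepScheduler subclass defined in the
+# Text-to-Video example above; copy that class definition here when running
+# this snippet on its own.
+pipe.scheduler = FlowDPMSolver(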
+    num_train_timesteps=pipe.scheduler.config.get("num_train_timesteps", 1000),
+    algorithm_type="dpmsolver++",
+    solver_order=2,
+    prediction_type="flow_prediction",
+    use_flow_sigmas=True,
+    flow_shift=15.0,
+)
 pipe = pipe.to("cuda")
 image = load_image("https://huggingface.co/Motif-Technologies/Motif-Video-2B/resolve/main/assets/i2v_sample.jpg")
 output = pipe(
+    prompt="Three friends stride through a sun-bleached meadow as a warm breeze ripples the tall dry grass around their legs.",
+    negative_prompt="text overlay, graphic overlay, watermark, logo, subtitles, timestamp, broadcast graphics, UI elements, random letters, frozen pose, rigid, static expression, jerky motion, mechanical motion, discontinuous motion, flat framing, depthless, dull lighting, monotone, crushed shadows, blown-out highlights, shifting background, fading background, poor continuity, identity drift, deformation, flickering, ghosting, smearing, duplication, mutated proportions, inconsistent clothing, flat colors, desaturated, tonally compressed, poor background separation, exposure shift, uneven brightness, color balance shift",
     image=image,
     height=736,
     width=1280,
     num_frames=121,
     num_inference_steps=50,
+    frame_rate=24,
+    use_linear_quadratic_schedule=False,
 )
 export_to_video(output.frames[0], "output.mp4", fps=24)
 ### CLI Inference
 ```bash
+# Text-to-Video (default settings)
 python inference.py \
+  --prompt "A woman standing in a sunlit field as..." \
   --output t2v_output.mp4
+# With SageAttention (~1.6x faster, requires sageattention package)
 python inference.py \
+  --prompt "Three friends stride through a sun-bleached meadow..." \
+  --use-sage-attention \
+  --output t2v_output.mp4
 ```
+See `inference.py --help` for all available options.
 ### Recommended Settings
 | Parameter | Default | Notes |
 |---|---|---|
+| Resolution | 1280×736 | 720p, best quality |
 | Frames | 121 | ~5 seconds at 24fps |
+| Scheduler | DPMSolver++ | `solver_order=2`, `flow_shift=15.0` |
+| Guidance scale | 8.0 | With APG (`normalization_dims="spatial"`) |
 | Inference steps | 50 | |
+| Negative prompt | (see examples) | Not built in yet — pass it explicitly as in the code examples above |
+| `use_linear_quadratic_schedule` | `False` | Must be set explicitly |
 | dtype | bfloat16 | Recommended for H100/A100 |
 ### 🔋 Memory-efficient Inference
+For GPUs with 24 GB or less (e.g. RTX 4090, RTX 3090), CPU offloading and FP8 quantization can reduce peak VRAM from ~30 GB to ~15 GB with minimal speed impact.
+| Mode | Peak VRAM | Recommended GPU |
+|------|-----------|-----------------|
+| `pipe.to("cuda")` | ~30 GB | A100, H100, H200 |
+| `enable_model_cpu_offload()` | ~19 GB | RTX 4090, RTX 3090 |
+| `+ FP8 quantization` | ~15 GB | RTX 4090, RTX 3090 |
+> **Full guide** → [docs/memory-efficient-inference.md](docs/memory-efficient-inference.md)
+---
+### 🧊 GGUF + SageAttention
+GGUF quantized weights at [Motif-Video-2B-GGUF](https://huggingface.co/Motif-Technologies/Motif-Video-2B-GGUF) — up to 2.7 GB VRAM savings with no speed penalty. Combined with [SageAttention](https://github.com/thu-ml/SageAttention) for ~1.6× faster inference.
+| Variant | Sage (s/it) | Speedup | Peak alloc (GB) |
+|---------|------------|---------|-----------------|
+| BF16    | 14.75      | 1.58x   | 15.12           |
+| Q8_0    | 14.49      | 1.60x   | 13.44           |
+| Q4_K_M  | 14.59      | 1.60x   | 12.53           |
+> **Full guide** → [docs/gguf-sageattention.md](docs/gguf-sageattention.md)
+---
 ### 🖥️ ComfyUI
+Official ComfyUI custom nodes: [ComfyUI-MotifVideo2B](https://github.com/MotifTechnologies/ComfyUI-MotifVideo2B)
+> **Note:** Currently requires **High VRAM** mode. GGUF quantized model loading in ComfyUI is in progress.
 ---

_fm_solvers_unipc.py DELETED Viewed

@@ -1,759 +0,0 @@
-# Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py
-# Convert unipc for flow matching
-# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
-import math
-from typing import List, Optional, Tuple, Union
-import numpy as np
-import torch
-from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.schedulers.scheduling_utils import (
-    KarrasDiffusionSchedulers,
-    SchedulerMixin,
-    SchedulerOutput,
-)
-from diffusers.utils import deprecate, is_scipy_available
-if is_scipy_available():
-    pass
-class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
-    """
-    `UniPCMultistepScheduler` is a training-free framework designed for the fast sampling of diffusion models.
-    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
-    methods the library implements for all schedulers such as loading and saving.
-    Args:
-        num_train_timesteps (`int`, defaults to 1000):
-            The number of diffusion steps to train the model.
-        solver_order (`int`, default `2`):
-            The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1`
-            due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for
-            unconditional sampling.
-        prediction_type (`str`, defaults to "flow_prediction"):
-            Prediction type of the scheduler function; must be `flow_prediction` for this scheduler, which predicts
-            the flow of the diffusion process.
-        thresholding (`bool`, defaults to `False`):
-            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
-            as Stable Diffusion.
-        dynamic_thresholding_ratio (`float`, defaults to 0.995):
-            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
-        sample_max_value (`float`, defaults to 1.0):
-            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`.
-        predict_x0 (`bool`, defaults to `True`):
-            Whether to use the updating algorithm on the predicted x0.
-        solver_type (`str`, default `bh2`):
-            Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2`
-            otherwise.
-        lower_order_final (`bool`, default `True`):
-            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
-            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
-        disable_corrector (`list`, default `[]`):
-            Decides which step to disable the corrector to mitigate the misalignment between `epsilon_theta(x_t, c)`
-            and `epsilon_theta(x_t^c, c)` which can influence convergence for a large guidance scale. Corrector is
-            usually disabled during the first few steps.
-        solver_p (`SchedulerMixin`, default `None`):
-            Any other scheduler that if specified, the algorithm becomes `solver_p + UniC`.
-        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
-            Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
-            the sigmas are determined according to a sequence of noise levels {σi}.
-        use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
-            Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
-        timestep_spacing (`str`, defaults to `"linspace"`):
-            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
-            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
-        steps_offset (`int`, defaults to 0):
-            An offset added to the inference steps, as required by some model families.
-        final_sigmas_type (`str`, defaults to `"zero"`):
-            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
-            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
-    """
-    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
-    order = 1
-    @register_to_config
-    def __init__(
-        self,
-        num_train_timesteps: int = 1000,
-        solver_order: int = 2,
-        prediction_type: str = "flow_prediction",
-        shift: Optional[float] = 1.0,
-        use_dynamic_shifting=False,
-        thresholding: bool = False,
-        dynamic_thresholding_ratio: float = 0.995,
-        sample_max_value: float = 1.0,
-        predict_x0: bool = True,
-        solver_type: str = "bh2",
-        lower_order_final: bool = True,
-        disable_corrector: List[int] = [],
-        solver_p: Optional[SchedulerMixin] = None,
-        timestep_spacing: str = "linspace",
-        steps_offset: int = 0,
-        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
-    ):
-        if solver_type not in ["bh1", "bh2"]:
-            if solver_type in ["midpoint", "heun", "logrho"]:
-                self.register_to_config(solver_type="bh2")
-            else:
-                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
-        self.predict_x0 = predict_x0
-        # setable values
-        self.num_inference_steps = None
-        alphas = np.linspace(1, 1 / num_train_timesteps, num_train_timesteps)[::-1].copy()
-        sigmas = 1.0 - alphas
-        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32)
-        if not use_dynamic_shifting:
-            # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution
-            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)  # pyright: ignore
-        self.sigmas = sigmas
-        self.timesteps = sigmas * num_train_timesteps
-        self.model_outputs = [None] * solver_order
-        self.timestep_list = [None] * solver_order
-        self.lower_order_nums = 0
-        self.disable_corrector = disable_corrector
-        self.solver_p = solver_p
-        self.last_sample = None
-        self._step_index = None
-        self._begin_index = None
-        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
-        self.sigma_min = self.sigmas[-1].item()
-        self.sigma_max = self.sigmas[0].item()
-    @property
-    def step_index(self):
-        """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
-        """
-        return self._step_index
-    @property
-    def begin_index(self):
-        """
-        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
-        """
-        return self._begin_index
-    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
-    def set_begin_index(self, begin_index: int = 0):
-        """
-        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
-        Args:
-            begin_index (`int`):
-                The begin index for the scheduler.
-        """
-        self._begin_index = begin_index
-    # Modified from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.set_timesteps
-    def set_timesteps(
-        self,
-        num_inference_steps: Union[int, None] = None,
-        device: Optional[Union[str, torch.device]] = None,
-        sigmas: Optional[List[float]] = None,
-        mu: Optional[Union[float, None]] = None,
-        shift: Optional[Union[float, None]] = None,
-    ):
-        """
-        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
-        Args:
-            num_inference_steps (`int`):
-                Total number of the spacing of the time steps.
-            device (`str` or `torch.device`, *optional*):
-                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        """
-        if self.config.use_dynamic_shifting and mu is None:
-            raise ValueError(" you have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`")
-        if sigmas is None:
-            sigmas = np.linspace(self.sigma_max, self.sigma_min, num_inference_steps + 1).copy()[:-1]  # pyright: ignore
-        if self.config.use_dynamic_shifting:
-            sigmas = self.time_shift(mu, 1.0, sigmas)  # pyright: ignore
-        else:
-            if shift is None:
-                shift = self.config.shift
-            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)  # pyright: ignore
-        if self.config.final_sigmas_type == "sigma_min":
-            sigma_last = self.config.sigma_min
-        elif self.config.final_sigmas_type == "zero":
-            sigma_last = 0
-        else:
-            raise ValueError(
-                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
-            )
-        timesteps = sigmas * self.config.num_train_timesteps
-        sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)  # pyright: ignore
-        self.sigmas = torch.from_numpy(sigmas)
-        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
-        self.num_inference_steps = len(timesteps)
-        self.model_outputs = [
-            None,
-        ] * self.config.solver_order
-        self.lower_order_nums = 0
-        self.last_sample = None
-        if self.solver_p:
-            self.solver_p.set_timesteps(self.num_inference_steps, device=device)
-        # add an index counter for schedulers that allow duplicated timesteps
-        self._step_index = None
-        self._begin_index = None
-        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
-    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
-    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
-        """
-        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
-        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
-        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
-        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
-        photorealism as well as better image-text alignment, especially when using very large guidance weights."
-        https://arxiv.org/abs/2205.11487
-        """
-        dtype = sample.dtype
-        batch_size, channels, *remaining_dims = sample.shape
-        if dtype not in (torch.float32, torch.float64):
-            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
-        # Flatten sample for doing quantile calculation along each image
-        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
-        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
-        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
-        s = torch.clamp(
-            s, min=1, max=self.config.sample_max_value
-        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
-        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
-        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
-        sample = sample.reshape(batch_size, channels, *remaining_dims)
-        sample = sample.to(dtype)
-        return sample
-    # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler._sigma_to_t
-    def _sigma_to_t(self, sigma):
-        return sigma * self.config.num_train_timesteps
-    def _sigma_to_alpha_sigma_t(self, sigma):
-        return 1 - sigma, sigma
-    # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.set_timesteps
-    def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
-        return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
-    def convert_model_output(
-        self,
-        model_output: torch.Tensor,
-        *args,
-        sample: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> torch.Tensor:
-        r"""
-        Convert the model output to the corresponding type the UniPC algorithm needs.
-        Args:
-            model_output (`torch.Tensor`):
-                The direct output from the learned diffusion model.
-            timestep (`int`):
-                The current discrete timestep in the diffusion chain.
-            sample (`torch.Tensor`):
-                A current instance of a sample created by the diffusion process.
-        Returns:
-            `torch.Tensor`:
-                The converted model output.
-        """
-        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
-        if sample is None:
-            if len(args) > 1:
-                sample = args[1]
-            else:
-                raise ValueError("missing `sample` as a required keyward argument")
-        if timestep is not None:
-            deprecate(
-                "timesteps",
-                "1.0.0",
-                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
-            )
-        sigma = self.sigmas[self.step_index]
-        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
-        if self.predict_x0:
-            if self.config.prediction_type == "flow_prediction":
-                sigma_t = self.sigmas[self.step_index]
-                x0_pred = sample - sigma_t * model_output
-            else:
-                raise ValueError(
-                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
-                    " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
-                )
-            if self.config.thresholding:
-                x0_pred = self._threshold_sample(x0_pred)
-            return x0_pred
-        else:
-            if self.config.prediction_type == "flow_prediction":
-                sigma_t = self.sigmas[self.step_index]
-                epsilon = sample - (1 - sigma_t) * model_output
-            else:
-                raise ValueError(
-                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
-                    " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
-                )
-            if self.config.thresholding:
-                sigma_t = self.sigmas[self.step_index]
-                x0_pred = sample - sigma_t * model_output
-                x0_pred = self._threshold_sample(x0_pred)
-                epsilon = model_output + x0_pred
-            return epsilon
-    def multistep_uni_p_bh_update(
-        self,
-        model_output: torch.Tensor,
-        *args,
-        sample: Optional[torch.Tensor] = None,
-        order: Optional[int] = None,
-        **kwargs,
-    ) -> torch.Tensor:
-        """
-        One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified.
-        Args:
-            model_output (`torch.Tensor`):
-                The direct output from the learned diffusion model at the current timestep.
-            prev_timestep (`int`):
-                The previous discrete timestep in the diffusion chain.
-            sample (`torch.Tensor`):
-                A current instance of a sample created by the diffusion process.
-            order (`int`):
-                The order of UniP at this timestep (corresponds to the *p* in UniPC-p).
-        Returns:
-            `torch.Tensor`:
-                The sample tensor at the previous timestep.
-        """
-        prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None)
-        if sample is None:
-            if len(args) > 1:
-                sample = args[1]
-            else:
-                raise ValueError(" missing `sample` as a required keyward argument")
-        if order is None:
-            if len(args) > 2:
-                order = args[2]
-            else:
-                raise ValueError(" missing `order` as a required keyward argument")
-        if prev_timestep is not None:
-            deprecate(
-                "prev_timestep",
-                "1.0.0",
-                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
-            )
-        model_output_list = self.model_outputs
-        s0 = self.timestep_list[-1]
-        m0 = model_output_list[-1]
-        x = sample
-        if self.solver_p:
-            x_t = self.solver_p.step(model_output, s0, x).prev_sample
-            return x_t
-        sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]  # pyright: ignore
-        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
-        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
-        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
-        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
-        h = lambda_t - lambda_s0
-        device = sample.device
-        rks = []
-        D1s = []
-        for i in range(1, order):
-            si = self.step_index - i  # pyright: ignore
-            mi = model_output_list[-(i + 1)]
-            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
-            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
-            rk = (lambda_si - lambda_s0) / h
-            rks.append(rk)
-            D1s.append((mi - m0) / rk)  # pyright: ignore
-        rks.append(1.0)
-        rks = torch.tensor(rks, device=device)
-        R = []
-        b = []
-        hh = -h if self.predict_x0 else h
-        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
-        h_phi_k = h_phi_1 / hh - 1
-        factorial_i = 1
-        if self.config.solver_type == "bh1":
-            B_h = hh
-        elif self.config.solver_type == "bh2":
-            B_h = torch.expm1(hh)
-        else:
-            raise NotImplementedError()
-        for i in range(1, order + 1):
-            R.append(torch.pow(rks, i - 1))
-            b.append(h_phi_k * factorial_i / B_h)
-            factorial_i *= i + 1
-            h_phi_k = h_phi_k / hh - 1 / factorial_i
-        R = torch.stack(R)
-        b = torch.tensor(b, device=device)
-        if len(D1s) > 0:
-            D1s = torch.stack(D1s, dim=1)  # (B, K)
-            # for order 2, we use a simplified version
-            if order == 2:
-                rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device)
-            else:
-                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1]).to(device).to(x.dtype)
-        else:
-            D1s = None
-        if self.predict_x0:
-            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
-            if D1s is not None:
-                pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)  # pyright: ignore
-            else:
-                pred_res = 0
-            x_t = x_t_ - alpha_t * B_h * pred_res
-        else:
-            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
-            if D1s is not None:
-                pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)  # pyright: ignore
-            else:
-                pred_res = 0
-            x_t = x_t_ - sigma_t * B_h * pred_res
-        x_t = x_t.to(x.dtype)
-        return x_t
-    def multistep_uni_c_bh_update(
-        self,
-        this_model_output: torch.Tensor,
-        *args,
-        last_sample: Optional[torch.Tensor] = None,
-        this_sample: Optional[torch.Tensor] = None,
-        order: Optional[int] = None,
-        **kwargs,
-    ) -> torch.Tensor:
-        """
-        One step for the UniC (B(h) version).
-        Args:
-            this_model_output (`torch.Tensor`):
-                The model outputs at `x_t`.
-            this_timestep (`int`):
-                The current timestep `t`.
-            last_sample (`torch.Tensor`):
-                The generated sample before the last predictor `x_{t-1}`.
-            this_sample (`torch.Tensor`):
-                The generated sample after the last predictor `x_{t}`.
-            order (`int`):
-                The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`.
-        Returns:
-            `torch.Tensor`:
-                The corrected sample tensor at the current timestep.
-        """
-        this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None)
-        if last_sample is None:
-            if len(args) > 1:
-                last_sample = args[1]
-            else:
-                raise ValueError(" missing`last_sample` as a required keyward argument")
-        if this_sample is None:
-            if len(args) > 2:
-                this_sample = args[2]
-            else:
-                raise ValueError(" missing`this_sample` as a required keyward argument")
-        if order is None:
-            if len(args) > 3:
-                order = args[3]
-            else:
-                raise ValueError(" missing`order` as a required keyward argument")
-        if this_timestep is not None:
-            deprecate(
-                "this_timestep",
-                "1.0.0",
-                "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
-            )
-        model_output_list = self.model_outputs
-        m0 = model_output_list[-1]
-        x = last_sample
-        x_t = this_sample
-        model_t = this_model_output
-        sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[self.step_index - 1]  # pyright: ignore
-        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
-        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
-        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
-        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
-        h = lambda_t - lambda_s0
-        device = this_sample.device
-        rks = []
-        D1s = []
-        for i in range(1, order):
-            si = self.step_index - (i + 1)  # pyright: ignore
-            mi = model_output_list[-(i + 1)]
-            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
-            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
-            rk = (lambda_si - lambda_s0) / h
-            rks.append(rk)
-            D1s.append((mi - m0) / rk)  # pyright: ignore
-        rks.append(1.0)
-        rks = torch.tensor(rks, device=device)
-        R = []
-        b = []
-        hh = -h if self.predict_x0 else h
-        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
-        h_phi_k = h_phi_1 / hh - 1
-        factorial_i = 1
-        if self.config.solver_type == "bh1":
-            B_h = hh
-        elif self.config.solver_type == "bh2":
-            B_h = torch.expm1(hh)
-        else:
-            raise NotImplementedError()
-        for i in range(1, order + 1):
-            R.append(torch.pow(rks, i - 1))
-            b.append(h_phi_k * factorial_i / B_h)
-            factorial_i *= i + 1
-            h_phi_k = h_phi_k / hh - 1 / factorial_i
-        R = torch.stack(R)
-        b = torch.tensor(b, device=device)
-        if len(D1s) > 0:
-            D1s = torch.stack(D1s, dim=1)
-        else:
-            D1s = None
-        # for order 1, we use a simplified version
-        if order == 1:
-            rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
-        else:
-            rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype)
-        if self.predict_x0:
-            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
-            if D1s is not None:
-                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
-            else:
-                corr_res = 0
-            D1_t = model_t - m0
-            x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
-        else:
-            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
-            if D1s is not None:
-                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
-            else:
-                corr_res = 0
-            D1_t = model_t - m0
-            x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
-        x_t = x_t.to(x.dtype)
-        return x_t
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
-        if schedule_timesteps is None:
-            schedule_timesteps = self.timesteps
-        indices = (schedule_timesteps == timestep).nonzero()
-        # The sigma index that is taken for the **very** first `step`
-        # is always the second index (or the last index if there is only 1)
-        # This way we can ensure we don't accidentally skip a sigma in
-        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
-        pos = 1 if len(indices) > 1 else 0
-        return indices[pos].item()
-    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
-    def _init_step_index(self, timestep):
-        """
-        Initialize the step_index counter for the scheduler.
-        """
-        if self.begin_index is None:
-            if isinstance(timestep, torch.Tensor):
-                timestep = timestep.to(self.timesteps.device)
-            self._step_index = self.index_for_timestep(timestep)
-        else:
-            self._step_index = self._begin_index
-    def step(
-        self,
-        model_output: torch.Tensor,
-        timestep: Union[int, torch.Tensor],
-        sample: torch.Tensor,
-        return_dict: bool = True,
-        generator=None,
-    ) -> Union[SchedulerOutput, Tuple]:
-        """
-        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
-        the multistep UniPC.
-        Args:
-            model_output (`torch.Tensor`):
-                The direct output from learned diffusion model.
-            timestep (`int`):
-                The current discrete timestep in the diffusion chain.
-            sample (`torch.Tensor`):
-                A current instance of a sample created by the diffusion process.
-            return_dict (`bool`):
-                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
-        Returns:
-            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
-                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
-                tuple is returned where the first element is the sample tensor.
-        """
-        if self.num_inference_steps is None:
-            raise ValueError(
-                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
-            )
-        if self.step_index is None:
-            self._init_step_index(timestep)
-        use_corrector = (
-            self.step_index > 0 and self.step_index - 1 not in self.disable_corrector and self.last_sample is not None  # pyright: ignore
-        )
-        model_output_convert = self.convert_model_output(model_output, sample=sample)
-        if use_corrector:
-            sample = self.multistep_uni_c_bh_update(
-                this_model_output=model_output_convert,
-                last_sample=self.last_sample,
-                this_sample=sample,
-                order=self.this_order,
-            )
-        for i in range(self.config.solver_order - 1):
-            self.model_outputs[i] = self.model_outputs[i + 1]
-            self.timestep_list[i] = self.timestep_list[i + 1]
-        self.model_outputs[-1] = model_output_convert
-        self.timestep_list[-1] = timestep  # pyright: ignore
-        if self.config.lower_order_final:
-            this_order = min(self.config.solver_order, len(self.timesteps) - self.step_index)  # pyright: ignore
-        else:
-            this_order = self.config.solver_order
-        self.this_order = min(this_order, self.lower_order_nums + 1)  # warmup for multistep
-        assert self.this_order > 0
-        self.last_sample = sample
-        prev_sample = self.multistep_uni_p_bh_update(
-            model_output=model_output,  # pass the original non-converted model output, in case solver-p is used
-            sample=sample,
-            order=self.this_order,
-        )
-        if self.lower_order_nums < self.config.solver_order:
-            self.lower_order_nums += 1
-        # upon completion increase step index by one
-        self._step_index += 1  # pyright: ignore
-        if not return_dict:
-            return (prev_sample,)
-        return SchedulerOutput(prev_sample=prev_sample)
-    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
-        """
-        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
-        current timestep.
-        Args:
-            sample (`torch.Tensor`):
-                The input sample.
-        Returns:
-            `torch.Tensor`:
-                A scaled input sample.
-        """
-        return sample
-    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
-    def add_noise(
-        self,
-        original_samples: torch.Tensor,
-        noise: torch.Tensor,
-        timesteps: torch.IntTensor,
-    ) -> torch.Tensor:
-        # Make sure sigmas and timesteps have the same device and dtype as original_samples
-        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
-        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
-            # mps does not support float64
-            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
-            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
-        else:
-            schedule_timesteps = self.timesteps.to(original_samples.device)
-            timesteps = timesteps.to(original_samples.device)
-        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
-        if self.begin_index is None:
-            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
-        elif self.step_index is not None:
-            # add_noise is called after first denoising step (for inpainting)
-            step_indices = [self.step_index] * timesteps.shape[0]
-        else:
-            # add noise is called before first denoising step to create initial latent(img2img)
-            step_indices = [self.begin_index] * timesteps.shape[0]
-        sigma = sigmas[step_indices].flatten()
-        while len(sigma.shape) < len(original_samples.shape):
-            sigma = sigma.unsqueeze(-1)
-        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
-        noisy_samples = alpha_t * original_samples + sigma_t * noise
-        return noisy_samples
-    def __len__(self):
-        return self.config.num_train_timesteps

assets/astronaut.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5c13fd69a91c40ce217162e1bee917c23b86fcd878301bcab11a48fcd3bfeded
+size 1020141

assets/bird.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ceddcad73d270f8a03a5bfa5ab6cc3dc74c9b1ff3db3a652151b5c26efd36e9c
+size 757207

assets/fisherman.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e67764b0104e66ec56dab3cc216f3887aef84d25f3bb7758443de1b8624c745
+size 1506256

assets/i2v_sample.jpg ADDED Viewed

Git LFS Details

SHA256: a3709a6989fc201b9c4332eadc30d34b365a41ef300f6bf5ecc1fdc40a7c8969
Pointer size: 131 Bytes
Size of remote file: 378 kB

assets/sage_compare_BF16.webp ADDED Viewed

Git LFS Details

SHA256: c6de38ff09e335e7c33e7de359418b27de30af05b63d08f0d9ec521bfb7a583f
Pointer size: 132 Bytes
Size of remote file: 6.53 MB

assets/sage_compare_Q4_K_M.webp ADDED Viewed

Git LFS Details

SHA256: 6e7625ab6be438419a421f35f297963c80e1314e9cffdbbfe2fe9438966046cf
Pointer size: 132 Bytes
Size of remote file: 6.03 MB

assets/sage_compare_Q5_K_M.webp ADDED Viewed

Git LFS Details

SHA256: d4698ccf9716113f605de3a4d2e5ccff24dddfb9b34279a0a05da416a9e701d6
Pointer size: 132 Bytes
Size of remote file: 6.83 MB

assets/sage_compare_Q8_0.webp ADDED Viewed

Git LFS Details

SHA256: e4d15be42c5aa4cff9398a6b24922d7473a9815210651072f8ea4059fe288d7f
Pointer size: 132 Bytes
Size of remote file: 6.4 MB

assets/underwater.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:184b72b8672bca924bf8f0b568488daac507eafd8dd8ddcb15f9928549309550
+size 2024562

assets/woman.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2aa2e93ccc2d333f26323a0944e4a3e9c0ee3064824ae23b245f5ab2c548a947
+size 2057707

docs/gguf-sageattention.md ADDED Viewed

	@@ -0,0 +1,132 @@

+# 🧊 GGUF + SageAttention
+> See the main [README](../README.md) for `FlowDPMSolver` and pipeline setup.
+GGUF quantized transformer weights are available at [Motif-Video-2B-GGUF](https://huggingface.co/Motif-Technologies/Motif-Video-2B-GGUF), reducing VRAM with minimal quality loss. They can be combined with [SageAttention](https://github.com/thu-ml/SageAttention) for ~2× faster attention computation (~1.6× faster end-to-end in the benchmark below).
+## GGUF Inference
+```bash
+pip install gguf
+```
+```python
+import torch
+from diffusers import (
+    AdaptiveProjectedGuidance,
+    DPMSolverMultistepScheduler,
+    GGUFQuantizationConfig,
+    MotifVideoPipeline,
+    MotifVideoTransformer3DModel,
+)
+from diffusers.utils import export_to_video
+from huggingface_hub import hf_hub_download
+guider = AdaptiveProjectedGuidance(
+    guidance_scale=8.0,
+    adaptive_projected_guidance_rescale=12.0,
+    adaptive_projected_guidance_momentum=0.1,
+    use_original_formulation=True,
+    normalization_dims="spatial",
+)
+variant = "Q4_K_M"  # Options: Q4_0, Q4_1, Q4_K_M, Q5_0, Q5_1, Q5_K_M, Q6_K, Q8_0, BF16
+ckpt_path = hf_hub_download(
+    "Motif-Technologies/Motif-Video-2B-GGUF",
+    filename=f"motifv-2b-dev-{variant}.gguf",
+)
+transformer = MotifVideoTransformer3DModel.from_single_file(
+    ckpt_path,
+    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
+    config="Motif-Technologies/Motif-Video-2B",
+    revision="diffusers-integration",
+    subfolder="transformer",
+    torch_dtype=torch.bfloat16,
+)
+pipe = MotifVideoPipeline.from_pretrained(
+    "Motif-Technologies/Motif-Video-2B",
+    revision="diffusers-integration",
+    torch_dtype=torch.bfloat16,
+    guider=guider,
+    transformer=transformer,
+)
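+# NOTE: `FlowDPMSolver` is not imported in this snippet; see the main README for how it is set up.
+pipe.scheduler = FlowDPMSolver(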
+    num_train_timesteps=pipe.scheduler.config.get("num_train_timesteps", 1000),
+    algorithm_type="dpmsolver++",
+    solver_order=2,
+    prediction_type="flow_prediction",
+    use_flow_sigmas=True,
+    flow_shift=15.0,
+)
+pipe.enable_model_cpu_offload()
+output = pipe(
+    prompt="A woman standing in a sunlit field as flower petals swirl around her in slow motion. Each petal floats gently through the golden light, casting tiny shadows. Her hair moves like water, and time seems to stand still.",
+    negative_prompt="text overlay, graphic overlay, watermark, logo, subtitles, timestamp, broadcast graphics, UI elements, random letters, frozen pose, rigid, static expression, jerky motion, mechanical motion, discontinuous motion, flat framing, depthless, dull lighting, monotone, crushed shadows, blown-out highlights, shifting background, fading background, poor continuity, identity drift, deformation, flickering, ghosting, smearing, duplication, mutated proportions, inconsistent clothing, flat colors, desaturated, tonally compressed, poor background separation, exposure shift, uneven brightness, color balance shift",
+    height=736,
+    width=1280,
+    num_frames=121,
+    num_inference_steps=50,
+    frame_rate=24,
+    use_linear_quadratic_schedule=False,
+)
+export_to_video(output.frames[0], "output.mp4", fps=24)
+```
+## SageAttention (Optional, ~1.6× faster)
+Same prompt and seed, 1280x736, 121 frames, 50 steps. Left = SDPA, Right = SageAttention.
+![BF16](../assets/sage_compare_BF16.webp)
+![Q8_0](../assets/sage_compare_Q8_0.webp)
+![Q5_K_M](../assets/sage_compare_Q5_K_M.webp)
+![Q4_K_M](../assets/sage_compare_Q4_K_M.webp)
+[SageAttention](https://github.com/thu-ml/SageAttention) accelerates attention by quantizing Q/K to INT8 and V to FP8, reducing memory bandwidth. Works with all GGUF variants.
+**Install** (build from source — PyPI only provides 1.x, but 2.x is required):
+```bash
+# Set TORCH_CUDA_ARCH_LIST to match your GPU: "8.0" for A100, "9.0" for H100/H200
+TORCH_CUDA_ARCH_LIST="9.0" pip install git+https://github.com/thu-ml/SageAttention.git --no-build-isolation
+```
+**Usage with `inference.py`:**
+```bash
+python inference.py --use-sage-attention --prompt "..."
+```
+**Notes:**
+- Requires NVIDIA GPU with SM70+
+- SM90+ (H100, H200) — FP8 kernels for maximum speedup
+- SM80-SM89 (A100, RTX 3090, RTX 4090) — FP16 kernels (still faster than SDPA)
+- SM70-SM75 (V100, RTX 2080 Ti) — FP16 kernels
+- Set `TORCH_CUDA_ARCH_LIST` to match your GPU when building (e.g., `"8.6"` for RTX 3090, `"8.9"` for RTX 4090)
+- No quality degradation observed across all GGUF variants
+## Benchmark
+Measured on NVIDIA H200, 1280x736, 121 frames, 50 steps, DPMSolver++ (order=2, flow_shift=15.0):
+| Variant | SDPA (s/it) | Sage (s/it) | Speedup | Peak alloc (GB) | Peak rsv (GB) | Total SDPA (s) | Total Sage (s) |
+|---------|------------|------------|---------|-----------------|----------------|----------------|----------------|
+| BF16    | 23.36      | 14.75      | 1.58x   | 14.78 / 15.12   | 24.93 / 24.90  | 1184           | 754            |
+| Q8_0    | 23.16      | 14.49      | 1.60x   | 13.10 / 13.44   | 23.14 / 23.11  | 1178           | 744            |
+| Q6_K    | 23.21      | 14.55      | 1.60x   | 12.62 / 12.95   | 22.72 / 22.69  | 1178           | 747            |
+| Q5_K_M  | 23.33      | 14.69      | 1.59x   | 12.39 / 12.72   | 22.45 / 22.42  | 1184           | 754            |
+| Q5_1    | 23.54      | 14.96      | 1.57x   | 12.47 / 12.81   | 22.66 / 22.62  | 1193           | 764            |
+| Q5_0    | 23.26      | 14.67      | 1.59x   | 12.37 / 12.71   | 22.55 / 22.52  | 1179           | 750            |
+| Q4_K_M  | 23.25      | 14.59      | 1.60x   | 12.19 / 12.53   | 22.22 / 22.18  | 1178           | 747            |
+| Q4_1    | 23.31      | 14.68      | 1.59x   | 12.26 / 12.60   | 22.26 / 22.22  | 1181           | 750            |
+| Q4_0    | 23.33      | 14.75      | 1.58x   | 12.14 / 12.47   | 22.18 / 22.14  | 1188           | 760            |
+Peak alloc/rsv columns show SDPA / Sage values. Sage adds ~0.3 GB alloc overhead (INT8/FP8 quantization buffers) with no change in reserved memory.
+**Key findings:**
+- **~1.59x faster with SageAttention** — consistent across all quantization levels
+- **VRAM essentially unchanged** — SageAttention overhead is negligible (~0.3 GB alloc, no change in reserved memory)
+- **GGUF + Sage stacks** — Q4_K_M + Sage achieves 14.59 s/it at 12.53 GB alloc (vs BF16 SDPA: 23.36 s/it at 14.78 GB)

docs/memory-efficient-inference.md ADDED Viewed

	@@ -0,0 +1,88 @@

+# Memory-efficient Inference
+> See the main [README](../README.md) for `FlowDPMSolver` and `guider` setup.
+By default, `pipe.to("cuda")` loads all components onto the GPU simultaneously, requiring **~30 GB VRAM**.
+For GPUs with 24 GB or less (e.g. RTX 4090, RTX 3090), use `enable_model_cpu_offload()` with the `expandable_segments` allocator setting:
+```bash
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+```
+```python
+pipe = MotifVideoPipeline.from_pretrained(
+    "Motif-Technologies/Motif-Video-2B",
+    revision="diffusers-integration",
+    torch_dtype=torch.bfloat16,
+    guider=guider,  # see the T2V example in the main README
+)
+pipe.scheduler = FlowDPMSolver(
+    num_train_timesteps=pipe.scheduler.config.get("num_train_timesteps", 1000),
+    algorithm_type="dpmsolver++",
+    solver_order=2,
+    prediction_type="flow_prediction",
+    use_flow_sigmas=True,
+    flow_shift=15.0,
+)
+pipe.enable_model_cpu_offload()  # replaces pipe.to("cuda")
+output = pipe(
+    prompt="...",
+    negative_prompt="...",
+    height=736, width=1280, num_frames=121, num_inference_steps=50,
+    frame_rate=24, use_linear_quadratic_schedule=False,
+)
+export_to_video(output.frames[0], "output.mp4", fps=24)
+```
+This moves each component (text encoder → transformer → VAE) to GPU only when needed. The `expandable_segments` setting allows the CUDA memory allocator to efficiently reuse memory released by earlier components, avoiding fragmentation-related OOM errors.
+| Mode | Peak VRAM | Speed | Recommended GPU |
+|------|-----------|-------|-----------------|
+| `pipe.to("cuda")` | ~30 GB | Fastest | A100, H100, H200 |
+| `enable_model_cpu_offload()` | ~19 GB | Similar | RTX 4090, RTX 3090 |
+## FP8 Weight Quantization (Optional)
+For further VRAM reduction, you can quantize the transformer weights to FP8 using [torchao](https://github.com/pytorch/ao):
+```bash
+pip install torchao
+```
+```python
+from torchao.quantization import quantize_, Float8WeightOnlyConfig
+pipe = MotifVideoPipeline.from_pretrained(
+    "Motif-Technologies/Motif-Video-2B",
+    revision="diffusers-integration",
+    torch_dtype=torch.bfloat16,
+    guider=guider,  # see the T2V example in the main README
+)
+pipe.scheduler = FlowDPMSolver(
+    num_train_timesteps=pipe.scheduler.config.get("num_train_timesteps", 1000),
+    algorithm_type="dpmsolver++",
+    solver_order=2,
+    prediction_type="flow_prediction",
+    use_flow_sigmas=True,
+    flow_shift=15.0,
+)
+quantize_(pipe.transformer, Float8WeightOnlyConfig())
+pipe.enable_model_cpu_offload()
+output = pipe(
+    prompt="...",
+    negative_prompt="...",
+    height=736, width=1280, num_frames=121, num_inference_steps=50,
+    frame_rate=24, use_linear_quadratic_schedule=False,
+)
+export_to_video(output.frames[0], "output.mp4", fps=24)
+```
+This stores the transformer weights in FP8 (8-bit) instead of BF16 (16-bit), reducing peak VRAM from ~19 GB to ~15 GB while keeping all computation in BF16 precision.
+| Mode | Peak VRAM | Notes |
+|------|-----------|-------|
+| `enable_model_cpu_offload()` | ~19 GB | BF16 baseline |
+| `+ Float8WeightOnlyConfig` | ~15 GB | FP8 weights, BF16 compute |

inference.py DELETED Viewed

@@ -1,119 +0,0 @@
-#!/usr/bin/env python3
-"""Motif-Video 2B — Text-to-Video & Image-to-Video inference.
-GPU requirements: ~24GB VRAM for 720p (1280x736, 121 frames).
-Tested with: torch>=2.0, diffusers>=0.35.2, transformers>=5.0.0
-Uses Adaptive Projected Guidance (APG) by default for best quality.
-"""
-import argparse
-import torch
-from diffusers import AdaptiveProjectedGuidance, DiffusionPipeline
-from diffusers.utils import export_to_video
-def parse_args():
-    parser = argparse.ArgumentParser(description="Motif-Video 2B Inference (T2V / I2V)")
-    parser.add_argument(
-        "--model-path",
-        type=str,
-        default="Motif-Technologies/Motif-Video-2B",
-        help="HuggingFace model ID or local checkpoint path (uses trust_remote_code=True)",
-    )
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        default="A category-five hurricane, viewed from inside the eye, reveals a circular stadium of cloud walls rising to fifty thousand feet with an eerie disk of blue sky directly overhead. Shot from a NOAA reconnaissance aircraft mounted camera, the perspective looks outward toward the eyewall — a near-vertical curtain of rotating cloud and lightning that is simultaneously terrifying and transcendent. The inner surface of the eyewall catches the setting sun, painting it in improbable shades of peach and rose. The camera slowly pans 360 degrees to complete one full revolution, capturing the entire coliseum of the storm. Below, the ocean surface is a white blur of foam and spray. The documentary-style cinematography strips away all artifice to present the storm as an entity of pure elemental power.",
-        help="Text prompt for video generation",
-    )
-    parser.add_argument(
-        "--image",
-        type=str,
-        default=None,
-        help="Path to input image for I2V mode (omit for T2V)",
-    )
-    parser.add_argument(
-        "--negative-prompt",
-        type=str,
-        default=None,
-        help="Negative prompt (default: built-in pipeline default)",
-    )
-    parser.add_argument("--output", type=str, default="output.mp4", help="Output video file path")
-    parser.add_argument("--num-frames", type=int, default=121, help="Number of frames to generate (121 = ~5s at 24fps)")
-    parser.add_argument("--height", type=int, default=736, help="Video height in pixels")
-    parser.add_argument("--width", type=int, default=1280, help="Video width in pixels")
-    parser.add_argument("--guidance-scale", type=float, default=8.0, help="Classifier-free guidance scale")
-    parser.add_argument("--num-inference-steps", type=int, default=50, help="Number of denoising steps")
-    parser.add_argument("--fps", type=int, default=24, help="Output video frame rate")
-    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
-    parser.add_argument(
-        "--dtype",
-        type=str,
-        default="bfloat16",
-        choices=["float16", "bfloat16", "float32"],
-        help="Model dtype",
-    )
-    return parser.parse_args()
-def main():
-    args = parse_args()
-    dtype_map = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}
-    torch_dtype = dtype_map[args.dtype]
-    mode = "I2V" if args.image else "T2V"
-    print(f"[{mode}] Loading model from: {args.model_path}")
-    guider = AdaptiveProjectedGuidance(
-        guidance_scale=args.guidance_scale,
-        adaptive_projected_guidance_rescale=12.0,
-        adaptive_projected_guidance_momentum=0.1,
-        eta=0.0,
-        use_original_formulation=True,
-    )
-    pipe = DiffusionPipeline.from_pretrained(
-        args.model_path,
-        custom_pipeline="pipeline_motif_video",
-        trust_remote_code=True,
-        torch_dtype=torch_dtype,
-        guider=guider,
-    )
-    pipe = pipe.to("cuda")
-    generator = torch.Generator(device="cuda").manual_seed(args.seed)
-    # Load image for I2V mode
-    image = None
-    if args.image:
-        from PIL import Image
-        image = Image.open(args.image).convert("RGB")
-        print(f"[I2V] Input image: {args.image} ({image.size[0]}x{image.size[1]})")
-    print(f"Generating video: {args.width}x{args.height}, {args.num_frames} frames, {args.num_inference_steps} steps")
-    pipe_kwargs = dict(
-        prompt=args.prompt,
-        image=image,
-        height=args.height,
-        width=args.width,
-        num_frames=args.num_frames,
-        num_inference_steps=args.num_inference_steps,
-        generator=generator,
-        frame_rate=args.fps,
-    )
-    if args.negative_prompt is not None:
-        pipe_kwargs["negative_prompt"] = args.negative_prompt
-    output = pipe(**pipe_kwargs)
-    video_frames = output.frames[0]
-    export_to_video(video_frames, args.output, fps=args.fps)
-    print(f"Video saved to: {args.output}")
-if __name__ == "__main__":
-    main()

model_index.json CHANGED Viewed

@@ -14,7 +14,7 @@
     "GemmaTokenizer"
   ],
   "transformer": [
-    "transformer_motif_video",
     "MotifVideoTransformer3DModel"
   ],
   "vae": [

     "GemmaTokenizer"
   ],
   "transformer": [
+    "diffusers",
     "MotifVideoTransformer3DModel"
   ],
   "vae": [

pipeline_motif_video.py DELETED Viewed

@@ -1,1388 +0,0 @@
-# Copyright 2026 Motif Technologies, Inc. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import html
-import inspect
-from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-import ftfy
-import numpy as np
-import regex as re
-import torch
-from diffusers import (
-    AdaptiveProjectedGuidance,
-    AutoencoderKLWan,
-    ClassifierFreeGuidance,
-    DiffusionPipeline,
-    DPMSolverMultistepScheduler,
-    FlowMatchEulerDiscreteScheduler,
-    SkipLayerGuidance,
-    UniPCMultistepScheduler,
-)
-from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
-from diffusers.guiders.adaptive_projected_guidance import MomentumBuffer
-from diffusers.guiders.guider_utils import GuiderOutput
-from diffusers.utils import (
-    BaseOutput,
-    is_torch_xla_available,
-    logging,
-    replace_example_docstring,
-)
-from diffusers.utils.torch_utils import randn_tensor
-from diffusers.video_processor import VideoProcessor
-from einops import rearrange
-from PIL import Image
-from torch import Tensor
-from transformers import (
-    BatchEncoding,
-    PreTrainedTokenizerBase,
-    SiglipImageProcessor,
-    T5Gemma2Encoder,
-)
-from ._fm_solvers_unipc import FlowUniPCMultistepScheduler
-if is_torch_xla_available():
-    import torch_xla.core.xla_model as xm
-    XLA_AVAILABLE = True
-else:
-    XLA_AVAILABLE = False
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import torch
-        >>> from diffusers import MotifVideoPipeline
-        >>> from diffusers.utils import export_to_video
-        >>> # Load the Motif Video pipeline
-        >>> motif_video_model_id = "MotifTechnologies/Motif-Video"
-        >>> pipe = MotifVideoPipeline.from_pretrained(motif_video_model_id, torch_dtype=torch.bfloat16)
-        >>> pipe.to("cuda")
-        >>> prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
-        >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
-        >>> video = pipe(
-        ...     prompt=prompt,
-        ...     negative_prompt=negative_prompt,
-        ...     width=640,
-        ...     height=352,
-        ...     num_frames=65,
-        ...     num_inference_steps=50,
-        ... ).frames[0]
-        >>> export_to_video(video, "output.mp4", fps=16)
-        ```
-"""
-@dataclass
-class MotifVideoPipelineOutput(BaseOutput):
-    r"""
-    Output class for Motif Video pipelines.
-    Args:
-        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
-            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
-            `(batch_size, num_frames, channels, height, width)`.
-    """
-    frames: torch.Tensor
-"""Video-aware Adaptive Projected Guidance (APG).
-Standard APG normalizes over all spatial dimensions [C, T, H, W], which collapses
-temporal variation. This module normalizes over [C, H, W] only, preserving
-per-frame independence.
-"""
-def video_normalized_guidance(
-    pred_cond: torch.Tensor,
-    pred_uncond: torch.Tensor,
-    guidance_scale: float,
-    momentum_buffer: MomentumBuffer | None = None,
-    eta: float = 1.0,
-    norm_threshold: float = 0.0,
-    use_original_formulation: bool = False,
-) -> torch.Tensor:
-    """APG with video-aware normalization: normalize over [C, H, W], exclude T.
-    For 5D input [B, C, T, H, W], dim=[-1, -2, -4] normalizes per-frame (W, H, C),
-    keeping the T dimension independent. For 4D input [B, C, H, W], falls back to
-    standard [-1, -2, -3] behavior.
-    """
-    diff = pred_cond - pred_uncond
-    if len(diff.shape) == 5:
-        # [B, C, T, H, W] → normalize over W(-1), H(-2), C(-4), skip T(-3)
-        dim = [-1, -2, -4]
-    else:
-        # [B, C, H, W] → standard behavior
-        dim = [-i for i in range(1, len(diff.shape))]
-    if momentum_buffer is not None:
-        momentum_buffer.update(diff)
-        diff = momentum_buffer.running_average
-    if norm_threshold > 0:
-        ones = torch.ones_like(diff)
-        diff_norm = diff.norm(p=2, dim=dim, keepdim=True)
-        scale_factor = torch.minimum(ones, norm_threshold / diff_norm)
-        diff = diff * scale_factor
-    v0, v1 = diff.double(), pred_cond.double()
-    v1 = torch.nn.functional.normalize(v1, dim=dim)
-    v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1
-    v0_orthogonal = v0 - v0_parallel
-    diff_parallel, diff_orthogonal = (
-        v0_parallel.type_as(diff),
-        v0_orthogonal.type_as(diff),
-    )
-    normalized_update = diff_orthogonal + eta * diff_parallel
-    pred = pred_cond if use_original_formulation else pred_uncond
-    pred = pred + guidance_scale * normalized_update
-    return pred
-class VideoAdaptiveProjectedGuidance(AdaptiveProjectedGuidance):
-    """APG variant that normalizes over [C, H, W] per frame, excluding the T dimension."""
-    def forward(self, pred_cond: torch.Tensor, pred_uncond: torch.Tensor | None = None) -> GuiderOutput:
-        pred = None
-        if not self._is_apg_enabled():
-            pred = pred_cond
-        else:
-            pred = video_normalized_guidance(
-                pred_cond,
-                pred_uncond,
-                self.guidance_scale,
-                self.momentum_buffer,
-                self.eta,
-                self.adaptive_projected_guidance_rescale,
-                self.use_original_formulation,
-            )
-        if self.guidance_rescale > 0.0:
-            from diffusers.guiders.classifier_free_guidance import rescale_noise_cfg
-            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)
-        return GuiderOutput(pred=pred, pred_cond=pred_cond, pred_uncond=pred_uncond)
-# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
-def calculate_shift(
-    image_seq_len,
-    base_seq_len: int = 256,
-    max_seq_len: int = 4096,
-    base_shift: float = 0.5,
-    max_shift: float = 1.15,
-):
-    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
-    b = base_shift - m * base_seq_len
-    mu = image_seq_len * m + b
-    return mu
-def get_linear_quadratic_sigmas(
-    num_inference_steps: int,
-    linear_quadratic_emulating_steps: int = 250,
-) -> np.ndarray:
-    """
-    Compute a linear-quadratic sigma schedule for flow matching.
-    This schedule combines:
-    - First half: Linear interpolation from high noise to medium noise (slow denoising)
-    - Second half: Quadratic interpolation from medium noise to clean (faster denoising)
-    Convention:
-    - sigma=1.0 represents pure noise
-    - sigma=0.0 represents clean image
-    - Output sigmas are in descending order (1.0 → ~0)
-    Args:
-        num_inference_steps: Total number of denoising steps (must be even).
-        linear_quadratic_emulating_steps: Controls the slope of linear interpolation.
-            Higher values result in gentler slope in the first half.
-    Returns:
-        np.ndarray: Array of sigma values with shape (num_inference_steps,).
-            The scheduler will append a terminal 0.
-    Raises:
-        ValueError: If num_inference_steps is not even.
-    Reference:
-        Linear-quadratic timestep schedule for improved flow matching inference.
-    """
-    if num_inference_steps % 2 != 0:
-        raise ValueError(
-            f"num_inference_steps must be even for linear-quadratic schedule, but got {num_inference_steps}"
-        )
-    steps = num_inference_steps
-    N = linear_quadratic_emulating_steps
-    half_steps = steps // 2
-    # First half: linear interpolation from 1 toward 0
-    # Takes first half_steps values from linspace(1, 0, N+1)
-    linear_part = np.linspace(1.0, 0.0, N + 1)[:half_steps]
-    # Second half: quadratic interpolation
-    # Formula: x^2 * (half_steps/N - 1) - (half_steps/N - 1)
-    #        = (half_steps/N - 1) * (x^2 - 1)
-    # This maps x=0 to (half_steps/N - 1) * (-1) = 1 - half_steps/N
-    # and maps x=1 to 0
-    x = np.linspace(0.0, 1.0, half_steps + 1)
-    scale_factor = half_steps / N - 1  # negative value
-    quadratic_part = x**2 * scale_factor - scale_factor
-    # Concatenate and exclude the last 0 (scheduler appends terminal 0)
-    sigmas = np.concatenate([linear_part, quadratic_part])
-    sigmas = sigmas[:-1]  # Remove trailing 0, scheduler will append it
-    return sigmas.astype(np.float32)
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    use_linear_quadratic_schedule: bool = False,
-    linear_quadratic_emulating_steps: int = 250,
-    **kwargs,
-):
-    """
-    Retrieve timesteps from the scheduler.
-    Args:
-        scheduler: The noise scheduler to use.
-        num_inference_steps: Number of denoising steps.
-        device: Device to place timesteps on.
-        timesteps: Custom timestep values (mutually exclusive with sigmas).
-        sigmas: Custom sigma values (mutually exclusive with timesteps).
-        use_linear_quadratic_schedule: If True, use linear-quadratic sigma schedule.
-            This overrides the default linear schedule. Requires num_inference_steps
-            to be even.
-        linear_quadratic_emulating_steps: Controls the linear portion slope.
-            Higher values result in gentler slope in the first half. Default: 250.
-        **kwargs: Additional arguments passed to scheduler.set_timesteps().
-    Returns:
-        Tuple of (timesteps, num_inference_steps).
-    Raises:
-        ValueError: If both timesteps and sigmas are provided, or if
-            use_linear_quadratic_schedule is True but num_inference_steps is odd.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    # Handle linear-quadratic schedule: compute sigmas if flag is set
-    if use_linear_quadratic_schedule:
-        if sigmas is not None:
-            raise ValueError(
-                "Cannot use both `sigmas` and `use_linear_quadratic_schedule`. "
-                "The linear-quadratic schedule computes sigmas automatically."
-            )
-        if num_inference_steps is None:
-            raise ValueError("`num_inference_steps` must be provided when using `use_linear_quadratic_schedule`.")
-        sigmas = get_linear_quadratic_sigmas(
-            num_inference_steps=num_inference_steps,
-            linear_quadratic_emulating_steps=linear_quadratic_emulating_steps,
-        )
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps
-def basic_clean(text):
-    text = ftfy.fix_text(text)
-    text = html.unescape(html.unescape(text))
-    return text.strip()
-def whitespace_clean(text):
-    text = re.sub(r"\s+", " ", text)
-    text = text.strip()
-    return text
-def prompt_clean(text):
-    text = whitespace_clean(basic_clean(text))
-    return text
-class MotifVideoPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for text-to-video generation using MotifVideoTransformer.
-    Args:
-        transformer ([`MotifVideoTransformer3DModel`]):
-            Conditional Transformer architecture to denoise the encoded video latents.
-        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
-            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
-        vae ([`AutoencoderKLWan`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
-        text_encoder ([`T5Gemma2Encoder`]):
-            Primary text encoder for encoding text prompts into embeddings.
-        tokenizer ([`PreTrainedTokenizerBase`]):
-            Tokenizer corresponding to the primary text encoder.
-        guider ([`ClassifierFreeGuidance`] or [`SkipLayerGuidance`] or [`AdaptiveProjectedGuidance`] or [`VideoAdaptiveProjectedGuidance`], *optional*):
-            The guidance method to use. If `None`, it defaults to `ClassifierFreeGuidance()`.
-    """
-    model_cpu_offload_seq = "text_encoder->transformer->vae"
-    _optional_components = ["feature_extractor"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
-    def __init__(
-        self,
-        scheduler: Union[
-            FlowMatchEulerDiscreteScheduler,
-            DPMSolverMultistepScheduler,
-            UniPCMultistepScheduler,
-            FlowUniPCMultistepScheduler,
-        ],
-        vae: AutoencoderKLWan,
-        text_encoder: T5Gemma2Encoder,
-        tokenizer: PreTrainedTokenizerBase,
-        transformer,
-        guider: Optional[
-            Union[
-                ClassifierFreeGuidance,
-                SkipLayerGuidance,
-                AdaptiveProjectedGuidance,
-                VideoAdaptiveProjectedGuidance,
-            ]
-        ] = None,
-        feature_extractor: Optional[SiglipImageProcessor] = None,
-    ):
-        super().__init__()
-        self.guider = ClassifierFreeGuidance() if guider is None else guider
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            transformer=transformer,
-            scheduler=scheduler,
-            feature_extractor=feature_extractor,
-        )
-        self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
-        self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
-        self.transformer_spatial_patch_size = (
-            self.transformer.config.patch_size if getattr(self, "transformer", None) is not None else 2
-        )
-        self.transformer_temporal_patch_size = (
-            self.transformer.config.patch_size_t if getattr(self, "transformer") is not None else 1
-        )
-        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
-        self.tokenizer_max_length = (
-            self.tokenizer.model_max_length if getattr(self, "tokenizer", None) is not None else 512
-        )
-    def _get_default_embeds(
-        self,
-        text_encoder,
-        tokenizer: PreTrainedTokenizerBase,
-        prompt: Union[str, List[str]],
-        max_sequence_length: int = 512,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        dtype = dtype or text_encoder.dtype
-        text_inputs = tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=max_sequence_length,
-            truncation=True,
-            add_special_tokens=True,
-            return_attention_mask=True,
-            return_tensors="pt",
-        )
-        text_inputs = BatchEncoding(
-            {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in text_inputs.items()}
-        )
-        prompt_embeds = text_encoder(**text_inputs)[0]
-        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
-        return prompt_embeds, text_inputs.attention_mask
-    def _average_pool(self, last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
-        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
-        denom = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)  # avoid div by zero
-        return last_hidden.sum(dim=1) / denom
-    def _get_prompt_embeds(
-        self,
-        text_encoder: T5Gemma2Encoder,
-        tokenizer: PreTrainedTokenizerBase,
-        prompt: Union[str, List[str]] | None = None,
-        num_videos_per_prompt: int = 1,
-        max_sequence_length: int = 512,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        device = device or self._execution_device
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        prompt_embeds_kwargs = {
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "prompt": prompt,
-            "max_sequence_length": max_sequence_length,
-            "device": device,
-            "dtype": dtype,
-        }
-        # When enable_model_cpu_offload() is active, the accelerate forward hook is on text_encoder (parent). Moving the encoder to the execution device explicitly ensures inputs and
-        # weights are on the same device. The parent's offload hook will move text_encoder back to CPU after
-        # the next component claims the GPU.
-        if next(text_encoder.parameters()).device != torch.device(device):
-            text_encoder.to(device)
-        prompt_embeds, prompt_attention_mask = self._get_default_embeds(**prompt_embeds_kwargs)
-        pooled_prompt_embeds = self._average_pool(prompt_embeds, prompt_attention_mask)
-        return prompt_embeds, prompt_attention_mask, pooled_prompt_embeds
-    # Keep encode_prompt structure, uses _get_prompt_embeds internally
-    def encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        num_videos_per_prompt: int = 1,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        pooled_prompt_embeds: Optional[torch.Tensor] = None,
-        prompt_attention_mask: Optional[torch.Tensor] = None,
-        max_sequence_length: int = 512,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ) -> Tuple[
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-    ]:
-        device = device or self._execution_device
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        if prompt is not None:
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-        prompt_embeds_kwargs = {
-            "device": device,
-            "dtype": dtype,
-        }
-        if prompt_embeds is None:
-            prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = self._get_prompt_embeds(
-                text_encoder=self.text_encoder,
-                tokenizer=self.tokenizer,
-                prompt=prompt,
-                max_sequence_length=max_sequence_length,
-                **prompt_embeds_kwargs,
-            )
-        # Compute actual (non-padding) token count for batch=1 Flash Attention trimming in __call__
-        actual_seq_len = None
-        if batch_size == 1 and prompt_attention_mask is not None:
-            actual_seq_len = int(prompt_attention_mask.sum(dim=-1).max().item())
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        seq_len = prompt_embeds.shape[1]
-        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
-        if pooled_prompt_embeds is not None:
-            pooled_prompt_embeds = pooled_prompt_embeds.repeat_interleave(num_videos_per_prompt, dim=0)
-        if prompt_attention_mask is not None:
-            prompt_attention_mask = prompt_attention_mask.bool()
-            prompt_attention_mask = prompt_attention_mask.view(batch_size, -1)
-            prompt_attention_mask = prompt_attention_mask.repeat_interleave(num_videos_per_prompt, dim=0)
-        return (
-            prompt_embeds,
-            pooled_prompt_embeds,
-            prompt_attention_mask,
-            actual_seq_len,
-        )
-    @property
-    def vision_encoder(self):
-        """Get the vision encoder from T5Gemma2.
-        T5Gemma2 has vision_tower.vision_model structure.
-        Will raise AttributeError if not available.
-        """
-        return self.text_encoder.vision_tower.vision_model
-    def encode_image(
-        self,
-        image: Image.Image,
-        batch_size: int = 1,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ) -> torch.Tensor:
-        """Encode image to embeddings using SigLIP vision encoder."""
-        device = device or self._execution_device
-        dtype = dtype or self.transformer.dtype
-        image_embeds = self._get_image_embeds(
-            image_encoder=self.vision_encoder,
-            feature_extractor=self.feature_extractor,
-            image=image,
-            device=device,
-        )
-        image_embeds = image_embeds.repeat(batch_size, 1, 1)
-        return image_embeds.to(device=device, dtype=dtype)
-    @staticmethod
-    def _get_image_embeds(
-        image_encoder,
-        feature_extractor: SiglipImageProcessor,
-        image,
-        device: torch.device,
-    ) -> torch.Tensor:
-        """Helper to encode single image with SigLIP.
-        Args:
-            image_encoder: The SigLIP vision encoder model.
-            feature_extractor: SiglipImageProcessor for preprocessing.
-            image: Can be either:
-                - PIL.Image.Image: Will be preprocessed by feature_extractor
-                - torch.Tensor: Assumed to be in [0, 1] range, will be normalized and passed to encoder
-            device: Device to place tensors on.
-        Returns:
-            Image embeddings from the vision encoder.
-        """
-        image_encoder_dtype = next(image_encoder.parameters()).dtype
-        if isinstance(image, torch.Tensor):
-            image = feature_extractor.preprocess(
-                images=image.float(),
-                do_resize=True,
-                do_rescale=False,
-                do_normalize=True,
-                do_convert_rgb=True,
-                return_tensors="pt",
-            )
-        else:
-            image = feature_extractor.preprocess(
-                images=image,
-                do_resize=True,
-                do_rescale=False,
-                do_normalize=True,
-                do_convert_rgb=True,
-                return_tensors="pt",
-            )
-        image = image.to(device, dtype=image_encoder_dtype)
-        return image_encoder(**image).last_hidden_state
-    @torch.compiler.disable
-    def _prepare_first_frame_conditioning(
-        self,
-        video: torch.Tensor,
-        latents: torch.Tensor,
-        use_conditioning: bool,
-        generator: Optional[torch.Generator] = None,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
-        """Prepare first frame conditioning tensors.
-        This method implements batch-level conditioning where entire
-        batches are either I2V (all samples conditioned) or T2V (no conditioning). This
-        prevents mode confusion within batches.
-        For I2V mode:
-        1. Extract and VAE-encode first frame from video
-        2. Create latent_condition by repeating first frame across time (frame 0 only)
-        3. Create latent_mask with 1.0 at frame 0
-        4. Get image_embeds from vision encoder
-        For T2V mode:
-        1. Pad with zeros for latent_condition and latent_mask
-        Args:
-            video: Input video tensor [batch_size, frames, channels, height, width] in [-1, 1]
-            latents: Latents [batch_size, lantent_channels, latent_num_frames, latent_height, latent_width]
-            use_conditioning: Whether to use first-frame conditioning (True for I2V, False for T2V)
-            generator: Optional random number generator for reproducibility
-        Returns:
-            Tuple of (latent_condition, latent_mask, image_embeds).
-            - latent_condition: [B, C, F, H, W] conditioning signal (zeros for T2V)
-            - latent_mask: [B, 1, F, H, W] binary mask (zeros for T2V)
-            - image_embeds: [B, N, D] image embeddings from vision encoder or None for T2V
-        """
-        batch_size, lantent_channels, latent_num_frames, latent_height, latent_width = latents.shape
-        device = latents.device
-        dtype = latents.dtype
-        # Determine if we should use conditioning
-        use_conditioning = use_conditioning and (latent_num_frames > 1)
-        # Initialize conditioning tensors
-        latent_condition = torch.zeros(
-            batch_size,
-            lantent_channels,
-            latent_num_frames,
-            latent_height,
-            latent_width,
-            device=device,
-            dtype=dtype,
-        )
-        latent_mask = torch.zeros(
-            batch_size,
-            1,
-            latent_num_frames,
-            latent_height,
-            latent_width,
-            device=device,
-            dtype=dtype,
-        )
-        image_embeds = None
-        if use_conditioning:
-            with torch.no_grad():
-                # Encode first frame for latent_condition
-                first_frame_latents = self.vae.encode(
-                    rearrange(video[:, 0:1], "b f c h w -> b c f h w")
-                ).latent_dist.sample(generator=generator)
-            first_frame_latents = self._normalize_latents(
-                latents=first_frame_latents,
-                latents_mean=self.vae.config.latents_mean,
-                latents_std=self.vae.config.latents_std,
-            )
-            # Create latent_condition by repeating first frame across time
-            latent_condition = first_frame_latents.repeat(1, 1, latent_num_frames, 1, 1)
-            latent_condition[:, :, 1:, :, :] = 0
-            # latent_mask: 1.0 at frame 0, 0.0 elsewhere
-            latent_mask[:, :, 0] = 1.0
-            # image_embeds from vision encoder
-            first_frame_vision = video[:, 0]  # [B, C, H, W]
-            first_frame_vision = ((first_frame_vision + 1) / 2).clamp(0, 1)
-            with torch.no_grad():
-                image_embeds = self._get_image_embeds(
-                    image_encoder=self.vision_encoder,
-                    feature_extractor=self.feature_extractor,
-                    image=first_frame_vision,
-                    device=device,
-                )
-        return latent_condition, latent_mask, image_embeds
-    def check_inputs(
-        self,
-        prompt,
-        negative_prompt,
-        height,
-        width,
-        batch_size,
-        callback_on_step_end_tensor_inputs=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-        prompt_attention_mask=None,
-        negative_prompt_attention_mask=None,
-    ):
-        """Validate the call arguments and raise `ValueError` on the first violation found.
-        Checks performed, in order:
-        1. `height` and `width` are divisible by `vae_scale_factor_spatial * transformer_spatial_patch_size`.
-        2. Every name in `callback_on_step_end_tensor_inputs` is listed in `self._callback_tensor_inputs`.
-        3. Exactly one of `prompt` / `prompt_embeds` is given, and `prompt` is a `str` or a `list`.
-        4. `negative_prompt` is `None`, a `str`, or a `list` whose length equals `batch_size`.
-        5. Each embeddings tensor is accompanied by its attention mask.
-        6. When both `prompt_embeds` and `negative_prompt_embeds` are given, the two embeddings shapes match
-           and the two attention-mask shapes match.
-        Args:
-            prompt: Positive prompt (`str`, `list`, or `None`).
-            negative_prompt: Negative prompt (`str`, `list`, or `None`).
-            height: Requested output height in pixels.
-            width: Requested output width in pixels.
-            batch_size: Number of prompts; only used to validate the length of a `negative_prompt` list.
-            callback_on_step_end_tensor_inputs: Names of tensors requested by the step-end callback.
-            prompt_embeds: Optional pre-computed positive text embeddings.
-            negative_prompt_embeds: Optional pre-computed negative text embeddings.
-            prompt_attention_mask: Attention mask belonging to `prompt_embeds`.
-            negative_prompt_attention_mask: Attention mask belonging to `negative_prompt_embeds`.
-        Raises:
-            ValueError: If any of the checks listed above fails.
-        """
-        # Resolution must be divisible by VAE scale factor * transformer patch size
-        # (e.g. 8 * 2 = 16 for default config) to avoid latent/patch dimension mismatch.
-        spatial_divisor = self.vae_scale_factor_spatial * self.transformer_spatial_patch_size
-        if height % spatial_divisor != 0 or width % spatial_divisor != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by {spatial_divisor} "
-                f"(vae_scale={self.vae_scale_factor_spatial} * patch_size={self.transformer_spatial_patch_size}) "
-                f"but are {height} and {width}."
-            )
-        # Callbacks may only request tensors the pipeline declares in `_callback_tensor_inputs`
-        if callback_on_step_end_tensor_inputs is not None and not all(
-            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
-        ):
-            raise ValueError(
-                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
-            )
-        # `prompt` and `prompt_embeds` are mutually exclusive, and one of them is required
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-        # Validate negative_prompt: must be None, str, or list with matching batch_size
-        if negative_prompt is not None:
-            if not isinstance(negative_prompt, (str, list)):
-                raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
-            if isinstance(negative_prompt, list) and len(negative_prompt) != batch_size:
-                raise ValueError(
-                    f"`negative_prompt` list length ({len(negative_prompt)}) must match batch_size ({batch_size})."
-                )
-        # Pre-computed embeddings are only accepted together with their attention masks
-        if prompt_embeds is not None and prompt_attention_mask is None:
-            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
-        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
-            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
-        # Both masks are guaranteed non-None here by the two checks above, so `.shape` is safe
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
-                raise ValueError(
-                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
-                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
-                    f" {negative_prompt_attention_mask.shape}."
-                )
-    def _prepare_negative_prompt(
-        self,
-        negative_prompt: Optional[Union[str, List[str]]],
-        batch_size: int,
-    ) -> List[str]:
-        """
-        Expand `negative_prompt` into a list with one entry per prompt in the batch.
-        Args:
-            negative_prompt: None, a single string, or a list of strings.
-            batch_size: The number of prompts in the batch.
-        Returns:
-            `[""] * batch_size` for `None`, the string repeated `batch_size` times for a `str`, and the
-            list itself (same object, not copied and not length-checked here) for a `list`.
-        """
-        if negative_prompt is None:
-            return [""] * batch_size
-        if isinstance(negative_prompt, str):
-            return [negative_prompt] * batch_size
-        # A list is passed through as-is; its length is validated separately in `check_inputs`
-        return negative_prompt
-    @staticmethod
-    def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor:
-        """Rearrange a 5-D latent tensor into a sequence of flattened patches.
-        Args:
-            latents: Tensor unpacked here as [batch, channels, frames, height, width].
-            patch_size: Spatial patch edge length, applied to both height and width.
-            patch_size_t: Temporal patch length.
-        Returns:
-            A 3-D tensor [batch, F' * H' * W', features] with F' = frames // patch_size_t,
-            H' = height // patch_size and W' = width // patch_size. When every axis is an exact multiple
-            of its patch size, features = channels * patch_size_t * patch_size * patch_size.
-        """
-        # NOTE(review): `num_channels` is unpacked but never used below (the channel axis is inferred via -1)
-        batch_size, num_channels, num_frames, height, width = latents.shape
-        # Patch counts per axis; floor division, so a remainder is silently dropped from the count
-        post_patch_num_frames = num_frames // patch_size_t
-        post_patch_height = height // patch_size
-        post_patch_width = width // patch_size
-        # Split each of F/H/W into (patch count, patch extent): [B, C, F', pt, H', p, W', p]
-        latents = latents.reshape(
-            batch_size,
-            -1,
-            post_patch_num_frames,
-            patch_size_t,
-            post_patch_height,
-            patch_size,
-            post_patch_width,
-            patch_size,
-        )
-        # Reorder to [B, F', H', W', C, pt, p, p], merge the last four axes into one feature axis,
-        # then merge the three patch-count axes into one sequence axis.
-        latents = latents.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3)
-        return latents
-    @staticmethod
-    def _unpack_latents(
-        latents: torch.Tensor,
-        num_frames: int,
-        height: int,
-        width: int,
-        patch_size: int = 1,
-        patch_size_t: int = 1,
-    ) -> torch.Tensor:
-        """Turn a packed patch sequence back into a 5-D latent tensor.
-        Args:
-            latents: Packed tensor whose first axis is the batch; the remaining elements must be
-                reshapeable to [num_frames, height, width, -1, patch_size_t, patch_size, patch_size].
-            num_frames: Number of temporal patches (patch count, not the unpacked frame count).
-            height: Number of patches along the height axis.
-            width: Number of patches along the width axis.
-            patch_size: Spatial patch edge length.
-            patch_size_t: Temporal patch length.
-        Returns:
-            Tensor of shape [batch, channels, num_frames * patch_size_t, height * patch_size, width * patch_size].
-        """
-        batch_size = latents.size(0)
-        # Expose the patch grid and per-patch contents: [B, F', H', W', C, pt, p, p]
-        latents = latents.reshape(
-            batch_size,
-            num_frames,
-            height,
-            width,
-            -1,
-            patch_size_t,
-            patch_size,
-            patch_size,
-        )
-        # Reorder to [B, C, F', pt, H', p, W', p], then fuse each (count, extent) pair back into one axis,
-        # working from the last pair (width) to the first (frames) so earlier axis indices stay valid.
-        latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3)
-        return latents
-    @staticmethod
-    def _normalize_latents(
-        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor
-    ) -> torch.Tensor:
-        """Return `(latents - latents_mean) / latents_std` with per-channel statistics.
-        Args:
-            latents: Tensor laid out as [B, C, F, H, W].
-            latents_mean: Per-channel means; reshaped to [1, C, 1, 1, 1] before use.
-            latents_std: Per-channel standard deviations; reshaped to [1, C, 1, 1, 1] before use.
-        Returns:
-            The normalized latents, same shape as `latents`.
-        """
-        # NOTE(review): annotated as torch.Tensor, but the values are passed through `torch.tensor(...)`;
-        # callers in this file pass `self.vae.config.latents_mean` / `latents_std` — presumably plain lists, confirm.
-        # Normalize latents across the channel dimension [B, C, F, H, W]
-        latents_mean = torch.tensor(latents_mean).view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
-        latents_std = torch.tensor(latents_std).view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
-        latents = (latents - latents_mean) / latents_std
-        return latents
-    @staticmethod
-    def _denormalize_latents(
-        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor
-    ) -> torch.Tensor:
-        """Return `latents * latents_std + latents_mean` with per-channel statistics.
-        Args:
-            latents: Tensor laid out as [B, C, F, H, W].
-            latents_mean: Per-channel means; reshaped to [1, C, 1, 1, 1] before use.
-            latents_std: Per-channel standard deviations; reshaped to [1, C, 1, 1, 1] before use.
-        Returns:
-            The denormalized latents, same shape as `latents`.
-        """
-        # NOTE(review): annotated as torch.Tensor, but the values are passed through `torch.tensor(...)`;
-        # the caller in this file passes `self.vae.config.latents_mean` / `latents_std` — presumably plain lists, confirm.
-        # Denormalize latents across the channel dimension [B, C, F, H, W]
-        latents_mean = torch.tensor(latents_mean).view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
-        latents_std = torch.tensor(latents_std).view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
-        latents = latents * latents_std + latents_mean
-        return latents
-    def prepare_latents(
-        self,
-        batch_size: int = 1,
-        num_channels_latents: int = 16,
-        height: int = 352,
-        width: int = 640,
-        num_frames: int = 65,
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[torch.device] = None,
-        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Return the initial latents for denoising: the supplied ones, or freshly sampled noise.
-        Args:
-            batch_size: Number of samples to create.
-            num_channels_latents: Size of the latent channel axis.
-            height: Output height in pixels; divided by `vae_scale_factor_spatial`.
-            width: Output width in pixels; divided by `vae_scale_factor_spatial`.
-            num_frames: Output frame count; mapped to `(num_frames - 1) // vae_scale_factor_temporal + 1`.
-            dtype: Target dtype of the returned tensor.
-            device: Target device of the returned tensor.
-            generator: Generator, or list of generators (one per sample), forwarded to `randn_tensor`.
-            latents: Optional pre-made latents; when given they are only moved to `device`/`dtype`.
-        Returns:
-            Tensor of shape [batch_size, num_channels_latents, latent_frames, latent_height, latent_width]
-            when sampled here; supplied `latents` keep whatever shape they came with.
-        Raises:
-            ValueError: If `generator` is a list whose length differs from `batch_size`.
-        """
-        # User-supplied latents bypass both the shape computation and the generator check below
-        if latents is not None:
-            return latents.to(device=device, dtype=dtype)
-        shape = (
-            batch_size,
-            num_channels_latents,
-            (num_frames - 1) // self.vae_scale_factor_temporal + 1,
-            height // self.vae_scale_factor_spatial,
-            width // self.vae_scale_factor_spatial,
-        )
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        return latents
-    @property
-    def num_timesteps(self):
-        """Value of `self._num_timesteps`; `__call__` sets it to `len(timesteps)` before the denoising loop."""
-        return self._num_timesteps
-    @property
-    def current_timestep(self):
-        """Value of `self._current_timestep`; `__call__` sets it per step and resets it to `None` after the loop."""
-        return self._current_timestep
-    @property
-    def attention_kwargs(self):
-        """Value of `self._attention_kwargs`, i.e. the `attention_kwargs` argument stored by `__call__`."""
-        return self._attention_kwargs
-    @property
-    def interrupt(self):
-        """Value of `self._interrupt`; `__call__` resets it to `False` and skips denoising steps while it is truthy."""
-        return self._interrupt
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] | None = None,
-        image=None,
-        negative_prompt: Optional[
-            Union[str, List[str]]
-        ] = "text overlay, graphic overlay, watermark, logo, subtitles, timestamp, broadcast graphics, UI elements, random letters, frozen pose, rigid, static expression, jerky motion, mechanical motion, discontinuous motion, flat framing, depthless, dull lighting, monotone, crushed shadows, blown-out highlights, shifting background, fading background, poor continuity, identity drift, deformation, flickering, ghosting, smearing, duplication, mutated proportions, inconsistent clothing, flat colors, desaturated, tonally compressed, poor background separation, exposure shift, uneven brightness, color balance shift",
-        height: int = 736,
-        width: int = 1280,
-        num_frames: int = 121,
-        frame_rate: int = 24,
-        num_inference_steps: int = 50,
-        timesteps: List[int] | None = None,
-        use_linear_quadratic_schedule: bool = False,
-        linear_quadratic_emulating_steps: int = 250,
-        num_videos_per_prompt: Optional[int] = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        pooled_prompt_embeds: Optional[torch.Tensor] = None,
-        prompt_attention_mask: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        attention_kwargs: Optional[Dict[str, Any]] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        max_sequence_length: int = 512,
-        use_attention_mask: bool = True,
-        vae_batch_size: int | None = None,
-    ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the video generation. If not defined, one has to pass
-                `prompt_embeds` instead.
-            image (`PIL.Image.Image` or `torch.Tensor`, *optional*):
-                First-frame conditioning input. When it is not `None`, first-frame conditioning is requested.
-                A PIL image is converted to RGB, resized to `(width, height)` and rescaled to `[-1, 1]`.
-                A tensor is used as provided (a 3-D tensor gains a batch axis, a 4-D tensor gains a frame axis)
-                and is expected to already be in `[-1, 1]`.
-            negative_prompt (`str` or `List[str]`, *optional*, defaults to a built-in quality prompt):
-                The prompt or prompts not to guide the video generation. Only encoded when the guider is
-                enabled; `None` is replaced by empty strings. A list must have one entry per prompt.
-            height (`int`, defaults to `736`):
-                The height in pixels of the generated video. Must be divisible by the VAE spatial scale factor
-                times the transformer spatial patch size.
-            width (`int`, defaults to `1280`):
-                The width in pixels of the generated video. Same divisibility requirement as `height`.
-            num_frames (`int`, defaults to `121`):
-                The number of video frames to generate.
-            frame_rate (`int`, defaults to `24`):
-                Frame rate for the output video. Not referenced in the body of this method.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
-                expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process.
-            use_linear_quadratic_schedule (`bool`, defaults to `False`):
-                Whether to use a linear-quadratic sigma schedule instead of the default linear schedule.
-                This schedule combines linear interpolation in the first half (slow denoising at high noise)
-                with quadratic interpolation in the second half (faster denoising toward clean image).
-                Requires `num_inference_steps` to be even. Not forwarded when the scheduler is a
-                DPMSolver/UniPC multistep scheduler.
-            linear_quadratic_emulating_steps (`int`, defaults to `250`):
-                Controls the slope of linear interpolation in the first half of the linear-quadratic schedule.
-                Higher values result in a gentler slope. Only used when `use_linear_quadratic_schedule=True`.
-            num_videos_per_prompt (`int`, *optional*, defaults to 1):
-                The number of videos to generate per prompt.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                PyTorch Generator object(s) for deterministic generation.
-            latents (`torch.Tensor`, *optional*):
-                Pre-generated noisy latents. When given they are only moved to the execution device and the
-                transformer dtype.
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Requires `prompt_attention_mask`.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
-                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            prompt_attention_mask (`torch.Tensor`, *optional*):
-                Pre-generated attention mask for text embeddings.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Requires `negative_prompt_attention_mask`.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
-                input argument.
-            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
-                Pre-generated attention mask for negative text embeddings.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format. `"latent"` returns the final latents without VAE decoding; any other value
-                is forwarded to the video processor (e.g. "pil" or "np").
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether to return a `MotifVideoPipelineOutput`.
-            attention_kwargs (`dict`, *optional*):
-                Arguments passed to the attention processor.
-            callback_on_step_end (`Callable`, *optional*):
-                Callback invoked after every denoising step as `callback(pipeline, step_index, timestep,
-                callback_kwargs)`. `latents`, `prompt_embeds` and `negative_prompt_embeds` found in the
-                returned dict replace the current values.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                Tensors to include in the callback.
-            max_sequence_length (`int` defaults to `512`):
-                Maximum sequence length for the tokenizer.
-            use_attention_mask (`bool`, defaults to `True`):
-                Whether to forward the text attention masks to the transformer when a mask is available.
-            vae_batch_size (`int`, *optional*):
-                If set and smaller than the number of latents, the VAE decodes the latents in chunks of this
-                size along the batch axis and the decoded chunks are concatenated.
-        Examples:
-        Returns:
-            [`~pipelines.motif_video.MotifVideoPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, returns [`~pipelines.motif_video.MotifVideoPipelineOutput`],
-                otherwise returns a tuple where the first element is a list of generated video frames.
-        """
-        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
-            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
-        # 1. Define call parameters (batch_size needed for check_inputs)
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            # NOTE(review): this runs before `check_inputs`, so passing neither `prompt` nor `prompt_embeds`
-            # fails here with an AttributeError instead of the ValueError raised by `check_inputs`.
-            batch_size = prompt_embeds.shape[0]
-        # 2. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            height=height,
-            width=width,
-            batch_size=batch_size,
-            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            prompt_attention_mask=prompt_attention_mask,
-            negative_prompt_attention_mask=negative_prompt_attention_mask,
-        )
-        self._attention_kwargs = attention_kwargs
-        self._interrupt = False
-        self._current_timestep = None
-        # Auto-upgrade AdaptiveProjectedGuidance to VideoAdaptiveProjectedGuidance
-        # for video generation. Video-aware APG normalizes per-frame [C,H,W] instead
-        # of collapsing the temporal axis, preserving motion quality.
-        # The exact-type check (`type(...) is`) leaves subclasses untouched; the replacement persists on the pipeline.
-        if type(self.guider) is AdaptiveProjectedGuidance:
-            self.guider = VideoAdaptiveProjectedGuidance(
-                guidance_scale=self.guider.guidance_scale,
-                adaptive_projected_guidance_rescale=self.guider.adaptive_projected_guidance_rescale,
-                adaptive_projected_guidance_momentum=self.guider.adaptive_projected_guidance_momentum,
-                eta=self.guider.eta,
-                use_original_formulation=self.guider.use_original_formulation,
-            )
-        device = self._execution_device
-        # 3. Prepare text embeddings
-        prompt_embeds, pooled_prompt_embeds, prompt_attention_mask, pos_actual_len = self.encode_prompt(
-            prompt=prompt,
-            num_videos_per_prompt=num_videos_per_prompt,
-            prompt_embeds=prompt_embeds,
-            pooled_prompt_embeds=pooled_prompt_embeds,
-            prompt_attention_mask=prompt_attention_mask,
-            max_sequence_length=max_sequence_length,
-            device=device,
-        )
-        # Without guidance only the positive prompt is used: trim it to its actual length and drop the mask
-        if not self.guider._enabled and pos_actual_len is not None:
-            prompt_embeds = prompt_embeds[:, :pos_actual_len, :]
-            prompt_attention_mask = None
-        if self.guider._enabled:
-            negative_prompt = self._prepare_negative_prompt(negative_prompt, batch_size)
-            (
-                negative_prompt_embeds,
-                negative_pooled_prompt_embeds,
-                negative_prompt_attention_mask,
-                neg_actual_len,
-            ) = self.encode_prompt(
-                prompt=negative_prompt,
-                num_videos_per_prompt=num_videos_per_prompt,
-                prompt_embeds=negative_prompt_embeds,
-                pooled_prompt_embeds=negative_pooled_prompt_embeds,
-                prompt_attention_mask=negative_prompt_attention_mask,
-                max_sequence_length=max_sequence_length,
-                device=device,
-            )
-            # Trim each to its own actual length: the guider runs pos/neg in separate loop iterations,
-            # so different sequence lengths are fine. Both masks are dropped once the embeddings are trimmed.
-            if pos_actual_len is not None and neg_actual_len is not None:
-                prompt_embeds = prompt_embeds[:, :pos_actual_len, :]
-                negative_prompt_embeds = negative_prompt_embeds[:, :neg_actual_len, :]
-                prompt_attention_mask = None
-                negative_prompt_attention_mask = None
-        # 4. Prepare latent variables
-        num_channels_latents = self.vae.config.z_dim
-        latents = self.prepare_latents(
-            batch_size * num_videos_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            num_frames,
-            self.transformer.dtype,
-            device,
-            generator,
-            latents,
-        )
-        # 4.5 Preprocess image for I2V conditioning
-        if image is not None:
-            from PIL import Image as PILImage
-            if isinstance(image, PILImage.Image):
-                image = image.convert("RGB").resize((width, height), PILImage.LANCZOS)
-                image = torch.from_numpy(np.array(image)).permute(2, 0, 1).float() / 255.0
-                image = image * 2.0 - 1.0  # [0,1] -> [-1,1]
-                image = image.unsqueeze(0)  # [1, C, H, W]
-            # Handle [C, H, W] -> [1, C, H, W]
-            if image.dim() == 3:
-                image = image.unsqueeze(0)
-            # [B, C, H, W] -> [B, 1, C, H, W] for video format
-            if image.dim() == 4:
-                image = image.unsqueeze(1)
-            image = image.to(device=device, dtype=self.vae.dtype)
-        # 5. Prepare timesteps (including mu calculation)
-        # Recalculate latent dims based on VAE for mu calculation
-        latent_height = height // self.vae_scale_factor_spatial
-        latent_width = width // self.vae_scale_factor_spatial
-        latent_num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
-        # Calculate sequence length based on *packed* dimensions if transformer uses packing
-        # Packed dims: H/patch, W/patch, F/patch_t
-        packed_latent_height = latent_height // self.transformer_spatial_patch_size
-        packed_latent_width = latent_width // self.transformer_spatial_patch_size
-        packed_latent_num_frames = latent_num_frames // self.transformer_temporal_patch_size
-        video_sequence_length = packed_latent_num_frames * packed_latent_height * packed_latent_width
-        # Compute sigmas: use linear-quadratic schedule if enabled, otherwise default linear
-        _is_flow_multistep = isinstance(
-            self.scheduler,
-            (
-                DPMSolverMultistepScheduler,
-                UniPCMultistepScheduler,
-                FlowUniPCMultistepScheduler,
-            ),
-        )
-        # Compute mu once, shared by both branches (required by FlowUniPCMultistepScheduler)
-        mu = calculate_shift(
-            video_sequence_length,
-            self.scheduler.config.get("base_image_seq_len", 256),
-            self.scheduler.config.get("max_image_seq_len", 4096),
-            self.scheduler.config.get("base_shift", 0.5),
-            self.scheduler.config.get("max_shift", 1.15),
-        )
-        if _is_flow_multistep:
-            # DPMSolver/UniPC manage their own sigma schedule via use_flow_sigmas + flow_shift.
-            # Pass mu for dynamic shifting support (required by FlowUniPCMultistepScheduler).
-            timesteps, num_inference_steps = retrieve_timesteps(
-                self.scheduler,
-                num_inference_steps,
-                device,
-                timesteps,
-                mu=mu,
-            )
-        else:
-            if use_linear_quadratic_schedule:
-                # Linear-quadratic schedule computes sigmas internally in retrieve_timesteps
-                sigmas = None
-            else:
-                sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
-            timesteps, num_inference_steps = retrieve_timesteps(
-                self.scheduler,
-                num_inference_steps,
-                device,
-                timesteps,
-                sigmas=sigmas,
-                use_linear_quadratic_schedule=use_linear_quadratic_schedule,
-                linear_quadratic_emulating_steps=linear_quadratic_emulating_steps,
-                mu=mu,
-            )
-        # Get conditioning tensors
-        latent_condition, latent_mask, image_embeds = self._prepare_first_frame_conditioning(
-            image,
-            latents,
-            use_conditioning=image is not None,
-            generator=generator,
-        )
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-        self._num_timesteps = len(timesteps)
-        # 6. Denoising loop
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                # Once interrupted, every remaining step is skipped (the loop itself still runs to the end)
-                if self.interrupt:
-                    continue
-                self._current_timestep = t
-                # Concatenate current latents with conditioning for this timestep
-                # [latents | latent_condition | latent_mask]
-                hidden_states = torch.cat([latents, latent_condition, latent_mask], dim=1)
-                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-                timestep = t.expand(latents.shape[0])
-                # Step 1: Collect model inputs needed for the guidance method
-                # conditional inputs should always be first element in the tuple
-                guider_inputs = {
-                    "encoder_hidden_states": (prompt_embeds, negative_prompt_embeds),
-                }
-                if use_attention_mask and prompt_attention_mask is not None:
-                    guider_inputs["encoder_attention_mask"] = (
-                        prompt_attention_mask,
-                        negative_prompt_attention_mask,
-                    )
-                if self.transformer.config.pooled_projection_dim is not None:
-                    guider_inputs["pooled_projections"] = (
-                        pooled_prompt_embeds,
-                        negative_pooled_prompt_embeds,
-                    )
-                # The same image embeddings are used for the conditional and the unconditional pass
-                if image_embeds is not None:
-                    guider_inputs["image_embeds"] = (image_embeds, image_embeds)
-                # Step 2: Update guider's internal state for this denoising step
-                self.guider.set_state(step=i, num_inference_steps=num_inference_steps, timestep=t)
-                # Sigma injection for guiders that support sigma-based gating
-                # (Kynkäänniemi 2024). Must precede `prepare_inputs` because
-                # `num_conditions` → `_is_cfg_enabled()` reads `_current_sigma`.
-                # Duck-typed so diffusers-native guiders are unaffected; guard
-                # on scheduler too since some schedulers don't expose `sigmas`.
-                if hasattr(self.guider, "_current_sigma") and hasattr(self.scheduler, "sigmas"):
-                    self.guider._current_sigma = float(self.scheduler.sigmas[i])
-                # Step 3: Prepare batched model inputs based on the guidance method
-                # The guider splits model inputs into separate batches for conditional/unconditional predictions.
-                # For CFG with guider_inputs = {"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds)}:
-                # you will get a guider_state with two batches:
-                #   guider_state = [
-                #       {"encoder_hidden_states": prompt_embeds, "__guidance_identifier__": "pred_cond"},      # conditional batch
-                #       {"encoder_hidden_states": negative_prompt_embeds, "__guidance_identifier__": "pred_uncond"},  # unconditional batch
-                #   ]
-                # Other guidance methods may return 1 batch (no guidance) or 3+ batches (e.g., PAG, APG).
-                guider_state = self.guider.prepare_inputs(guider_inputs)
-                # Step 4: Run the denoiser for each batch
-                # Each batch in guider_state represents a different conditioning (conditional, unconditional, etc.).
-                # We run the model once per batch and store the noise prediction in guider_state_batch.noise_pred.
-                for guider_state_batch in guider_state:
-                    self.guider.prepare_models(self.transformer)
-                    # Extract conditioning kwargs for this batch (e.g., encoder_hidden_states)
-                    cond_kwargs = {
-                        input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()
-                    }
-                    # Optional guider attribute; defaults to False when the guider does not define it
-                    tread_disabled = getattr(self.guider, "_current_tread_disabled", False)
-                    # Override TREAD selection ratio per batch if the guider provides one
-                    selection_ratio = getattr(self.guider, "_current_selection_ratio", None)
-                    tread_mixin = getattr(self.transformer, "_inference_tread_mixin", None)
-                    if (
-                        selection_ratio is not None
-                        and tread_mixin is not None
-                        and tread_mixin._tread_route is not None
-                    ):
-                        tread_mixin._tread_route["sel"] = selection_ratio
-                    # e.g. "pred_cond"/"pred_uncond"
-                    context_name = getattr(guider_state_batch, self.guider._identifier_key)
-                    with self.transformer.cache_context(context_name):
-                        # Run denoiser and store noise prediction in this batch
-                        noise_pred = self.transformer(
-                            hidden_states=hidden_states,
-                            timestep=timestep,
-                            attention_kwargs=self.attention_kwargs,
-                            return_dict=False,
-                            tread_disabled=tread_disabled,
-                            **cond_kwargs,
-                        )[0].clone()
-                        guider_state_batch.noise_pred = noise_pred
-                    # Cleanup model (e.g., remove hooks)
-                    self.guider.cleanup_models(self.transformer)
-                # Step 5: Combine predictions using the guidance method
-                # The guider takes all noise predictions from guider_state and combines them according to the guidance algorithm.
-                # Continuing the CFG example, the guider receives:
-                #   guider_state = [
-                #       {"encoder_hidden_states": prompt_embeds, "noise_pred": noise_pred_cond, "__guidance_identifier__": "pred_cond"},      # batch 0
-                #       {"encoder_hidden_states": negative_prompt_embeds, "noise_pred": noise_pred_uncond, "__guidance_identifier__": "pred_uncond"},  # batch 1
-                #   ]
-                # And extracts predictions using the __guidance_identifier__:
-                #   pred_cond = guider_state[0]["noise_pred"]      # extracts noise_pred_cond
-                #   pred_uncond = guider_state[1]["noise_pred"]    # extracts noise_pred_uncond
-                # Then applies CFG formula:
-                #   noise_pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond)
-                # Returns GuiderOutput(pred=noise_pred, pred_cond=pred_cond, pred_uncond=pred_uncond)
-                noise_pred = self.guider(guider_state)[0]
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
-                # call the callback, if provided
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    # Requested tensors are looked up by name among this method's local variables
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    # Handle negative embeds if needed by callback
-                    if "negative_prompt_embeds" in callback_outputs:
-                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds")
-                # advance the progress bar on the last step, or once per scheduler-order group after warmup
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                if XLA_AVAILABLE:
-                    xm.mark_step()
-        self._current_timestep = None
-        # 7. Decode (skipped entirely for output_type == "latent")
-        if output_type == "latent":
-            video = latents
-        else:
-            latents = latents.to(self.vae.dtype)
-            latents = self._denormalize_latents(latents, self.vae.config.latents_mean, self.vae.config.latents_std)
-            # Optionally decode in batch-axis chunks of `vae_batch_size` instead of all latents at once
-            if vae_batch_size is not None and latents.shape[0] > vae_batch_size:
-                video_chunks = []
-                for i in range(0, latents.shape[0], vae_batch_size):
-                    chunk = latents[i : i + vae_batch_size]
-                    video_chunks.append(self.vae.decode(chunk, return_dict=False)[0])
-                video = torch.cat(video_chunks, dim=0)
-                del video_chunks
-            else:
-                video = self.vae.decode(latents, return_dict=False)[0]
-            video = self.video_processor.postprocess_video(video, output_type=output_type)
-        # Offload all models
-        self.maybe_free_model_hooks()
-        if not return_dict:
-            return (video,)
-        # Wrap the frames in the pipeline output dataclass
-        return MotifVideoPipelineOutput(frames=video)

transformer/config.json CHANGED Viewed

@@ -3,7 +3,6 @@
   "_diffusers_version": "0.36.0",
   "_library": "diffusers",
   "attention_head_dim": 128,
-  "base_latent_size": null,
   "image_embed_dim": 1152,
   "in_channels": 33,
   "mlp_ratio": 4.0,
@@ -15,7 +14,6 @@
   "out_channels": 16,
   "patch_size": 2,
   "patch_size_t": 1,
-  "pooled_projection_dim": null,
   "qk_norm": "rms_norm",
   "rope_axes_dim": [
     16,

   "_diffusers_version": "0.36.0",
   "_library": "diffusers",
   "attention_head_dim": 128,
   "image_embed_dim": 1152,
   "in_channels": 33,
   "mlp_ratio": 4.0,
   "out_channels": 16,
   "patch_size": 2,
   "patch_size_t": 1,
   "qk_norm": "rms_norm",
   "rope_axes_dim": [
     16,

transformer/transformer_motif_video.py DELETED Viewed

@@ -1,1350 +0,0 @@
-# Copyright 2026 Motif Technologies. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-from functools import lru_cache
-from typing import Any, Dict, List, Optional, Tuple, Union
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.hooks._helpers import TransformerBlockMetadata, TransformerBlockRegistry
-from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
-from diffusers.models.attention import FeedForward
-from diffusers.models.attention_processor import Attention, AttentionProcessor
-from diffusers.models.cache_utils import CacheMixin
-from diffusers.models.embeddings import (
-    PixArtAlphaTextProjection,
-    TimestepEmbedding,
-    Timesteps,
-)
-from diffusers.models.modeling_outputs import Transformer2DModelOutput
-from diffusers.models.modeling_utils import ModelMixin
-from diffusers.models.normalization import (
-    AdaLayerNormContinuous,
-    AdaLayerNormZero,
-    AdaLayerNormZeroSingle,
-)
-from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
-# Stub functions for TREAD (Token REduction with Approximated Distillation).
-# These stubs ensure TREAD code paths are never activated during inference
-# without requiring the motif_core package.
-def is_tread_start(block_idx, start, end):
-    """Inference stub: always answer False so the TREAD start path is never taken."""
-    # The arguments are accepted only to keep the call signature; none of them is read.
-    return False
-def is_tread_end(block_idx, start, end):
-    """Inference stub: always answer False so the TREAD end path is never taken."""
-    # The arguments are accepted only to keep the call signature; none of them is read.
-    return False
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-NUM_TRAIN_TIMESTEPS = 1000
-def apply_rotary_emb(
-    x: torch.Tensor,
-    freqs_cis: Tuple[torch.Tensor, torch.Tensor],
-    use_real: bool = True,
-    use_real_unbind_dim: int = -1,
-) -> torch.Tensor:
-    """
-    Rotate `x` with rotary positional embeddings (RoPE).
-    The real-valued path accepts the (cos, sin) tables either as 2D tensors [L, Dh], which are
-    broadcast over batch and heads, or as 4D tensors [B, 1, L, Dh] that carry a separate table per
-    batch entry (used when batches hold different token subsets, e.g. with TREAD token dropping).
-    Args:
-        x: Input tensor of shape [B, H, L, Dh].
-        freqs_cis: (cos, sin) tuple when `use_real` is True; a single complex tensor otherwise.
-        use_real: Select the real-valued implementation (True) or the complex one (False).
-        use_real_unbind_dim: How the rotation pairs are laid out in the last dim of `x`:
-            -1 for interleaved pairs, -2 for two contiguous halves.
-    Returns:
-        Tensor of the same shape and dtype as `x` with the rotation applied.
-    Raises:
-        RuntimeError: If the tables are not 2D/4D or their last two dims differ from those of `x`.
-        ValueError: If `use_real_unbind_dim` is neither -1 nor -2.
-    """
-    # Complex path first: multiply in the complex plane and convert back.
-    if not use_real:
-        x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
-        rotated = torch.view_as_real(x_complex * freqs_cis.unsqueeze(2))
-        return rotated.flatten(3).type_as(x)
-    cos, sin = freqs_cis
-    if cos.dim() == 2:  # [L, Dh] -> [1, 1, L, Dh]
-        cos = cos[None, None]
-        sin = sin[None, None]
-    if cos.dim() != 4 or sin.dim() != 4:
-        raise RuntimeError(f"RoPE must be 2D or 4D, got cos={cos.dim()}D, sin={sin.dim()}D")
-    cos = cos.to(x.device)
-    sin = sin.to(x.device)
-    tables_match_input = cos.size(-2) == x.size(-2) and cos.size(-1) == x.size(-1)
-    if not tables_match_input:
-        raise RuntimeError(
-            f"RoPE shape mismatch: rope[-2:]=({cos.size(-2)},{cos.size(-1)}) vs x[-2:]=({x.size(-2)},{x.size(-1)})"
-        )
-    if use_real_unbind_dim == -1:
-        # Interleaved layout: (x0, x1), (x2, x3), ... are the rotation pairs.
-        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
-        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-    elif use_real_unbind_dim == -2:
-        # Split layout: first half is the real part, second half the imaginary part.
-        x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)
-        x_rotated = torch.cat([-x_imag, x_real], dim=-1)
-    else:
-        raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
-    # Do the arithmetic in float32, then return in the caller's dtype.
-    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
-class MotifVideoAttnProcessor2_0:
-    # Attention processor built on `F.scaled_dot_product_attention`.
-    # `__call__` has two paths: a cross-attention path taken when `query_input` is given, and a
-    # joint path that attends over the latent tokens and the encoder tokens in one sequence.
-    def __init__(self):
-        # Fail at construction time if this PyTorch build does not expose SDPA.
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError(
-                "MotifVideoAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0."
-            )
-    # Runs attention for `attn`.
-    # Returns a 2-tuple `(hidden_states, encoder_hidden_states)`, not the single tensor the
-    # annotation suggests. The second element is None on the cross-attention path and whenever
-    # `encoder_hidden_states` was passed as None.
-    # Cross-attention path (`query_input` given): `to_q` and the output projections are skipped;
-    # the caller is expected to project the query beforehand and the result afterwards.
-    # Joint path: if `attn.add_q_proj` is None the encoder tokens are appended to `hidden_states`
-    # and share `to_q/to_k/to_v`; otherwise they get their own `add_*_proj` projections and are
-    # appended after projection. Either way the encoder tokens sit at the END of the sequence.
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        image_rotary_emb: Optional[torch.Tensor] = None,
-        query_input: Optional[torch.Tensor] = None,
-        key_input: Optional[torch.Tensor] = None,
-        value_input: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        # Cross-attention mode: query already projected externally (cross_attn_query_proj + norm),
-        # skip to_q and only apply reshape + norm_q + RoPE. K/V use to_k/to_v as normal.
-        if query_input is not None:
-            # [B, L, heads * head_dim] -> [B, heads, L, head_dim]
-            query = query_input.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-            key = attn.to_k(key_input)
-            value = attn.to_v(value_input)
-            key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-            value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-            if attn.norm_q is not None:
-                query = attn.norm_q(query)
-            if attn.norm_k is not None:
-                key = attn.norm_k(key)
-            # RoPE is applied to the query only on this path; the keys are left unrotated.
-            if image_rotary_emb is not None:
-                query = apply_rotary_emb(query, image_rotary_emb)
-            hidden_states = F.scaled_dot_product_attention(
-                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-            )
-            # [B, heads, L, head_dim] -> [B, L, heads * head_dim]
-            hidden_states = hidden_states.transpose(1, 2).flatten(2, 3)
-            hidden_states = hidden_states.to(query.dtype)
-            return hidden_states, None
-        # No dedicated encoder projections: run encoder tokens through the shared projections.
-        if attn.add_q_proj is None and encoder_hidden_states is not None:
-            hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
-        # 1. QKV projections
-        query = attn.to_q(hidden_states)
-        key = attn.to_k(hidden_states)
-        value = attn.to_v(hidden_states)
-        query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-        key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-        value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-        # 2. QK normalization
-        if attn.norm_q is not None:
-            query = attn.norm_q(query)
-        if attn.norm_k is not None:
-            key = attn.norm_k(key)
-        # 3. Rotational positional embeddings applied to latent stream
-        if image_rotary_emb is not None:
-            if attn.add_q_proj is None and encoder_hidden_states is not None:
-                # Encoder tokens were appended above: rotate only the leading latent tokens and
-                # pass the trailing encoder tokens through untouched.
-                query = torch.cat(
-                    [
-                        apply_rotary_emb(query[:, :, : -encoder_hidden_states.shape[1]], image_rotary_emb),
-                        query[:, :, -encoder_hidden_states.shape[1] :],
-                    ],
-                    dim=2,
-                )
-                key = torch.cat(
-                    [
-                        apply_rotary_emb(key[:, :, : -encoder_hidden_states.shape[1]], image_rotary_emb),
-                        key[:, :, -encoder_hidden_states.shape[1] :],
-                    ],
-                    dim=2,
-                )
-            else:
-                query = apply_rotary_emb(query, image_rotary_emb)
-                key = apply_rotary_emb(key, image_rotary_emb)
-        # 4. Encoder condition QKV projection and normalization
-        if attn.add_q_proj is not None and encoder_hidden_states is not None:
-            encoder_query = attn.add_q_proj(encoder_hidden_states)
-            encoder_key = attn.add_k_proj(encoder_hidden_states)
-            encoder_value = attn.add_v_proj(encoder_hidden_states)
-            encoder_query = encoder_query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-            encoder_key = encoder_key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-            encoder_value = encoder_value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-            if attn.norm_added_q is not None:
-                encoder_query = attn.norm_added_q(encoder_query)
-            if attn.norm_added_k is not None:
-                encoder_key = attn.norm_added_k(encoder_key)
-            # Append along the sequence axis so encoder tokens follow the latent tokens.
-            query = torch.cat([query, encoder_query], dim=2)
-            key = torch.cat([key, encoder_key], dim=2)
-            value = torch.cat([value, encoder_value], dim=2)
-        # 5. Attention
-        hidden_states = F.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-        )
-        hidden_states = hidden_states.transpose(1, 2).flatten(2, 3)
-        hidden_states = hidden_states.to(query.dtype)
-        # 6. Output projection
-        # Only reached with projections when encoder tokens are present; with
-        # `encoder_hidden_states=None` the raw attention output is returned as-is.
-        if encoder_hidden_states is not None:
-            # Split the joint sequence back into latent tokens and trailing encoder tokens.
-            hidden_states, encoder_hidden_states = (
-                hidden_states[:, : -encoder_hidden_states.shape[1]],
-                hidden_states[:, -encoder_hidden_states.shape[1] :],
-            )
-            if getattr(attn, "to_out", None) is not None:
-                hidden_states = attn.to_out[0](hidden_states)
-                hidden_states = attn.to_out[1](hidden_states)
-            if getattr(attn, "to_add_out", None) is not None:
-                encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
-        return hidden_states, encoder_hidden_states
-class MotifVideoPatchEmbed(nn.Module):
-    """Patchify a video tensor into a token sequence with a single strided `nn.Conv3d`."""
-    def __init__(
-        self,
-        patch_size: Union[int, Tuple[int, int, int]] = 16,
-        in_chans: int = 3,
-        embed_dim: int = 768,
-    ) -> None:
-        """
-        Args:
-            patch_size: Kernel size and stride of the convolution; an int is used for all three axes.
-            in_chans: Number of input channels.
-            embed_dim: Number of output channels, i.e. the token width.
-        """
-        super().__init__()
-        if isinstance(patch_size, int):
-            patch_size = (patch_size,) * 3
-        # Kernel == stride, so patches do not overlap.
-        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """Project `hidden_states` and return the patches as a [B, N, C] sequence."""
-        patches = self.proj(hidden_states)
-        # BCFHW -> BNC: merge everything after the channel axis, then move channels last.
-        return patches.flatten(2).transpose(1, 2)
-class MotifVideoAdaNorm(nn.Module):
-    """Map a conditioning embedding to two gate tensors via SiLU followed by a linear layer."""
-    def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
-        super().__init__()
-        # Default width is twice the input so the output splits into two equal gates.
-        if not out_features:
-            out_features = 2 * in_features
-        self.linear = nn.Linear(in_features, out_features)
-        self.nonlinearity = nn.SiLU()
-    def forward(self, temb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Return `(gate_msa, gate_mlp)`, each with a singleton axis inserted at dim 1."""
-        projected = self.linear(self.nonlinearity(temb))
-        msa_part, mlp_part = projected.chunk(2, dim=1)
-        return msa_part.unsqueeze(1), mlp_part.unsqueeze(1)
-class MotifVideoConditionEmbedding(nn.Module):
-    """Embed the diffusion timestep and, optionally, add a projected pooled text embedding."""
-    def __init__(
-        self,
-        embedding_dim: int,
-        pooled_projection_dim: int | None,
-    ):
-        """
-        Args:
-            embedding_dim: Width of the conditioning vector produced by `forward`.
-            pooled_projection_dim: Input width of the pooled projection. `text_embedder` is only
-                created when this is an int.
-        """
-        super().__init__()
-        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-        has_pooled_branch = isinstance(pooled_projection_dim, int)
-        if has_pooled_branch:
-            self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
-    def forward(
-        self,
-        timestep: torch.Tensor,
-        pooled_projection: torch.Tensor | None = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Return `(conditioning, token_replace_emb)`; the second element is always None here."""
-        projected_timestep = self.time_proj(timestep)
-        # Cast the projection to the embedder's parameter dtype before calling it.
-        embedder_dtype = next(self.timestep_embedder.parameters()).dtype
-        conditioning = self.timestep_embedder(projected_timestep.to(embedder_dtype))  # (N, D)
-        if pooled_projection is not None:
-            conditioning = conditioning + self.text_embedder(pooled_projection)
-        return conditioning, None
-# Copied from https://github.com/guyyariv/DyPE/blob/5dd4fab99b479ee487754140d717bfb888a6afa2/flux/transformer_flux.py#L485-L486
-def find_correction_factor(num_rotations, dim, base, max_position_embeddings):
-    """
-    Invert the RoPE dimension formula: return the (fractional) dimension index whose frequency
-    completes `num_rotations` rotations over `max_position_embeddings` positions.
-    The result is always a tensor; its dtype follows `num_rotations` when that is a tensor and is
-    float32 otherwise.
-    """
-    if isinstance(num_rotations, torch.Tensor):
-        dtype = num_rotations.dtype
-    else:
-        dtype = torch.float32
-    max_positions = torch.as_tensor(max_position_embeddings, dtype=dtype)
-    positions_per_rotation = max_positions / (num_rotations * 2 * math.pi)
-    return (dim * torch.log(positions_per_rotation)) / (2 * math.log(base))
-# Copied from https://github.com/guyyariv/DyPE/blob/5dd4fab99b479ee487754140d717bfb888a6afa2/flux/transformer_flux.py#L489-L495
-def find_correction_range(low_ratio, high_ratio, dim, base, ori_max_pe_len):
-    """
-    Find the correction range for NTK-by-parts interpolation.
-    Returns `(low, high)`: the correction factor for `low_ratio` rounded down and the one for
-    `high_ratio` rounded up, clamped to at least 0 and at most `dim - 1` respectively.
-    """
-    lower = find_correction_factor(low_ratio, dim, base, ori_max_pe_len)
-    upper = find_correction_factor(high_ratio, dim, base, ori_max_pe_len)
-    # Round outward first, then clamp values just in case they left the valid index range.
-    lower = torch.clamp(torch.floor(lower), min=0)
-    upper = torch.clamp(torch.ceil(upper), max=dim - 1)
-    return lower, upper
-# Copied from https://github.com/guyyariv/DyPE/blob/5dd4fab99b479ee487754140d717bfb888a6afa2/flux/transformer_flux.py#L498-L504
-def linear_ramp_mask(min_val, max_val, num_dim):
-    """
-    Return a float32 ramp of length `num_dim` that is 0 at index `min_val`, 1 at index `max_val`,
-    linear in between and clamped to [0, 1] outside.
-    When the two bounds coincide, `max_val` is nudged up by 0.001 so the division is defined.
-    """
-    lower_is_tensor = isinstance(min_val, torch.Tensor)
-    if lower_is_tensor and (min_val == max_val).all():
-        max_val = max_val + 0.001
-    elif not lower_is_tensor and min_val == max_val:
-        max_val += 0.001
-    indices = torch.arange(num_dim, dtype=torch.float32)
-    span = max_val - min_val
-    return torch.clamp((indices - min_val) / span, 0, 1)
-# Copied from https://github.com/guyyariv/DyPE/blob/5dd4fab99b479ee487754140d717bfb888a6afa2/flux/transformer_flux.py#L507-L511
-def find_newbase_ntk(dim, base, scale):
-    """
-    Calculate the new base for NTK-aware scaling: `base * scale ** (dim / (dim - 2))`.
-    For `dim <= 2` the exponent is undefined (division by zero at `dim == 2`), so the original
-    `base` is returned unchanged, i.e. no NTK adjustment is applied.
-    """
-    if dim > 2:
-        exponent = dim / (dim - 2)
-        return base * (scale**exponent)
-    # Degenerate head dimension: fall back to the unscaled base.
-    return base
-# Copied from https://github.com/guyyariv/DyPE/blob/5dd4fab99b479ee487754140d717bfb888a6afa2/flux/transformer_flux.py#L514-L652
-def get_1d_rotary_pos_embed(
-    dim: int,
-    pos: Union[np.ndarray, int],
-    theta: float = 10000.0,
-    use_real=False,
-    linear_factor=1.0,
-    ntk_factor=1.0,
-    repeat_interleave_real=True,
-    freqs_dtype=torch.float32,
-    yarn=False,
-    max_pe_len=None,
-    ori_max_pe_len=64,
-    dype=False,
-    current_timestep=1.0,
-):
-    """
-    Precompute the frequency tensor for complex exponentials with RoPE.
-    Supports YARN interpolation for vision transformers.
-    Args:
-        dim (`int`):
-            Dimension of the frequency tensor. Must be even.
-        pos (`np.ndarray`, `torch.Tensor` or `int`):
-            Position indices for the frequency tensor. [S] or scalar. An int `n` means positions
-            `0..n-1`.
-        theta (`float`, *optional*, defaults to 10000.0):
-            Scaling factor for frequency computation.
-        use_real (`bool`, *optional*, defaults to False):
-            If True, return real part and imaginary part separately. Otherwise, return complex numbers.
-        linear_factor (`float`, *optional*, defaults to 1.0):
-            Scaling factor for linear interpolation. Only used on the non-YARN path.
-        ntk_factor (`float`, *optional*, defaults to 1.0):
-            Scaling factor for NTK-Aware RoPE. Only used on the non-YARN path.
-        repeat_interleave_real (`bool`, *optional*, defaults to True):
-            If True and use_real, real and imaginary parts are interleaved with themselves to reach dim.
-            Otherwise, they are concatenated.
-        freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`):
-            Data type of the frequency tensor.
-        yarn (`bool`, *optional*, defaults to False):
-            If True, use YARN interpolation combining NTK, linear, and base methods. Only takes
-            effect when `max_pe_len` is given and exceeds `ori_max_pe_len`.
-        max_pe_len (`int` or `torch.Tensor`, *optional*):
-            Maximum position encoding length (current patches for vision models).
-        ori_max_pe_len (`int`, *optional*, defaults to 64):
-            Original maximum position encoding length (base patches for vision models).
-        dype (`bool`, *optional*, defaults to False):
-            If True, enable DyPE (Dynamic Position Encoding) with timestep-aware scaling.
-        current_timestep (`float`, *optional*, defaults to 1.0):
-            Current timestep for DyPE, normalized to [0, 1] where 1 is pure noise.
-            NOTE(review): the DyPE branch calls `torch.pow(current_timestep, 2.0)`, so a tensor
-            looks required there despite the float default - confirm with callers.
-    Returns:
-        `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
-            If use_real=True, returns tuple of (cos, sin) tensors.
-    """
-    assert dim % 2 == 0
-    if isinstance(pos, int):
-        pos = torch.arange(pos)
-    if isinstance(pos, np.ndarray):
-        pos = torch.from_numpy(pos)
-    device = pos.device
-    if yarn and max_pe_len is not None and max_pe_len > ori_max_pe_len:
-        if not isinstance(max_pe_len, torch.Tensor):
-            max_pe_len = torch.tensor(max_pe_len, dtype=freqs_dtype, device=device)
-        scale = torch.clamp_min(max_pe_len / ori_max_pe_len, 1.0)
-        # Ramp boundaries (in rotations) for the two blends below; rescaled per timestep by DyPE.
-        beta_0 = 1.25
-        beta_1 = 0.75
-        gamma_0 = 16
-        gamma_1 = 2
-        # Three candidate frequency sets: unscaled, linearly interpolated, and NTK-rescaled base.
-        freqs_base = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=device) / dim))
-        freqs_linear = 1.0 / torch.einsum(
-            "..., f -> ... f",
-            scale,
-            (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=device) / dim)),
-        )
-        new_base = find_newbase_ntk(dim, theta, scale)
-        # find_newbase_ntk hands back the plain `theta` (possibly a Python float) when dim <= 2,
-        # so only reshape when we actually hold a non-scalar tensor.
-        if isinstance(new_base, torch.Tensor) and new_base.dim() > 0:
-            new_base = new_base.view(-1, 1)
-        freqs_ntk = 1.0 / torch.pow(new_base, (torch.arange(0, dim, 2, dtype=freqs_dtype, device=device) / dim))
-        if freqs_ntk.dim() > 1:
-            freqs_ntk = freqs_ntk.squeeze()
-        if dype:
-            beta_0 = torch.pow(beta_0, 2.0 * torch.pow(current_timestep, 2.0))
-            beta_1 = torch.pow(beta_1, 2.0 * torch.pow(current_timestep, 2.0))
-        # First blend: linear vs. NTK frequencies, selected per dimension by the beta ramp.
-        low, high = find_correction_range(beta_0, beta_1, dim, theta, ori_max_pe_len)
-        high = torch.clamp(high, max=dim // 2)
-        freqs_mask = 1 - linear_ramp_mask(low, high, dim // 2).to(device).to(freqs_dtype)
-        freqs = freqs_linear * (1 - freqs_mask) + freqs_ntk * freqs_mask
-        if dype:
-            gamma_0 = torch.pow(gamma_0, 2.0 * torch.pow(current_timestep, 2.0))
-            gamma_1 = torch.pow(gamma_1, 2.0 * torch.pow(current_timestep, 2.0))
-        # Second blend: the result above vs. the unscaled frequencies, by the gamma ramp.
-        low, high = find_correction_range(gamma_0, gamma_1, dim, theta, ori_max_pe_len)
-        high = torch.clamp(high, max=dim // 2)
-        freqs_mask = 1 - linear_ramp_mask(low, high, dim // 2).to(device).to(freqs_dtype)
-        freqs = freqs * (1 - freqs_mask) + freqs_base * freqs_mask
-    else:
-        theta_ntk = theta * ntk_factor
-        freqs = 1.0 / (theta_ntk ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=device) / dim)) / linear_factor
-    # Angle table: one row per position, one column per frequency.
-    freqs = torch.outer(pos, freqs)
-    is_npu = freqs.device.type == "npu"
-    if is_npu:
-        freqs = freqs.float()
-    if use_real and repeat_interleave_real:
-        freqs_cos = freqs.cos().repeat_interleave(2, dim=1, output_size=freqs.shape[1] * 2).float()
-        freqs_sin = freqs.sin().repeat_interleave(2, dim=1, output_size=freqs.shape[1] * 2).float()
-        # The YARN magnitude correction is applied on this branch only.
-        if yarn and max_pe_len is not None and max_pe_len > ori_max_pe_len:
-            mscale = torch.where(scale <= 1.0, 1.0, 0.1 * torch.log(scale) + 1.0).to(scale)
-            freqs_cos = freqs_cos * mscale
-            freqs_sin = freqs_sin * mscale
-        return freqs_cos, freqs_sin
-    elif use_real:
-        freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1).float()
-        freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1).float()
-        return freqs_cos, freqs_sin
-    else:
-        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
-        return freqs_cis
-class MotifVideoRotaryPosEmbed(nn.Module):
-    """Build the (cos, sin) rotary tables for the patch grid of a [B, C, F, H, W] video latent."""
-    def __init__(
-        self,
-        patch_size: int,
-        patch_size_t: int,
-        rope_dim: List[int],
-        theta: float = 256.0,
-        base_latent_size: int | None = None,
-    ):
-        """
-        Rotary Positional Embedding (RoPE) for video latents.
-        Args:
-            patch_size (`int`):
-                Spatial patch size (e.g., 2).
-            patch_size_t (`int`):
-                Temporal patch size (e.g., 1).
-            rope_dim (`List[int]`):
-                Dimensions for RoPE across [Time, Height, Width] axes.
-            theta (`float`, *optional*, defaults to 256.0):
-                Base frequency for rotary embeddings.
-            base_latent_size (`int`, *optional*):
-                The maximum spatial dimension (in latent units) seen during training,
-                i.e. `training_resolution / vae_scale_factor_spatial`.
-                For example, for 1280x1280 training images and a VAE spatial downscale
-                (`vae_scale_factor_spatial`) of 8, this would be 160; for a downscale
-                of 16, it would be 80.
-        """
-        super().__init__()
-        self.patch_size = patch_size
-        self.patch_size_t = patch_size_t
-        self.rope_dim = rope_dim
-        self.theta = theta
-        self.base_latent_size = base_latent_size
-    # NOTE: the two helpers below are deliberately NOT wrapped in `lru_cache`. A cache on an
-    # instance method keys on `self` and keeps the module alive for the cache's lifetime, and the
-    # arithmetic is far cheaper than the hashing a cache lookup needs.
-    def _get_base_patch_grid_size(self, base_latent_size: Optional[int], patch_size: int) -> Optional[int]:
-        """Return the training grid size in patches, or None when no base size is configured."""
-        return base_latent_size // patch_size if base_latent_size else None
-    def _get_dynamic_interpolation_scale(self, h: int, w: int, base_grid_size: int) -> float:
-        """Return sqrt(current patch area / training patch area), a single uniform scale."""
-        return math.sqrt(h * w / (base_grid_size**2))
-    def forward(
-        self, hidden_states: torch.Tensor, timestep: Optional[torch.Tensor] = None
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Compute the rotary tables for every patch position of `hidden_states`.
-        Args:
-            hidden_states: Latent of shape [B, C, F, H, W]; only its shape and device are used.
-            timestep: Optional timesteps. In eval mode the first entry, divided by
-                `NUM_TRAIN_TIMESTEPS`, drives the timestep-aware (DyPE) scaling.
-        Returns:
-            `(freqs_cos, freqs_sin)`, each of shape [T * H * W, sum(rope_dim)] where T, H, W are
-            the patch-grid sizes along time, height and width.
-        """
-        if self.training:
-            assert self.base_latent_size is None, (
-                "RoPE interpolation/extrapolation logic should only be enabled for inference. "
-                f"During training, base_latent_size must be None, but got {self.base_latent_size!r}."
-            )
-        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-        rope_sizes = [num_frames // self.patch_size_t, height // self.patch_size, width // self.patch_size]
-        # Note: The following line diverges from original behaviour. We create the grid on the device, whereas
-        # original implementation creates it on CPU and then moves it to device. This results in numerical
-        # differences in layerwise debugging outputs, but visually it is the same.
-        axes_grids = [
-            torch.arange(0, size, device=hidden_states.device, dtype=torch.float32) for size in rope_sizes
-        ]
-        grid = torch.meshgrid(*axes_grids, indexing="ij")  # 3 x [T, H, W]
-        grid = torch.stack(grid, dim=0)  # [3, T, H, W]
-        base_patch_grid_size = self._get_base_patch_grid_size(self.base_latent_size, self.patch_size)
-        if base_patch_grid_size is not None:
-            if base_patch_grid_size <= 0:
-                raise ValueError(f"base_patch_grid_size must be a positive number, got {base_patch_grid_size}.")
-            dynamic_interpolation_scale = self._get_dynamic_interpolation_scale(
-                rope_sizes[1], rope_sizes[2], base_patch_grid_size
-            )
-        # 1.0 corresponds to pure noise; also used whenever no timestep is supplied.
-        normalized_timestep = torch.tensor(1.0)
-        if not self.training and timestep is not None:
-            normalized_timestep = timestep[0] / NUM_TRAIN_TIMESTEPS
-        freqs = []
-        for i in range(3):
-            common_kwargs = {
-                "dim": self.rope_dim[i],
-                "pos": grid[i].reshape(-1),
-                "theta": self.theta,
-                "use_real": True,
-                "freqs_dtype": torch.float64,
-            }
-            # Apply scaling only to spatial dimensions (Height and Width, i=1 and i=2)
-            if i > 0 and base_patch_grid_size is not None and dynamic_interpolation_scale > 1.0:
-                # We project the training base to the current size using the uniform scale factor.
-                # max_pe_len tells the RoPE logic the "new" maximum length it's dealing with.
-                max_pe_len = torch.tensor(
-                    base_patch_grid_size * dynamic_interpolation_scale,
-                    dtype=torch.float64,
-                    device=hidden_states.device,
-                )
-                freq = get_1d_rotary_pos_embed(
-                    **common_kwargs,
-                    yarn=True,  # Enable Yet Another RoPE extensioN (YARN) for extrapolation
-                    max_pe_len=max_pe_len,
-                    ori_max_pe_len=base_patch_grid_size,  # The original training scale
-                    dype=True,  # Enable Dynamic Position Encoding (time-aware)
-                    current_timestep=normalized_timestep,
-                )
-            else:
-                # Time dimension OR within training bounds -> Standard RoPE
-                freq = get_1d_rotary_pos_embed(**common_kwargs)
-            freqs.append(freq)
-        # Concatenate the per-axis tables along the feature dimension.
-        freqs_cos = torch.cat([f[0] for f in freqs], dim=1)  # (T * H * W, sum(rope_dim))
-        freqs_sin = torch.cat([f[1] for f in freqs], dim=1)  # (T * H * W, sum(rope_dim))
-        return freqs_cos, freqs_sin
-class MotifVideoImageProjection(nn.Module):
-    """Project image embeddings to the transformer width: LayerNorm, Linear, GELU, Linear, LayerNorm."""
-    def __init__(self, in_features: int, hidden_size: int):
-        """
-        Args:
-            in_features: Width of the incoming image embeddings.
-            hidden_size: Width of the projected output.
-        """
-        super().__init__()
-        self.norm_in = nn.LayerNorm(in_features)
-        self.linear_1 = nn.Linear(in_features, in_features)
-        self.act_fn = nn.GELU()
-        self.linear_2 = nn.Linear(in_features, hidden_size)
-        self.norm_out = nn.LayerNorm(hidden_size)
-    def forward(self, image_embeds: torch.Tensor) -> torch.Tensor:
-        """Run `image_embeds` through the five stages in order and return the result."""
-        stages = (self.norm_in, self.linear_1, self.act_fn, self.linear_2, self.norm_out)
-        projected = image_embeds
-        for stage in stages:
-            projected = stage(projected)
-        return projected
-class MotifVideoSingleTransformerBlock(nn.Module):
-    # Single-stream block: latent and encoder tokens are concatenated into one sequence, share one
-    # modulated norm, run attention and an MLP branch in parallel, and are fused by a single
-    # output projection before the gated residual. Optionally adds a text cross-attention step.
-    def __init__(
-        self,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        mlp_ratio: float = 4.0,
-        qk_norm: str = "rms_norm",
-        norm_type: str = "layer_norm",
-        enable_text_cross_attention: bool = False,
-    ) -> None:
-        super().__init__()
-        hidden_size = num_attention_heads * attention_head_dim
-        mlp_dim = int(hidden_size * mlp_ratio)
-        # pre_only=True: the attention module gets no output projection of its own; `proj_out`
-        # below projects the attention and MLP branches together instead.
-        self.attn = Attention(
-            query_dim=hidden_size,
-            cross_attention_dim=None,
-            dim_head=attention_head_dim,
-            heads=num_attention_heads,
-            out_dim=hidden_size,
-            bias=True,
-            processor=MotifVideoAttnProcessor2_0(),
-            qk_norm=qk_norm,
-            eps=1e-6,
-            pre_only=True,
-        )
-        self.enable_text_cross_attention = enable_text_cross_attention
-        if enable_text_cross_attention:
-            self.cross_attn_query_proj = nn.Linear(hidden_size, hidden_size)
-            # NOTE(review): `cross_attn_query_norm` is created here but never applied in this
-            # block's `forward`, although the processor comment describes the query as
-            # "cross_attn_query_proj + norm" - confirm whether the omission is intentional.
-            self.cross_attn_query_norm = nn.LayerNorm(hidden_size, eps=1e-6)
-            self.cross_attn_out_proj = nn.Linear(hidden_size, hidden_size)
-            # Zero-initialised output projection: the cross-attention term contributes nothing
-            # until these weights are trained or loaded.
-            nn.init.zeros_(self.cross_attn_out_proj.weight)
-            nn.init.zeros_(self.cross_attn_out_proj.bias)
-        self.norm = AdaLayerNormZeroSingle(hidden_size, norm_type=norm_type)
-        self.proj_mlp = nn.Linear(hidden_size, mlp_dim)
-        self.act_mlp = nn.GELU(approximate="tanh")
-        self.proj_out = nn.Linear(hidden_size + mlp_dim, hidden_size)
-    # Returns the 2-tuple `(hidden_states, encoder_hidden_states)` (the `-> torch.Tensor`
-    # annotation is narrower than what is actually returned).
-    # `token_replace_emb` and `first_frame_num_tokens` are accepted but not read in this method.
-    # `image_embed_seq_len` is the number of leading encoder tokens that are skipped when picking
-    # the keys/values for text cross-attention; `encoder_attention_mask` is sliced the same way.
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        temb: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        token_replace_emb: torch.Tensor | None = None,
-        first_frame_num_tokens: int | None = None,
-        image_embed_seq_len: int = 0,
-        encoder_attention_mask: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        text_seq_length = encoder_hidden_states.shape[1]
-        # Joint sequence: latent tokens first, encoder tokens last.
-        hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
-        residual = hidden_states
-        # 1. Input normalization
-        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
-        # The MLP branch runs on the full joint sequence, in parallel with attention.
-        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
-        norm_hidden_states, norm_encoder_hidden_states = (
-            norm_hidden_states[:, :-text_seq_length, :],
-            norm_hidden_states[:, -text_seq_length:, :],
-        )
-        # 2. Attention
-        attn_output, context_attn_output = self.attn(
-            hidden_states=norm_hidden_states,
-            encoder_hidden_states=norm_encoder_hidden_states,
-            attention_mask=attention_mask,
-            image_rotary_emb=image_rotary_emb,
-        )
-        # Text cross-attention: Q=proj(attn_output), K/V=normed text, reuse self.attn weights
-        if self.enable_text_cross_attention:
-            # Drop the leading `image_embed_seq_len` encoder tokens; the rest serve as K/V.
-            txt_kv = norm_encoder_hidden_states[:, image_embed_seq_len:, :]
-            text_mask = None
-            if encoder_attention_mask is not None:
-                text_mask = encoder_attention_mask[:, image_embed_seq_len:]
-                text_mask = text_mask.unsqueeze(1).unsqueeze(1).to(torch.bool)  # [B, 1, 1, L_txt]
-            cross_q = self.cross_attn_query_proj(attn_output)
-            # Passing `query_input` selects the processor's cross-attention path.
-            cross_output, _ = self.attn(
-                hidden_states=cross_q,
-                query_input=cross_q,
-                key_input=txt_kv,
-                value_input=txt_kv,
-                attention_mask=text_mask,
-                image_rotary_emb=image_rotary_emb,
-            )
-            attn_output = attn_output + self.cross_attn_out_proj(cross_output)
-        # Re-join latent and encoder attention outputs so they line up with `mlp_hidden_states`.
-        attn_output = torch.cat([attn_output, context_attn_output], dim=1)
-        # 3. Modulation and residual connection
-        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
-        hidden_states = gate.unsqueeze(1) * self.proj_out(hidden_states)
-        hidden_states = hidden_states + residual
-        # Split the joint sequence back into its latent and encoder parts.
-        hidden_states, encoder_hidden_states = (
-            hidden_states[:, :-text_seq_length, :],
-            hidden_states[:, -text_seq_length:, :],
-        )
-        return hidden_states, encoder_hidden_states
-class MotifVideoTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        mlp_ratio: float,
-        qk_norm: str = "rms_norm",
-        norm_type: str = "layer_norm",
-        enable_text_cross_attention: bool = False,
-    ) -> None:
-        super().__init__()
-        hidden_size = num_attention_heads * attention_head_dim
-        self.norm1 = AdaLayerNormZero(hidden_size, norm_type=norm_type)
-        self.norm1_context = AdaLayerNormZero(hidden_size, norm_type=norm_type)
-        self.attn = Attention(
-            query_dim=hidden_size,
-            cross_attention_dim=None,
-            added_kv_proj_dim=hidden_size,
-            dim_head=attention_head_dim,
-            heads=num_attention_heads,
-            out_dim=hidden_size,
-            context_pre_only=False,
-            bias=True,
-            processor=MotifVideoAttnProcessor2_0(),
-            qk_norm=qk_norm,
-            eps=1e-6,
-        )
-        self.enable_text_cross_attention = enable_text_cross_attention
-        if enable_text_cross_attention:
-            self.cross_attn_query_proj = nn.Linear(hidden_size, hidden_size)
-            self.cross_attn_query_norm = nn.LayerNorm(hidden_size, eps=1e-6)
-            self.cross_attn_out_proj = nn.Linear(hidden_size, hidden_size)
-            nn.init.zeros_(self.cross_attn_out_proj.weight)
-            nn.init.zeros_(self.cross_attn_out_proj.bias)
-        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.norm2_context = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
-        self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        temb: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        token_replace_emb: torch.Tensor | None = None,
-        first_frame_num_tokens: int | None = None,
-        image_embed_seq_len: int = 0,
-        encoder_attention_mask: torch.Tensor | None = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # 1. Input normalization
-        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
-        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
-            encoder_hidden_states, emb=temb
-        )
-        # 2. Joint attention
-        attn_output, context_attn_output = self.attn(
-            hidden_states=norm_hidden_states,
-            encoder_hidden_states=norm_encoder_hidden_states,
-            attention_mask=attention_mask,
-            image_rotary_emb=image_rotary_emb,
-        )
-        # 3. Modulation and residual connection
-        hidden_states = hidden_states + attn_output * gate_msa.unsqueeze(1)
-        # Text cross-attention: Q=proj(attn_output), K/V=normed text, reuse self.attn weights
-        if self.enable_text_cross_attention:
-            txt_kv = norm_encoder_hidden_states[:, image_embed_seq_len:, :]
-            text_mask = None
-            if encoder_attention_mask is not None:
-                text_mask = encoder_attention_mask[:, image_embed_seq_len:]
-                text_mask = text_mask.unsqueeze(1).unsqueeze(1).to(torch.bool)  # [B, 1, 1, L_txt]
-            cross_q = self.cross_attn_query_proj(attn_output)
-            cross_output, _ = self.attn(
-                hidden_states=cross_q,
-                query_input=cross_q,
-                key_input=txt_kv,
-                value_input=txt_kv,
-                attention_mask=text_mask,
-                image_rotary_emb=image_rotary_emb,
-            )
-            hidden_states = hidden_states + self.cross_attn_out_proj(cross_output)
-        encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa.unsqueeze(1)
-        norm_hidden_states = self.norm2(hidden_states)
-        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
-        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
-        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
-        # 4. Feed-forward
-        ff_output = self.ff(norm_hidden_states)
-        context_ff_output = self.ff_context(norm_encoder_hidden_states)
-        hidden_states = hidden_states + gate_mlp.unsqueeze(1) * ff_output
-        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
-        return hidden_states, encoder_hidden_states
-TransformerBlockRegistry.register(
-    model_class=MotifVideoTransformerBlock,
-    metadata=TransformerBlockMetadata(
-        return_hidden_states_index=0,
-        return_encoder_hidden_states_index=1,
-    ),
-)
-TransformerBlockRegistry.register(
-    model_class=MotifVideoSingleTransformerBlock,
-    metadata=TransformerBlockMetadata(
-        return_hidden_states_index=0,
-        return_encoder_hidden_states_index=1,
-    ),
-)
-class MotifVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
-    r"""
-    A Transformer model for video-like data used in [MotifVideo](https://huggingface.co/motif/motifvideo).
-    Args:
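-        in_channels (`int`, defaults to `33`):
-            The number of channels in the input.
-        out_channels (`int`, defaults to `16`):
-            The number of channels in the output. A falsy value falls back to `in_channels`.
-        num_attention_heads (`int`, defaults to `24`):
-            The number of heads to use for multi-head attention.
-        attention_head_dim (`int`, defaults to `128`):
-            The number of channels in each head.
-        num_layers (`int`, defaults to `20`):
-            The number of layers of dual-stream blocks to use.
-        num_single_layers (`int`, defaults to `40`):
-            The number of layers of single-stream blocks to use.
-        num_decoder_layers (`int`, defaults to `0`):
-            How many of the trailing single-stream blocks are run as the decoder stage. These blocks are always
-            built without text cross-attention.
-        mlp_ratio (`float`, defaults to `4.0`):
-            The ratio of the hidden layer size to the input size in the feedforward network.
-        patch_size (`int`, defaults to `2`):
-            The size of the spatial patches to use in the patch embedding layer.
-        patch_size_t (`int`, defaults to `1`):
-            The size of the temporal patches to use in the patch embedding layer.
-        qk_norm (`str`, defaults to `rms_norm`):
-            The normalization to use for the query and key projections in the attention layers.
-        norm_type (`str`, defaults to `layer_norm`):
-            The normalization type passed to the adaptive norm layers of the blocks and the output norm.
-        text_embed_dim (`int`, defaults to `4096`):
-            Input dimension of text embeddings from the text encoder.
-        image_embed_dim (`int`, *optional*):
-            Input dimension of image embeddings. An image projection layer is created only when this is set.
-        pooled_projection_dim (`int`, *optional*):
-            Dimension passed to the timestep/pooled-projection condition embedding.
-        rope_theta (`float`, defaults to `256.0`):
-            The value of theta to use in the RoPE layer.
-        rope_axes_dim (`Tuple[int]`, defaults to `(16, 56, 56)`):
-            The dimensions of the axes to use in the RoPE layer.
-        base_latent_size (`int`, *optional*):
-            The maximum spatial dimension (in latent units) seen during training.
-            For example, if trained on 1280x1280 with a VAE downscale of 16, this is 80.
-        enable_text_cross_attention_dual (`bool`, defaults to `False`):
-            Whether the dual-stream blocks add a text cross-attention branch.
-        enable_text_cross_attention_single (`bool`, defaults to `False`):
-            Whether the non-decoder single-stream blocks add a text cross-attention branch.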
-    """
-    _supports_gradient_checkpointing = True
-    _skip_layerwise_casting_patterns = ["x_embedder", "context_embedder", "norm"]
-    _no_split_modules = [
-        "MotifVideoTransformerBlock",
-        "MotifVideoSingleTransformerBlock",
-        "MotifVideoPatchEmbed",
-    ]
-    @register_to_config
-    def __init__(
-        self,
-        in_channels: int = 33,
-        out_channels: int = 16,
-        num_attention_heads: int = 24,
-        attention_head_dim: int = 128,
-        num_layers: int = 20,
-        num_single_layers: int = 40,
-        num_decoder_layers: int = 0,
-        mlp_ratio: float = 4.0,
-        patch_size: int = 2,
-        patch_size_t: int = 1,
-        qk_norm: str = "rms_norm",
-        norm_type: str = "layer_norm",
-        text_embed_dim: int = 4096,
-        image_embed_dim: int | None = None,
-        pooled_projection_dim: int | None = None,
-        rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int, ...] = (16, 56, 56),
-        base_latent_size: int | None = None,
-        enable_text_cross_attention_dual: bool = False,
-        enable_text_cross_attention_single: bool = False,
-    ) -> None:
-        super().__init__()
-        inner_dim = num_attention_heads * attention_head_dim
-        out_channels = out_channels or in_channels
-        # 1. Latent and condition embedders
-        self.x_embedder = MotifVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
-        self.context_embedder = PixArtAlphaTextProjection(in_features=text_embed_dim, hidden_size=inner_dim)
-        # First frame conditioning: Image conditioning embedders
-        self.image_embed_dim = image_embed_dim
-        if image_embed_dim is not None:
-            # Project image embeddings from vision encoder to transformer dim
-            self.image_embedder = MotifVideoImageProjection(in_features=image_embed_dim, hidden_size=inner_dim)
-        self.time_text_embed = MotifVideoConditionEmbedding(inner_dim, pooled_projection_dim)
-        # 2. RoPE
-        self.rope = MotifVideoRotaryPosEmbed(
-            patch_size, patch_size_t, rope_axes_dim, rope_theta, base_latent_size=base_latent_size
-        )
-        # Cross-attention config
-        self.enable_text_cross_attention_dual = enable_text_cross_attention_dual
-        self.enable_text_cross_attention_single = enable_text_cross_attention_single
-        # 3. Dual stream transformer blocks
-        self.transformer_blocks = nn.ModuleList(
-            [
-                MotifVideoTransformerBlock(
-                    num_attention_heads,
-                    attention_head_dim,
-                    mlp_ratio=mlp_ratio,
-                    qk_norm=qk_norm,
-                    norm_type=norm_type,
-                    enable_text_cross_attention=enable_text_cross_attention_dual,
-                )
-                for _ in range(num_layers)
-            ]
-        )
-        # 4. Single stream transformer blocks
-        # Encoder blocks get cross-attention; decoder blocks do not (no text stream in decoder)
-        num_encoder_single = num_single_layers - num_decoder_layers
-        self.single_transformer_blocks = nn.ModuleList(
-            [
-                MotifVideoSingleTransformerBlock(
-                    num_attention_heads,
-                    attention_head_dim,
-                    mlp_ratio=mlp_ratio,
-                    qk_norm=qk_norm,
-                    norm_type=norm_type,
-                    enable_text_cross_attention=enable_text_cross_attention_single
-                    if i < num_encoder_single
-                    else False,
-                )
-                for i in range(num_single_layers)
-            ]
-        )
-        # 5. Output projection
-        self.norm_out = AdaLayerNormContinuous(
-            inner_dim, inner_dim, elementwise_affine=False, eps=1e-6, norm_type=norm_type
-        )
-        self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
-        # Verify cross-attention config matches actual block state.
-        # Catches silent misconfiguration (e.g. checkpoint config with renamed keys).
-        for i, block in enumerate(self.transformer_blocks):
-            if block.enable_text_cross_attention != enable_text_cross_attention_dual:
-                raise ValueError(
-                    f"transformer_blocks[{i}].enable_text_cross_attention="
-                    f"{block.enable_text_cross_attention}, expected {enable_text_cross_attention_dual}. "
-                    f"Check checkpoint config.json key names match __init__ parameters."
-                )
-        num_encoder_single = num_single_layers - num_decoder_layers
-        for i, block in enumerate(self.single_transformer_blocks):
-            expected = enable_text_cross_attention_single if i < num_encoder_single else False
-            if block.enable_text_cross_attention != expected:
-                raise ValueError(
-                    f"single_transformer_blocks[{i}].enable_text_cross_attention="
-                    f"{block.enable_text_cross_attention}, expected {expected}. "
-                    f"Check checkpoint config.json key names match __init__ parameters."
-                )
-        self.gradient_checkpointing = False
-        self.num_decoder_layers = num_decoder_layers
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-            return processors
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-        return processors
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-        """
-        count = len(self.attn_processors.keys())
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-    def _maybe_gradient_checkpoint_block(self, block, *args):
-        if torch.is_grad_enabled() and self.gradient_checkpointing:
-            return self._gradient_checkpointing_func(block, *args)
-        return block(*args)
-    def _get_unwrapped_blocks(self, blocks):
-        if hasattr(blocks, "_checkpoint_wrapped_module"):
-            return blocks._checkpoint_wrapped_module
-        elif hasattr(blocks, "module"):
-            return blocks.module
-        return blocks
-    def _create_attention_mask(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_attention_mask: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Create attention mask of shape [B, 1, 1, N] where N = L + E,
-        based on latent tokens (always valid) and the encoder mask.
-        Args:
-            hidden_states: [B, L, D]
-            encoder_attention_mask: [B, E] (required)
-        Returns:
-            attention_mask: [B, 1, 1, N]
-        """
-        attention_mask = F.pad(
-            encoder_attention_mask.to(torch.bool),
-            (hidden_states.shape[1], 0),
-            value=True,
-        )
-        attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, L+E]
-        return attention_mask
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        timestep: torch.LongTensor,
-        encoder_hidden_states: torch.Tensor,
-        encoder_attention_mask: torch.Tensor | None = None,
-        pooled_projections: torch.Tensor | None = None,
-        image_embeds: torch.Tensor | None = None,
-        attention_kwargs: Optional[Dict[str, Any]] = None,
-        return_dict: bool = True,
-        tread_mixin: Optional[Any] = None,
-        tread_disabled: bool = False,
-    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Forward pass of the MotifVideoTransformer3DModel.
-        Args:
-            hidden_states: Input latent tensor [B, C, F, H, W].
-            timestep: Diffusion timesteps [B].
-            encoder_hidden_states: Text conditioning [B, E, D].
-            encoder_attention_mask: Mask for text conditioning [B, E].
-            pooled_projections: Pooled text embeddings [B, D].
-            image_embeds: Optional image embeddings from vision encoder [B, N, D].
-            attention_kwargs: Additional arguments for attention processors.
-            return_dict: Whether to return a Transformer2DModelOutput.
-            tread_mixin: Optional TreadMixin instance for token reduction.
-            tread_disabled: When True, force tread_mixin to None (dense pass).
-                torch.compile specializes on this bool, producing separate graphs
-                for dense vs routed without attribute toggling.
-        Returns:
-            Transformer2DModelOutput or tuple containing the predicted samples.
-        """
-        if tread_disabled:
-            tread_mixin = None
-        elif tread_mixin is None:
-            tread_mixin = getattr(self, "_inference_tread_mixin", None)
-        if attention_kwargs is not None:
-            attention_kwargs = attention_kwargs.copy()
-            lora_scale = attention_kwargs.pop("scale", 1.0)
-        else:
-            lora_scale = 1.0
-        if USE_PEFT_BACKEND:
-            # weight the lora layers by setting `lora_scale` for each PEFT layer
-            scale_lora_layers(self, lora_scale)
-        else:
-            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
-                logger.warning(
-                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
-                )
-        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-        p, p_t = self.config.patch_size, self.config.patch_size_t
-        post_patch_num_frames = num_frames // p_t
-        post_patch_height = height // p
-        post_patch_width = width // p
-        first_frame_num_tokens = 1 * post_patch_height * post_patch_width
-        # 1. RoPE
-        image_rotary_emb = self.rope(hidden_states, timestep=timestep)
-        # 2. Conditional embeddings
-        temb, token_replace_emb = self.time_text_embed(timestep, pooled_projections)
-        hidden_states = self.x_embedder(hidden_states)
-        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
-        # First frame conditioning: Image embeddings from vision encoder
-        if image_embeds is not None:
-            # image_embeds: [B, N, D_img] -> [B, N, D]
-            image_embeds = self.image_embedder(image_embeds)
-            encoder_hidden_states = torch.cat([image_embeds, encoder_hidden_states], dim=1)
-            # Extend attention mask for image tokens
-            if encoder_attention_mask is not None:
-                image_mask = torch.ones(
-                    image_embeds.shape[0],
-                    image_embeds.shape[1],
-                    device=encoder_attention_mask.device,
-                    dtype=encoder_attention_mask.dtype,
-                )
-                encoder_attention_mask = torch.cat([image_mask, encoder_attention_mask], dim=1)
-        # image_embed_seq_len: used by cross-attention blocks to slice text from encoder_hidden_states
-        image_embed_seq_len = image_embeds.shape[1] if image_embeds is not None else 0
-        decoder_hidden_states = hidden_states.clone()
-        if encoder_attention_mask is not None:
-            attention_mask = self._create_attention_mask(
-                hidden_states=hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-            )
-        else:
-            attention_mask = None
-        # TREAD state initialization: manage token reduction manually to support activation checkpointing
-        tread_active = False
-        current_route = None
-        ids_keep = None
-        x_full = None
-        orig_mask = attention_mask
-        orig_rope = image_rotary_emb
-        latent_len = hidden_states.shape[1]
-        # 4. Dual stream transformer blocks (Encoder)
-        for i, block in enumerate(self.transformer_blocks):
-            # Drop tokens if (1) TREAD is enabled, (2) current block is within the TREAD route.
-            if is_tread_start(tread_mixin, tread_active, i):
-                tread_active = True
-                current_route = tread_mixin._tread_route
-                # Reduce sequence length at the start of a TREAD route
-                ids_keep = tread_mixin.keep_indices(hidden_states, current_route["sel"]).to(hidden_states.device)
-                x_full = hidden_states.contiguous()
-                hidden_states = tread_mixin.gather_tokens(hidden_states, ids_keep)
-                attention_mask = tread_mixin.adjust_mask(orig_mask, latent_len, ids_keep)
-                image_rotary_emb = tread_mixin.gather_rope(orig_rope, ids_keep)
-            hidden_states, encoder_hidden_states = self._maybe_gradient_checkpoint_block(
-                block,
-                hidden_states,
-                encoder_hidden_states,
-                temb,
-                attention_mask,
-                image_rotary_emb,
-                token_replace_emb,
-                first_frame_num_tokens,
-                image_embed_seq_len,
-                encoder_attention_mask,
-            )
-            if is_tread_end(tread_mixin, tread_active, i):
-                # Restore full sequence length at the end of a TREAD route
-                hidden_states = tread_mixin.scatter_tokens(hidden_states, ids_keep, x_full)
-                tread_active = False
-                current_route = None
-                ids_keep = None
-                x_full = None
-                attention_mask = orig_mask
-                image_rotary_emb = orig_rope
-        # We need to unwrap the blocks because CheckpointWrapper does not support len(),
-        # which is required for slicing the blocks into encoder and decoder parts.
-        single_transformer_blocks = self.single_transformer_blocks
-        # 5. Single stream transformer blocks (Encoder)
-        num_dual = len(self.transformer_blocks)
-        for i, block in enumerate(
-            single_transformer_blocks[: len(single_transformer_blocks) - self.num_decoder_layers]
-        ):
-            # Drop tokens if (1) TREAD is enabled, (2) current block is within the TREAD route.
-            abs_i = num_dual + i
-            if is_tread_start(tread_mixin, tread_active, abs_i):
-                tread_active = True
-                current_route = tread_mixin._tread_route
-                # Reduce sequence length at the start of a TREAD route
-                ids_keep = tread_mixin.keep_indices(hidden_states, current_route["sel"]).to(hidden_states.device)
-                x_full = hidden_states.contiguous()
-                hidden_states = tread_mixin.gather_tokens(hidden_states, ids_keep)
-                attention_mask = tread_mixin.adjust_mask(orig_mask, latent_len, ids_keep)
-                image_rotary_emb = tread_mixin.gather_rope(orig_rope, ids_keep)
-            hidden_states, encoder_hidden_states = self._maybe_gradient_checkpoint_block(
-                block,
-                hidden_states,
-                encoder_hidden_states,
-                temb,
-                attention_mask,
-                image_rotary_emb,
-                token_replace_emb,
-                first_frame_num_tokens,
-                image_embed_seq_len,
-                encoder_attention_mask,
-            )
-            if is_tread_end(tread_mixin, tread_active, abs_i):
-                # Restore full sequence length at the end of a TREAD route
-                hidden_states = tread_mixin.scatter_tokens(hidden_states, ids_keep, x_full)
-                tread_active = False
-                current_route = None
-                ids_keep = None
-                x_full = None
-                attention_mask = orig_mask
-                image_rotary_emb = orig_rope
-        # 6. Single stream transformer blocks (Decoder)
-        if self.num_decoder_layers > 0:
-            encoder_hidden_states = hidden_states
-            attention_mask = None
-            num_single = len(single_transformer_blocks)
-            for i, block in enumerate(single_transformer_blocks[-self.num_decoder_layers :]):
-                abs_i = num_dual + (num_single - self.num_decoder_layers) + i
-                if is_tread_start(tread_mixin, tread_active, abs_i):
-                    tread_active = True
-                    current_route = tread_mixin._tread_route
-                    # Reduce sequence length at the start of a TREAD route
-                    ids_keep = tread_mixin.keep_indices(decoder_hidden_states, current_route["sel"]).to(
-                        decoder_hidden_states.device
-                    )
-                    x_full = encoder_hidden_states.contiguous()
-                    x_t_full = decoder_hidden_states.contiguous()
-                    decoder_hidden_states = tread_mixin.gather_tokens(decoder_hidden_states, ids_keep)
-                    encoder_hidden_states = tread_mixin.gather_tokens(encoder_hidden_states, ids_keep)
-                    attention_mask = tread_mixin.adjust_mask(orig_mask, latent_len, ids_keep)
-                    image_rotary_emb = tread_mixin.gather_rope(orig_rope, ids_keep)
-                decoder_hidden_states, encoder_hidden_states = self._maybe_gradient_checkpoint_block(
-                    block,
-                    decoder_hidden_states,
-                    encoder_hidden_states,
-                    temb,
-                    attention_mask,
-                    image_rotary_emb,
-                    token_replace_emb,
-                    first_frame_num_tokens,
-                )
-                if is_tread_end(tread_mixin, tread_active, abs_i):
-                    # Restore full sequence length at the end of a TREAD route
-                    decoder_hidden_states = tread_mixin.scatter_tokens(decoder_hidden_states, ids_keep, x_t_full)
-                    encoder_hidden_states = tread_mixin.scatter_tokens(encoder_hidden_states, ids_keep, x_full)
-                    tread_active = False
-                    current_route = None
-                    ids_keep = None
-                    x_full = None
-                    x_t_full = None
-                    attention_mask = orig_mask
-                    image_rotary_emb = orig_rope
-            hidden_states = decoder_hidden_states
-        # 7. Output projection
-        hidden_states = self.norm_out(hidden_states, temb)
-        hidden_states = self.proj_out(hidden_states)
-        hidden_states = hidden_states.reshape(
-            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, -1, p_t, p, p
-        )
-        hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7)
-        hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
-        if USE_PEFT_BACKEND:
-            # remove `lora_scale` from each PEFT layer
-            unscale_lora_layers(self, lora_scale)
-        if not return_dict:
-            return (hidden_states,)
-        return Transformer2DModelOutput(
-            sample=hidden_states,
-        )