feat: move prompt_embeds trim to __call__ for correct CFG alignment

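encode_prompt no longer trims prompt_embeds itself; it now also returns
actual_seq_len, the non-padding token count (None unless batch_size == 1
and an attention mask is available). When the guider is enabled, __call__
trims both positive/negative embeds to max(pos_len, neg_len) using real
encoder outputs (no zero-padding); when it is disabled, __call__ trims only
the positive embeds to their own actual length. Enables Flash Attention by
setting attention_mask=None after the trim (with no mask, SDPA can use the
Flash backend).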

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show

pipeline_motif_video.py +20 -5

pipeline_motif_video.py CHANGED Viewed

@@ -539,11 +539,10 @@ class MotifVideoPipeline(DiffusionPipeline):
                 **prompt_embeds_kwargs,
             )
-        # Trim padding for batch=1 to enable Flash Attention (attn_mask=None → SDPA uses Flash backend)
         if batch_size == 1 and prompt_attention_mask is not None:
-            actual_len = prompt_attention_mask.sum(dim=-1).max().item()
-            prompt_embeds = prompt_embeds[:, :actual_len, :]
-            prompt_attention_mask = None
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         seq_len = prompt_embeds.shape[1]
@@ -562,6 +561,7 @@ class MotifVideoPipeline(DiffusionPipeline):
             prompt_embeds,
             pooled_prompt_embeds,
             prompt_attention_mask,
         )
     @property
@@ -1087,7 +1087,7 @@ class MotifVideoPipeline(DiffusionPipeline):
         device = self._execution_device
         # 3. Prepare text embeddings
-        prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = self.encode_prompt(
             prompt=prompt,
             num_videos_per_prompt=num_videos_per_prompt,
             prompt_embeds=prompt_embeds,
@@ -1097,12 +1097,18 @@ class MotifVideoPipeline(DiffusionPipeline):
             device=device,
         )
         if self.guider._enabled:
             negative_prompt = self._prepare_negative_prompt(negative_prompt, batch_size)
             (
                 negative_prompt_embeds,
                 negative_pooled_prompt_embeds,
                 negative_prompt_attention_mask,
             ) = self.encode_prompt(
                 prompt=negative_prompt,
                 num_videos_per_prompt=num_videos_per_prompt,
@@ -1113,6 +1119,15 @@ class MotifVideoPipeline(DiffusionPipeline):
                 device=device,
             )
         num_channels_latents = self.vae.config.z_dim
         latents = self.prepare_latents(
             batch_size * num_videos_per_prompt,

                 **prompt_embeds_kwargs,
             )
+        # Compute actual (non-padding) token count for batch=1 Flash Attention trimming in __call__
+        actual_seq_len = None
         if batch_size == 1 and prompt_attention_mask is not None:
+            actual_seq_len = int(prompt_attention_mask.sum(dim=-1).max().item())
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         seq_len = prompt_embeds.shape[1]
             prompt_embeds,
             pooled_prompt_embeds,
             prompt_attention_mask,
+            actual_seq_len,
         )
     @property
         device = self._execution_device
         # 3. Prepare text embeddings
+        prompt_embeds, pooled_prompt_embeds, prompt_attention_mask, pos_actual_len = self.encode_prompt(
             prompt=prompt,
             num_videos_per_prompt=num_videos_per_prompt,
             prompt_embeds=prompt_embeds,
             device=device,
         )
+        if not self.guider._enabled and pos_actual_len is not None:
+            # No CFG: trim positive only
+            prompt_embeds = prompt_embeds[:, :pos_actual_len, :]
+            prompt_attention_mask = None
         if self.guider._enabled:
             negative_prompt = self._prepare_negative_prompt(negative_prompt, batch_size)
             (
                 negative_prompt_embeds,
                 negative_pooled_prompt_embeds,
                 negative_prompt_attention_mask,
+                neg_actual_len,
             ) = self.encode_prompt(
                 prompt=negative_prompt,
                 num_videos_per_prompt=num_videos_per_prompt,
                 device=device,
             )
+            # Trim prompt_embeds for batch=1 to enable Flash Attention (attn_mask=None → SDPA uses Flash backend).
+            # Use max(pos, neg) actual_len so both have real encoder embeddings at every position (no zero-padding).
+            if pos_actual_len is not None and neg_actual_len is not None:
+                trim_len = max(pos_actual_len, neg_actual_len)
+                prompt_embeds = prompt_embeds[:, :trim_len, :]
+                negative_prompt_embeds = negative_prompt_embeds[:, :trim_len, :]
+                prompt_attention_mask = None
+                negative_prompt_attention_mask = None
         num_channels_latents = self.vae.config.z_dim
         latents = self.prepare_latents(
             batch_size * num_videos_per_prompt,

feat: move prompt_embeds trim to __call__ for correct CFG alignment

feat: move prompt_embeds trim to __call__ for correct CFG alignment