Sync model code with ace_step==1.6.0 to prevent runtime mutation

Browse files

Files changed (1) hide show

acestep-v15-base/modeling_acestep_v15_base.py +249 -92

acestep-v15-base/modeling_acestep_v15_base.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
 import time
 from typing import Callable, List, Optional, Union
@@ -44,10 +45,10 @@ from vector_quantize_pytorch import ResidualFSQ
 # Local config import with fallback
 try:
     from .configuration_acestep_v15 import AceStepConfig
-    from .apg_guidance import adg_forward, apg_forward, MomentumBuffer
 except ImportError:
     from configuration_acestep_v15 import AceStepConfig
-    from apg_guidance import adg_forward, apg_forward, MomentumBuffer
 logger = logging.get_logger(__name__)
@@ -115,7 +116,7 @@ def create_4d_mask(
         # We want to mask out invalid keys (columns)
         # Expand shape: [Batch, 1, 1, Seq_Len]
         padding_mask_4d = attention_mask.view(attention_mask.shape[0], 1, 1, seq_len).to(torch.bool)
         # Broadcasting: Geometry Mask [1, 1, L, L] & Padding Mask [B, 1, 1, L]
         # Result shape: [B, 1, L, L]
         valid_mask = valid_mask & padding_mask_4d
@@ -125,13 +126,13 @@ def create_4d_mask(
     # ------------------------------------------------------
     # Get the minimal value for current dtype
     min_dtype = torch.finfo(dtype).min
     # Create result tensor filled with -inf by default
     mask_tensor = torch.full(valid_mask.shape, min_dtype, dtype=dtype, device=device)
     # Set valid positions to 0.0
     mask_tensor.masked_fill_(valid_mask, 0.0)
     return mask_tensor
@@ -200,7 +201,7 @@ def sample_t_r(batch_size, device, dtype, data_proportion=0.0, timestep_mu=-0.4,
 class TimestepEmbedding(nn.Module):
     """
     Timestep embedding module for diffusion models.
     Converts timestep values into high-dimensional embeddings using sinusoidal
     positional encoding, followed by MLP layers. Used for conditioning diffusion
     models on timestep information.
@@ -217,7 +218,7 @@ class TimestepEmbedding(nn.Module):
         self.act1 = nn.SiLU()
         self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim, bias=True)
         self.in_channels = in_channels
         self.act2 = nn.SiLU()
         self.time_proj = nn.Linear(time_embed_dim, time_embed_dim * 6)
         self.scale = scale
@@ -305,7 +306,7 @@ class AceStepAttention(nn.Module):
         # Determine if this is cross-attention (requires encoder_hidden_states)
         is_cross_attention = self.is_cross_attention and encoder_hidden_states is not None
         # Cross-attention path: attend to encoder hidden states
         if is_cross_attention:
             encoder_hidden_shape = (*encoder_hidden_states.shape[:-1], -1, self.head_dim)
@@ -313,7 +314,7 @@ class AceStepAttention(nn.Module):
                 is_updated = past_key_value.is_updated.get(self.layer_idx)
                 # After the first generated token, we can reuse all key/value states from cache
                 curr_past_key_value = past_key_value.cross_attention_cache
                 # Conditions for calculating key and value states
                 if not is_updated:
                     # Compute and cache K/V for the first time
@@ -331,7 +332,7 @@ class AceStepAttention(nn.Module):
                 # No cache used, compute K/V directly
                 key_states = self.k_norm(self.k_proj(encoder_hidden_states).view(encoder_hidden_shape)).transpose(1, 2)
                 value_states = self.v_proj(encoder_hidden_states).view(encoder_hidden_shape).transpose(1, 2)
         # Self-attention path: attend to the same sequence
         else:
             # Project and normalize key/value states for self-attention
@@ -353,7 +354,7 @@ class AceStepAttention(nn.Module):
             attention_interface: Callable = eager_attention_forward
         elif self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
         attn_output, attn_weights = attention_interface(
             self,
             query_states,
@@ -443,12 +444,12 @@ class AceStepEncoderLayer(GradientCheckpointingLayer):
 class AceStepDiTLayer(GradientCheckpointingLayer):
     """
     DiT (Diffusion Transformer) layer for AceStep model.
     Implements a transformer layer with three main components:
     1. Self-attention with adaptive layer norm (AdaLN)
     2. Cross-attention (optional) for conditioning on encoder outputs
     3. Feed-forward MLP with adaptive layer norm
     Uses scale-shift modulation from timestep embeddings for adaptive normalization.
     """
     def __init__(self, config: AceStepConfig, layer_idx: int, use_cross_attention: bool = True):
@@ -471,7 +472,7 @@ class AceStepDiTLayer(GradientCheckpointingLayer):
         # Scale-shift table for adaptive layer norm modulation (6 values: 3 for self-attn, 3 for MLP)
         self.scale_shift_table = nn.Parameter(torch.randn(1, 6, config.hidden_size) / config.hidden_size**0.5)
         self.attention_type = config.layer_types[layer_idx]
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -577,14 +578,14 @@ class AceStepPreTrainedModel(PreTrainedModel):
 class AceStepLyricEncoder(AceStepPreTrainedModel):
     """
     Encoder for processing lyric text embeddings.
     Encodes lyric text hidden states using a transformer encoder architecture
     with bidirectional attention. Projects text embeddings to model hidden size
     and processes them through multiple encoder layers.
     """
     def __init__(self, config):
         super().__init__(config)
         # Project text embeddings to model hidden size
         self.embed_tokens = nn.Linear(config.text_hidden_dim, config.hidden_size)
         self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -618,7 +619,7 @@ class AceStepLyricEncoder(AceStepPreTrainedModel):
         assert input_ids is None, "Only `input_ids` is supported for the lyric encoder."
         assert attention_mask is not None, "Attention mask must be provided for the lyric encoder."
         assert inputs_embeds is not None, "Inputs embeddings must be provided for the lyric encoder."
         # Project input embeddings: N x T x text_hidden_dim -> N x T x hidden_size
         inputs_embeds = self.embed_tokens(inputs_embeds)
         # Cache position: only used for mask construction (not for actual caching)
@@ -632,7 +633,7 @@ class AceStepLyricEncoder(AceStepPreTrainedModel):
         seq_len = inputs_embeds.shape[1]
         dtype = inputs_embeds.dtype
         device = inputs_embeds.device
         # 判断是否使用 Flash Attention 2
         is_flash_attn = (self.config._attn_implementation == "flash_attention_2")
@@ -649,7 +650,7 @@ class AceStepLyricEncoder(AceStepPreTrainedModel):
             # 如果没有 padding mask，传 None。
             # 滑动窗口逻辑由 Layer 内部传给 FA kernel 的 sliding_window 参数控制。
             full_attn_mask = attention_mask
             # 这里的逻辑是：如果配置启用了滑动窗口，FA 模式下我们也只需要传基础的 padding mask
             # Layer 会自己决定是否调用带 sliding window 的 kernel
             sliding_attn_mask = attention_mask if self.config.use_sliding_window else None
@@ -659,7 +660,7 @@ class AceStepLyricEncoder(AceStepPreTrainedModel):
             # 场景 B: CPU / Mac / SDPA (Eager 模式)
             # -------------------------------------------------------
             # 必须手动生成 4D Mask [B, 1, L, L]
             # 1. Full Attention (Bidirectional, Global)
             # 对应原来的 create_causal_mask + bidirectional
             full_attn_mask = create_4d_mask(
@@ -734,7 +735,7 @@ class AceStepLyricEncoder(AceStepPreTrainedModel):
 class AttentionPooler(AceStepPreTrainedModel):
     """
     Attention-based pooling module.
     Pools sequences of patches using a special token and attention mechanism.
     The special token attends to all patches and its output is used as the
     pooled representation. Used for aggregating patch-level features into
@@ -782,7 +783,7 @@ class AttentionPooler(AceStepPreTrainedModel):
         seq_len = x.shape[1]
         dtype = x.dtype
         device = x.device
         # 判断是否使用 Flash Attention 2
         is_flash_attn = (self.config._attn_implementation == "flash_attention_2")
@@ -799,7 +800,7 @@ class AttentionPooler(AceStepPreTrainedModel):
             # 如果没有 padding mask，传 None。
             # 滑动窗口逻辑由 Layer 内部传给 FA kernel 的 sliding_window 参数控制。
             full_attn_mask = attention_mask
             # 这里的逻辑是：如果配置启用了滑动窗口，FA 模式下我们也只需要传基础的 padding mask
             # Layer 会自己决定是否调用带 sliding window 的 kernel
             sliding_attn_mask = attention_mask if self.config.use_sliding_window else None
@@ -809,7 +810,7 @@ class AttentionPooler(AceStepPreTrainedModel):
             # 场景 B: CPU / Mac / SDPA (Eager 模式)
             # -------------------------------------------------------
             # 必须手动生成 4D Mask [B, 1, L, L]
             # 1. Full Attention (Bidirectional, Global)
             # 对应原来的 create_causal_mask + bidirectional
             full_attn_mask = create_4d_mask(
@@ -840,7 +841,7 @@ class AttentionPooler(AceStepPreTrainedModel):
             "full_attention": full_attn_mask,
             "sliding_attention": sliding_attn_mask,
         }
         for layer_module in self.layers:
             layer_outputs = layer_module(
                 hidden_states,
@@ -852,7 +853,7 @@ class AttentionPooler(AceStepPreTrainedModel):
             hidden_states = layer_outputs[0]
         hidden_states = self.norm(hidden_states)
         # Extract the special token output (first position) as pooled representation
         cls_output = hidden_states[:, 0, :]
         cls_output = rearrange(cls_output, "(b t) c -> b t c", b=B)
@@ -862,7 +863,7 @@ class AttentionPooler(AceStepPreTrainedModel):
 class AudioTokenDetokenizer(AceStepPreTrainedModel):
     """
     Audio token detokenizer module.
     Converts quantized audio tokens back to continuous acoustic representations.
     Expands each token into multiple patches using special tokens, processes them
     through encoder layers, and projects to acoustic hidden dimension.
@@ -917,7 +918,7 @@ class AudioTokenDetokenizer(AceStepPreTrainedModel):
         seq_len = x.shape[1]
         dtype = x.dtype
         device = x.device
         # 判断是否使用 Flash Attention 2
         is_flash_attn = (self.config._attn_implementation == "flash_attention_2")
@@ -934,7 +935,7 @@ class AudioTokenDetokenizer(AceStepPreTrainedModel):
             # 如果没有 padding mask，传 None。
             # 滑动窗口逻辑由 Layer 内部传给 FA kernel 的 sliding_window 参数控制。
             full_attn_mask = attention_mask
             # 这里的逻辑是：如果配置启用了滑动窗口，FA 模式下我们也只需要传基础的 padding mask
             # Layer 会自己决定是否调用带 sliding window 的 kernel
             sliding_attn_mask = attention_mask if self.config.use_sliding_window else None
@@ -944,7 +945,7 @@ class AudioTokenDetokenizer(AceStepPreTrainedModel):
             # 场景 B: CPU / Mac / SDPA (Eager 模式)
             # -------------------------------------------------------
             # 必须手动生成 4D Mask [B, 1, L, L]
             # 1. Full Attention (Bidirectional, Global)
             # 对应原来的 create_causal_mask + bidirectional
             full_attn_mask = create_4d_mask(
@@ -975,7 +976,7 @@ class AudioTokenDetokenizer(AceStepPreTrainedModel):
             "full_attention": full_attn_mask,
             "sliding_attention": sliding_attn_mask,
         }
         for layer_module in self.layers:
             layer_outputs = layer_module(
                 hidden_states,
@@ -987,7 +988,7 @@ class AudioTokenDetokenizer(AceStepPreTrainedModel):
             hidden_states = layer_outputs[0]
         hidden_states = self.norm(hidden_states)
         hidden_states = self.proj_out(hidden_states)
         hidden_states = rearrange(hidden_states, "(b t) p c -> b (t p) c", b=B, p=self.config.pool_window_size)
@@ -997,7 +998,7 @@ class AudioTokenDetokenizer(AceStepPreTrainedModel):
 class AceStepTimbreEncoder(AceStepPreTrainedModel):
     """
     Encoder for extracting timbre embeddings from reference audio.
     Processes packed reference audio acoustic features to extract timbre
     representations. Uses a special token (CLS-like) to aggregate information
     from the entire reference audio sequence. Outputs are unpacked back to
@@ -1005,7 +1006,7 @@ class AceStepTimbreEncoder(AceStepPreTrainedModel):
     """
     def __init__(self, config):
         super().__init__(config)
         # Project acoustic features to model hidden size
         self.embed_tokens = nn.Linear(config.timbre_hidden_dim, config.hidden_size)
         self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -1036,40 +1037,40 @@ class AceStepTimbreEncoder(AceStepPreTrainedModel):
         N, d = timbre_embs_packed.shape
         device = timbre_embs_packed.device
         dtype = timbre_embs_packed.dtype
         # Get batch size
         B = int(refer_audio_order_mask.max().item() + 1)
         # Calculate element count and positions for each batch
         counts = torch.bincount(refer_audio_order_mask, minlength=B)
         max_count = counts.max().item()
         # Calculate positions within batch
         sorted_indices = torch.argsort(refer_audio_order_mask * N + torch.arange(N, device=device), stable=True)
         sorted_batch_ids = refer_audio_order_mask[sorted_indices]
         positions = torch.arange(N, device=device)
-        batch_starts = torch.cat([torch.tensor([0], device=device),
                                 torch.cumsum(counts, dim=0)[:-1]])
         positions_in_sorted = positions - batch_starts[sorted_batch_ids]
         inverse_indices = torch.empty_like(sorted_indices)
         inverse_indices[sorted_indices] = torch.arange(N, device=device)
         positions_in_batch = positions_in_sorted[inverse_indices]
         # Use one-hot encoding and matrix multiplication (gradient-friendly approach)
         # Create one-hot encoding
         indices_2d = refer_audio_order_mask * max_count + positions_in_batch  # (N,)
         one_hot = F.one_hot(indices_2d, num_classes=B * max_count).to(dtype)  # (N, B*max_count)
         # Rearrange using matrix multiplication
         timbre_embs_flat = one_hot.t() @ timbre_embs_packed  # (B*max_count, d)
         timbre_embs_unpack = timbre_embs_flat.reshape(B, max_count, d)
         # Create mask indicating valid positions
         mask_flat = (one_hot.sum(dim=0) > 0).long()  # (B*max_count,)
         new_mask = mask_flat.reshape(B, max_count)
         return timbre_embs_unpack, new_mask
     @can_return_tuple
@@ -1093,7 +1094,7 @@ class AceStepTimbreEncoder(AceStepPreTrainedModel):
         seq_len = inputs_embeds.shape[1]
         dtype = inputs_embeds.dtype
         device = inputs_embeds.device
         # 判断是否使用 Flash Attention 2
         is_flash_attn = (self.config._attn_implementation == "flash_attention_2")
@@ -1110,7 +1111,7 @@ class AceStepTimbreEncoder(AceStepPreTrainedModel):
             # 如果没有 padding mask，传 None。
             # 滑动窗口逻辑由 Layer 内部传给 FA kernel 的 sliding_window 参数控制。
             full_attn_mask = attention_mask
             # 这里的逻辑是：如果配置启用了滑动窗口，FA 模式下我们也只需要传基础的 padding mask
             # Layer 会自己决定是否调用带 sliding window 的 kernel
             sliding_attn_mask = attention_mask if self.config.use_sliding_window else None
@@ -1120,7 +1121,7 @@ class AceStepTimbreEncoder(AceStepPreTrainedModel):
             # 场景 B: CPU / Mac / SDPA (Eager 模式)
             # -------------------------------------------------------
             # 必须手动生成 4D Mask [B, 1, L, L]
             # 1. Full Attention (Bidirectional, Global)
             # 对应原来的 create_causal_mask + bidirectional
             full_attn_mask = create_4d_mask(
@@ -1151,7 +1152,7 @@ class AceStepTimbreEncoder(AceStepPreTrainedModel):
             "full_attention": full_attn_mask,
             "sliding_attention": sliding_attn_mask,
         }
         # Initialize hidden states
         hidden_states = inputs_embeds
@@ -1181,7 +1182,7 @@ class AceStepTimbreEncoder(AceStepPreTrainedModel):
 class AceStepAudioTokenizer(AceStepPreTrainedModel):
     """
     Audio tokenizer module.
     Converts continuous acoustic features into discrete quantized tokens.
     Process: project -> pool patches -> quantize. Used for converting audio
     representations into discrete tokens for processing by the diffusion model.
@@ -1208,7 +1209,7 @@ class AceStepAudioTokenizer(AceStepPreTrainedModel):
         hidden_states: Optional[torch.FloatTensor] = None,
         **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
     ) -> BaseModelOutput:
         # Project acoustic features to hidden size
         hidden_states = self.audio_acoustic_proj(hidden_states)
         # Pool sequences: N x T//pool_window_size x pool_window_size x d -> N x T//pool_window_size x d
@@ -1225,14 +1226,14 @@ class AceStepAudioTokenizer(AceStepPreTrainedModel):
 class Lambda(nn.Module):
     """
     Wrapper module for arbitrary lambda functions.
     Allows using lambda functions in nn.Sequential by wrapping them in a Module.
     Useful for simple transformations like transpose operations.
     """
     def __init__(self, func):
         super().__init__()
         self.func = func
     def forward(self, x):
         return self.func(x)
@@ -1240,7 +1241,7 @@ class Lambda(nn.Module):
 class AceStepDiTModel(AceStepPreTrainedModel):
     """
     DiT (Diffusion Transformer) model for AceStep.
     Main diffusion model that generates audio latents conditioned on text, lyrics,
     and timbre. Uses patch-based processing with transformer layers, timestep
     conditioning, and cross-attention to encoder outputs.
@@ -1258,7 +1259,7 @@ class AceStepDiTModel(AceStepPreTrainedModel):
         inner_dim = config.hidden_size
         patch_size = config.patch_size
         self.patch_size = patch_size
         # Input projection: patch embedding using 1D convolution
         # Converts sequence into patches for efficient processing
         self.proj_in = nn.Sequential(
@@ -1277,9 +1278,10 @@ class AceStepDiTModel(AceStepPreTrainedModel):
         # Two embeddings: one for timestep t, one for timestep difference (t - r)
         self.time_embed = TimestepEmbedding(in_channels=256, time_embed_dim=inner_dim)
         self.time_embed_r = TimestepEmbedding(in_channels=256, time_embed_dim=inner_dim)
         # Project encoder hidden states to model dimension
-        self.condition_embedder = nn.Linear(inner_dim, inner_dim, bias=True)
         # Output normalization and projection
         # Adaptive layer norm with scale-shift modulation, then de-patchify
@@ -1330,7 +1332,7 @@ class AceStepDiTModel(AceStepPreTrainedModel):
             use_cache = False
         if self.training:
             use_cache = False
         # Initialize cache if needed (only during inference for auto-regressive generation)
         if not self.training and use_cache and past_key_values is None:
             past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
@@ -1357,14 +1359,14 @@ class AceStepDiTModel(AceStepPreTrainedModel):
         # Project input to patches and project encoder states
         hidden_states = self.proj_in(hidden_states)
         encoder_hidden_states = self.condition_embedder(encoder_hidden_states)
         # Cache positions
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
             cache_position = torch.arange(
                 past_seen_tokens, past_seen_tokens + hidden_states.shape[1], device=hidden_states.device
             )
         # Position IDs
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
@@ -1374,7 +1376,7 @@ class AceStepDiTModel(AceStepPreTrainedModel):
         encoder_seq_len = encoder_hidden_states.shape[1]
         dtype = hidden_states.dtype
         device = hidden_states.device
         # 判断是否使用 Flash Attention 2
         is_flash_attn = (self.config._attn_implementation == "flash_attention_2")
@@ -1392,7 +1394,7 @@ class AceStepDiTModel(AceStepPreTrainedModel):
             # 如果没有 padding mask，传 None。
             # 滑动窗口逻辑由 Layer 内部传给 FA kernel 的 sliding_window 参数控制。
             full_attn_mask = attention_mask
             # 这里的逻辑是：如果配置启用了滑动窗口，FA 模式下我们也只需要传基础的 padding mask
             # Layer 会自己决定是否调用带 sliding window 的 kernel
             sliding_attn_mask = attention_mask if self.config.use_sliding_window else None
@@ -1402,7 +1404,7 @@ class AceStepDiTModel(AceStepPreTrainedModel):
             # 场景 B: CPU / Mac / SDPA (Eager 模式)
             # -------------------------------------------------------
             # 必须手动生成 4D Mask [B, 1, L, L]
             # 1. Full Attention (Bidirectional, Global)
             # 对应原来的 create_causal_mask + bidirectional
             full_attn_mask = create_4d_mask(
@@ -1415,7 +1417,7 @@ class AceStepDiTModel(AceStepPreTrainedModel):
                 is_causal=False                    # <--- 关键：双向注意力
             )
             max_len = max(seq_len, encoder_seq_len)
             encoder_attention_mask = create_4d_mask(
                 seq_len=max_len,
                 dtype=dtype,
@@ -1483,7 +1485,7 @@ class AceStepDiTModel(AceStepPreTrainedModel):
                 # Extract the last element which is cross_attn_weights
                 if len(layer_outputs) >= 3:
                     all_cross_attentions += (layer_outputs[2],)
         if return_hidden_states:
             return hidden_states
@@ -1496,10 +1498,10 @@ class AceStepDiTModel(AceStepPreTrainedModel):
         hidden_states = (self.norm_out(hidden_states) * (1 + scale) + shift).type_as(hidden_states)
         # Project output: de-patchify back to original sequence format
         hidden_states = self.proj_out(hidden_states)
         # Crop back to original sequence length to ensure exact length match (remove padding)
         hidden_states = hidden_states[:, :original_seq_len, :]
         outputs = (hidden_states, past_key_values)
         if output_attentions:
@@ -1509,7 +1511,7 @@ class AceStepDiTModel(AceStepPreTrainedModel):
 class AceStepConditionEncoder(AceStepPreTrainedModel):
     """
     Condition encoder for AceStep model.
     Encodes multiple conditioning inputs (text, lyrics, timbre) and packs them
     into a single sequence for cross-attention in the diffusion model. Handles
     projection, encoding, and sequence packing.
@@ -1554,10 +1556,42 @@ class AceStepConditionEncoder(AceStepPreTrainedModel):
         return encoder_hidden_states, encoder_attention_mask
 class AceStepConditionGenerationModel(AceStepPreTrainedModel):
     """
     Main conditional generation model for AceStep.
     End-to-end model for generating audio conditioned on text, lyrics, and timbre.
     Combines encoder (for conditioning), decoder (diffusion model), tokenizer
     (for discrete tokenization), and detokenizer (for reconstruction).
@@ -1568,11 +1602,22 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
         self.config = config
         # Diffusion model components
         self.decoder = AceStepDiTModel(config)  # Main diffusion transformer
-        self.encoder = AceStepConditionEncoder(config)  # Condition encoder
-        self.tokenizer = AceStepAudioTokenizer(config)  # Audio tokenizer
-        self.detokenizer = AudioTokenDetokenizer(config)  # Audio detokenizer
         # Null condition embedding for classifier-free guidance
-        self.null_condition_emb = nn.Parameter(torch.randn(1, 1, config.hidden_size))
         # Initialize weights and apply final processing
         self.post_init()
@@ -1621,7 +1666,7 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
         precomputed_lm_hints_25Hz: Optional[torch.FloatTensor] = None,
         audio_codes: torch.FloatTensor = None,
     ):
         dtype = hidden_states.dtype
         encoder_hidden_states, encoder_attention_mask = self.encoder(
             text_hidden_states=text_hidden_states,
@@ -1709,7 +1754,7 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
         t_ = t.unsqueeze(-1).unsqueeze(-1)
         # Interpolate: x_t = t * x1 + (1 - t) * x0
         xt = t_ * x1 + (1.0 - t_) * x0
         # Predict flow (velocity) from diffusion model
         decoder_outputs = self.decoder(
             hidden_states=xt,
@@ -1726,10 +1771,10 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
         return {
             "diffusion_loss": diffusion_loss,
         }
     def training_losses(self, **kwargs):
         return self.forward(**kwargs)
     def prepare_noise(self, context_latents: torch.FloatTensor, seed: Union[int, List[int], None] = None):
         """
         Prepare noise tensor for generation with optional seeding.
@@ -1770,6 +1815,9 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
         return noise
     def get_x0_from_noise(self, zt, vt, t):
         return zt - vt * t.unsqueeze(-1).unsqueeze(-1)
     def renoise(self, x, t, noise=None):
@@ -1779,7 +1827,7 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
             t = t.unsqueeze(-1).unsqueeze(-1)
         xt = t * noise + (1 - t) * x
         return xt
     def generate_audio(
         self,
         text_hidden_states: torch.FloatTensor,
@@ -1797,7 +1845,7 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
         infer_method: str = "ode",
         use_cache: bool = True,
         infer_steps: int = 30,
-        diffusion_guidance_sale: float = 7.0,
         audio_cover_strength: float = 1.0,
         non_cover_text_hidden_states: Optional[torch.FloatTensor] = None,
         non_cover_text_attention_mask: Optional[torch.FloatTensor] = None,
@@ -1809,8 +1857,27 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
         use_adg: bool = False,
         shift: float = 1.0,
         cover_noise_strength: float = 0.0,
         **kwargs,
     ):
         if attention_mask is None:
             latent_length = src_latents.shape[1]
             attention_mask = torch.ones(src_latents.shape[0], latent_length, device=src_latents.device, dtype=src_latents.dtype)
@@ -1874,7 +1941,7 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
         bsz, device, dtype = context_latents.shape[0], context_latents.device, context_latents.dtype
         past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
         momentum_buffer = MomentumBuffer()
         # Cover noise initialization: blend noise with src_latents
         if cover_noise_strength > 0.0:
             # cover_noise_strength=1 means closest to src, so noise_level should be low
@@ -1900,16 +1967,24 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
             )
         else:
             xt = noise
         # main task condition
-        do_cfg_guidance = diffusion_guidance_sale > 1.0
         if do_cfg_guidance:
             encoder_hidden_states = torch.cat([encoder_hidden_states, self.null_condition_emb.expand_as(encoder_hidden_states)], dim=0)
             encoder_attention_mask = torch.cat([encoder_attention_mask, encoder_attention_mask], dim=0)
             # src_latents
             context_latents = torch.cat([context_latents, context_latents], dim=0)
             attention_mask = torch.cat([attention_mask, attention_mask], dim=0)
         _switched_to_non_cover = False
         with torch.no_grad():
             for step_idx, (t_curr, t_prev) in enumerate(iterator):
@@ -1925,7 +2000,7 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
                     encoder_attention_mask = encoder_attention_mask_non_cover
                     context_latents = context_latents_non_cover
                     past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
                 x = torch.cat([xt, xt], dim=0) if do_cfg_guidance else xt
                 t_curr_tensor = t_curr * torch.ones((x.shape[0],), device=device, dtype=dtype)
                 decoder_outputs = self.decoder(
@@ -1939,7 +2014,7 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
                     use_cache=True,
                     past_key_values=past_key_values,
                 )
                 vt = decoder_outputs[0]
                 past_key_values = decoder_outputs[1]
                 apply_cfg_guidance = t_curr >= cfg_interval_start and t_curr <= cfg_interval_end
@@ -1950,7 +2025,7 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
                             vt = apg_forward(
                                 pred_cond=pred_cond,
                                 pred_uncond=pred_null_cond,
-                                guidance_scale=diffusion_guidance_sale,
                                 momentum_buffer=momentum_buffer,
                                 dims=[1],
                             )
@@ -1960,10 +2035,21 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
                                 noise_pred_cond=pred_cond,
                                 noise_pred_uncond=pred_null_cond,
                                 sigma=t_curr,
-                                guidance_scale=diffusion_guidance_sale,
                             )
                     else:
                         vt = pred_cond
                 # Update x_t based on inference method
                 if infer_method == "sde":
                     # Stochastic Differential Equation: predict clean, then re-add noise
@@ -1971,14 +2057,85 @@ class AceStepConditionGenerationModel(AceStepPreTrainedModel):
                     pred_clean = self.get_x0_from_noise(xt, vt, t_curr_bsz)
                     next_timestep = 1.0 - (float(step_idx + 1) / infer_steps)
                     xt = self.renoise(pred_clean, next_timestep)
                 elif infer_method == "ode":
                     # Ordinary Differential Equation: Euler method
                     # dx/dt = -v, so x_{t+1} = x_t - v_t * dt
                     dt = t_curr - t_prev
                     dt_tensor = dt * torch.ones((bsz,), device=device, dtype=dtype).unsqueeze(-1).unsqueeze(-1)
                     xt = xt - vt * dt_tensor
         x_gen = xt
         end_time = time.time()
         time_costs["diffusion_time_cost"] = end_time - start_time
         time_costs["diffusion_per_step_time_cost"] = time_costs["diffusion_time_cost"] / infer_steps
@@ -2001,13 +2158,13 @@ def test_forward(model, seed=42):
         torch.cuda.manual_seed_all(seed)
         torch.backends.cudnn.deterministic = True
         torch.backends.cudnn.benchmark = False
     # Get model dtype and device
     model_dtype = next(model.parameters()).dtype
     device = next(model.parameters()).device
     print(f"Testing with dtype: {model_dtype}, device: {device}, seed: {seed}")
     # Test data preparation with matching dtype
     text_hidden_states = torch.randn(2, 77, 1024, dtype=model_dtype, device=device)
     text_attention_mask = torch.ones(2, 77, dtype=model_dtype, device=device)
@@ -2141,4 +2298,4 @@ if __name__ == "__main__":
     # model = model.float()
     model = model.to("cuda")
     model = model.bfloat16()
-    test_forward(model)

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import math
 import time
 from typing import Callable, List, Optional, Union
 # Local config import with fallback
 try:
     from .configuration_acestep_v15 import AceStepConfig
+    from .apg_guidance import adg_forward, apg_forward, cfg_forward, MomentumBuffer
 except ImportError:
     from configuration_acestep_v15 import AceStepConfig
+    from apg_guidance import adg_forward, apg_forward, cfg_forward, MomentumBuffer
 logger = logging.get_logger(__name__)
         # We want to mask out invalid keys (columns)
         # Expand shape: [Batch, 1, 1, Seq_Len]
         padding_mask_4d = attention_mask.view(attention_mask.shape[0], 1, 1, seq_len).to(torch.bool)
         # Broadcasting: Geometry Mask [1, 1, L, L] & Padding Mask [B, 1, 1, L]
         # Result shape: [B, 1, L, L]
         valid_mask = valid_mask & padding_mask_4d
     # ------------------------------------------------------
     # Get the minimal value for current dtype
     min_dtype = torch.finfo(dtype).min
     # Create result tensor filled with -inf by default
     mask_tensor = torch.full(valid_mask.shape, min_dtype, dtype=dtype, device=device)
     # Set valid positions to 0.0
     mask_tensor.masked_fill_(valid_mask, 0.0)
     return mask_tensor
 class TimestepEmbedding(nn.Module):
     """
     Timestep embedding module for diffusion models.
     Converts timestep values into high-dimensional embeddings using sinusoidal
     positional encoding, followed by MLP layers. Used for conditioning diffusion
     models on timestep information.
         self.act1 = nn.SiLU()
         self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim, bias=True)
         self.in_channels = in_channels
         self.act2 = nn.SiLU()
         self.time_proj = nn.Linear(time_embed_dim, time_embed_dim * 6)
         self.scale = scale
         # Determine if this is cross-attention (requires encoder_hidden_states)
         is_cross_attention = self.is_cross_attention and encoder_hidden_states is not None
         # Cross-attention path: attend to encoder hidden states
         if is_cross_attention:
             encoder_hidden_shape = (*encoder_hidden_states.shape[:-1], -1, self.head_dim)
                 is_updated = past_key_value.is_updated.get(self.layer_idx)
                 # After the first generated token, we can reuse all key/value states from cache
                 curr_past_key_value = past_key_value.cross_attention_cache
                 # Conditions for calculating key and value states
                 if not is_updated:
                     # Compute and cache K/V for the first time
                 # No cache used, compute K/V directly
                 key_states = self.k_norm(self.k_proj(encoder_hidden_states).view(encoder_hidden_shape)).transpose(1, 2)
                 value_states = self.v_proj(encoder_hidden_states).view(encoder_hidden_shape).transpose(1, 2)
         # Self-attention path: attend to the same sequence
         else:
             # Project and normalize key/value states for self-attention
             attention_interface: Callable = eager_attention_forward
         elif self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
         attn_output, attn_weights = attention_interface(
             self,
             query_states,
 class AceStepDiTLayer(GradientCheckpointingLayer):
     """
     DiT (Diffusion Transformer) layer for AceStep model.
     Implements a transformer layer with three main components:
     1. Self-attention with adaptive layer norm (AdaLN)
     2. Cross-attention (optional) for conditioning on encoder outputs
     3. Feed-forward MLP with adaptive layer norm
     Uses scale-shift modulation from timestep embeddings for adaptive normalization.
     """
     def __init__(self, config: AceStepConfig, layer_idx: int, use_cross_attention: bool = True):
         # Scale-shift table for adaptive layer norm modulation (6 values: 3 for self-attn, 3 for MLP)
         self.scale_shift_table = nn.Parameter(torch.randn(1, 6, config.hidden_size) / config.hidden_size**0.5)
         self.attention_type = config.layer_types[layer_idx]
     def forward(
         self,
         hidden_states: torch.Tensor,
 class AceStepLyricEncoder(AceStepPreTrainedModel):
     """
     Encoder for processing lyric text embeddings.
     Encodes lyric text hidden states using a transformer encoder architecture
     with bidirectional attention. Projects text embeddings to model hidden size
     and processes them through multiple encoder layers.
     """
     def __init__(self, config):
         super().__init__(config)
         # Project text embeddings to model hidden size
         self.embed_tokens = nn.Linear(config.text_hidden_dim, config.hidden_size)
         self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         assert input_ids is None, "Only `input_ids` is supported for the lyric encoder."
         assert attention_mask is not None, "Attention mask must be provided for the lyric encoder."
         assert inputs_embeds is not None, "Inputs embeddings must be provided for the lyric encoder."
         # Project input embeddings: N x T x text_hidden_dim -> N x T x hidden_size
         inputs_embeds = self.embed_tokens(inputs_embeds)
         # Cache position: only used for mask construction (not for actual caching)
         seq_len = inputs_embeds.shape[1]
         dtype = inputs_embeds.dtype
         device = inputs_embeds.device
         # 判断是否使用 Flash Attention 2
         is_flash_attn = (self.config._attn_implementation == "flash_attention_2")
             # 如果没有 padding mask，传 None。
             # 滑动窗口逻辑由 Layer 内部传给 FA kernel 的 sliding_window 参数控制。
             full_attn_mask = attention_mask
             # 这里的逻辑是：如果配置启用了滑动窗口，FA 模式下我们也只需要传基础的 padding mask
             # Layer 会自己决定是否调用带 sliding window 的 kernel
             sliding_attn_mask = attention_mask if self.config.use_sliding_window else None
             # 场景 B: CPU / Mac / SDPA (Eager 模式)
             # -------------------------------------------------------
             # 必须手动生成 4D Mask [B, 1, L, L]
             # 1. Full Attention (Bidirectional, Global)
             # 对应原来的 create_causal_mask + bidirectional
             full_attn_mask = create_4d_mask(
 class AttentionPooler(AceStepPreTrainedModel):
     """
     Attention-based pooling module.
     Pools sequences of patches using a special token and attention mechanism.
     The special token attends to all patches and its output is used as the
     pooled representation. Used for aggregating patch-level features into
         seq_len = x.shape[1]
         dtype = x.dtype
         device = x.device
         # 判断是否使用 Flash Attention 2
         is_flash_attn = (self.config._attn_implementation == "flash_attention_2")
             # 如果没有 padding mask，传 None。
             # 滑动窗口逻辑由 Layer 内部传给 FA kernel 的 sliding_window 参数控制。
             full_attn_mask = attention_mask
             # 这里的逻辑是：如果配置启用了滑动窗口，FA 模式下我们也只需要传基础的 padding mask
             # Layer 会自己决定是否调用带 sliding window 的 kernel
             sliding_attn_mask = attention_mask if self.config.use_sliding_window else None
             # 场景 B: CPU / Mac / SDPA (Eager 模式)
             # -------------------------------------------------------
             # 必须手动生成 4D Mask [B, 1, L, L]
             # 1. Full Attention (Bidirectional, Global)
             # 对应原来的 create_causal_mask + bidirectional
             full_attn_mask = create_4d_mask(
             "full_attention": full_attn_mask,
             "sliding_attention": sliding_attn_mask,
         }
         for layer_module in self.layers:
             layer_outputs = layer_module(
                 hidden_states,
             hidden_states = layer_outputs[0]
         hidden_states = self.norm(hidden_states)
         # Extract the special token output (first position) as pooled representation
         cls_output = hidden_states[:, 0, :]
         cls_output = rearrange(cls_output, "(b t) c -> b t c", b=B)
 class AudioTokenDetokenizer(AceStepPreTrainedModel):
     """
     Audio token detokenizer module.
     Converts quantized audio tokens back to continuous acoustic representations.
     Expands each token into multiple patches using special tokens, processes them
     through encoder layers, and projects to acoustic hidden dimension.
         seq_len = x.shape[1]
         dtype = x.dtype
         device = x.device
         # 判断是否使用 Flash Attention 2
         is_flash_attn = (self.config._attn_implementation == "flash_attention_2")
             # 如果没有 padding mask，传 None。
             # 滑动窗口逻辑由 Layer 内部传给 FA kernel 的 sliding_window 参数控制。
             full_attn_mask = attention_mask
             # 这里的逻辑是：如果配置启用了滑动窗口，FA 模式下我们也只需要传基础的 padding mask
             # Layer 会自己决定是否调用带 sliding window 的 kernel
             sliding_attn_mask = attention_mask if self.config.use_sliding_window else None
             # 场景 B: CPU / Mac / SDPA (Eager 模式)
             # -------------------------------------------------------
             # 必须手动生成 4D Mask [B, 1, L, L]
             # 1. Full Attention (Bidirectional, Global)
             # 对应原来的 create_causal_mask + bidirectional
             full_attn_mask = create_4d_mask(
             "full_attention": full_attn_mask,
             "sliding_attention": sliding_attn_mask,
         }
         for layer_module in self.layers:
             layer_outputs = layer_module(
                 hidden_states,
             hidden_states = layer_outputs[0]
         hidden_states = self.norm(hidden_states)
         hidden_states = self.proj_out(hidden_states)
         hidden_states = rearrange(hidden_states, "(b t) p c -> b (t p) c", b=B, p=self.config.pool_window_size)
 class AceStepTimbreEncoder(AceStepPreTrainedModel):
     """
     Encoder for extracting timbre embeddings from reference audio.
     Processes packed reference audio acoustic features to extract timbre
     representations. Uses a special token (CLS-like) to aggregate information
     from the entire reference audio sequence. Outputs are unpacked back to
     """
     def __init__(self, config):
         super().__init__(config)
         # Project acoustic features to model hidden size
         self.embed_tokens = nn.Linear(config.timbre_hidden_dim, config.hidden_size)
         self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         N, d = timbre_embs_packed.shape
         device = timbre_embs_packed.device
         dtype = timbre_embs_packed.dtype
         # Get batch size
         B = int(refer_audio_order_mask.max().item() + 1)
         # Calculate element count and positions for each batch
         counts = torch.bincount(refer_audio_order_mask, minlength=B)
         max_count = counts.max().item()
         # Calculate positions within batch
         sorted_indices = torch.argsort(refer_audio_order_mask * N + torch.arange(N, device=device), stable=True)
         sorted_batch_ids = refer_audio_order_mask[sorted_indices]
         positions = torch.arange(N, device=device)
+        batch_starts = torch.cat([torch.tensor([0], device=device),
                                 torch.cumsum(counts, dim=0)[:-1]])
         positions_in_sorted = positions - batch_starts[sorted_batch_ids]
         inverse_indices = torch.empty_like(sorted_indices)
         inverse_indices[sorted_indices] = torch.arange(N, device=device)
         positions_in_batch = positions_in_sorted[inverse_indices]
         # Use one-hot encoding and matrix multiplication (gradient-friendly approach)
         # Create one-hot encoding
         indices_2d = refer_audio_order_mask * max_count + positions_in_batch  # (N,)
         one_hot = F.one_hot(indices_2d, num_classes=B * max_count).to(dtype)  # (N, B*max_count)
         # Rearrange using matrix multiplication
         timbre_embs_flat = one_hot.t() @ timbre_embs_packed  # (B*max_count, d)
         timbre_embs_unpack = timbre_embs_flat.reshape(B, max_count, d)
         # Create mask indicating valid positions
         mask_flat = (one_hot.sum(dim=0) > 0).long()  # (B*max_count,)
         new_mask = mask_flat.reshape(B, max_count)
         return timbre_embs_unpack, new_mask
     @can_return_tuple
         seq_len = inputs_embeds.shape[1]
         dtype = inputs_embeds.dtype
         device = inputs_embeds.device
         # 判断是否使用 Flash Attention 2
         is_flash_attn = (self.config._attn_implementation == "flash_attention_2")
             # 如果没有 padding mask，传 None。
             # 滑动窗口逻辑由 Layer 内部传给 FA kernel 的 sliding_window 参数控制。
             full_attn_mask = attention_mask
             # 这里的逻辑是：如果配置启用了滑动窗口，FA 模式下我们也只需要传基础的 padding mask
             # Layer 会自己决定是否调用带 sliding window 的 kernel
             sliding_attn_mask = attention_mask if self.config.use_sliding_window else None
             # 场景 B: CPU / Mac / SDPA (Eager 模式)
             # -------------------------------------------------------
             # 必须手动生成 4D Mask [B, 1, L, L]
             # 1. Full Attention (Bidirectional, Global)
             # 对应原来的 create_causal_mask + bidirectional
             full_attn_mask = create_4d_mask(
             "full_attention": full_attn_mask,
             "sliding_attention": sliding_attn_mask,
         }
         # Initialize hidden states
         hidden_states = inputs_embeds
 class AceStepAudioTokenizer(AceStepPreTrainedModel):
     """
     Audio tokenizer module.
     Converts continuous acoustic features into discrete quantized tokens.
     Process: project -> pool patches -> quantize. Used for converting audio
     representations into discrete tokens for processing by the diffusion model.
         hidden_states: Optional[torch.FloatTensor] = None,
         **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
     ) -> BaseModelOutput:
         # Project acoustic features to hidden size
         hidden_states = self.audio_acoustic_proj(hidden_states)
         # Pool sequences: N x T//pool_window_size x pool_window_size x d -> N x T//pool_window_size x d
 class Lambda(nn.Module):
     """
     Wrapper module for arbitrary lambda functions.
     Allows using lambda functions in nn.Sequential by wrapping them in a Module.
     Useful for simple transformations like transpose operations.
     """
     def __init__(self, func):
         super().__init__()
         self.func = func
     def forward(self, x):
         return self.func(x)
 class AceStepDiTModel(AceStepPreTrainedModel):
     """
     DiT (Diffusion Transformer) model for AceStep.
     Main diffusion model that generates audio latents conditioned on text, lyrics,
     and timbre. Uses patch-based processing with transformer layers, timestep
     conditioning, and cross-attention to encoder outputs.
         inner_dim = config.hidden_size
         patch_size = config.patch_size
         self.patch_size = patch_size
         # Input projection: patch embedding using 1D convolution
         # Converts sequence into patches for efficient processing
         self.proj_in = nn.Sequential(
         # Two embeddings: one for timestep t, one for timestep difference (t - r)
         self.time_embed = TimestepEmbedding(in_channels=256, time_embed_dim=inner_dim)
         self.time_embed_r = TimestepEmbedding(in_channels=256, time_embed_dim=inner_dim)
         # Project encoder hidden states to model dimension
+        condition_dim = getattr(config, "encoder_hidden_size", None) or config.hidden_size
+        self.condition_embedder = nn.Linear(condition_dim, inner_dim, bias=True)
         # Output normalization and projection
         # Adaptive layer norm with scale-shift modulation, then de-patchify
             use_cache = False
         if self.training:
             use_cache = False
         # Initialize cache if needed (only during inference for auto-regressive generation)
         if not self.training and use_cache and past_key_values is None:
             past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
         # Project input to patches and project encoder states
         hidden_states = self.proj_in(hidden_states)
         encoder_hidden_states = self.condition_embedder(encoder_hidden_states)
         # Cache positions
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
             cache_position = torch.arange(
                 past_seen_tokens, past_seen_tokens + hidden_states.shape[1], device=hidden_states.device
             )
         # Position IDs
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
         encoder_seq_len = encoder_hidden_states.shape[1]
         dtype = hidden_states.dtype
         device = hidden_states.device
         # 判断是否使用 Flash Attention 2
         is_flash_attn = (self.config._attn_implementation == "flash_attention_2")
             # 如果没有 padding mask，传 None。
             # 滑动窗口逻辑由 Layer 内部传给 FA kernel 的 sliding_window 参数控制。
             full_attn_mask = attention_mask
             # 这里的逻辑是：如果配置启用了滑动窗口，FA 模式下我们也只需要传基础的 padding mask
             # Layer 会自己决定是否调用带 sliding window 的 kernel
             sliding_attn_mask = attention_mask if self.config.use_sliding_window else None
             # 场景 B: CPU / Mac / SDPA (Eager 模式)
             # -------------------------------------------------------
             # 必须手动生成 4D Mask [B, 1, L, L]
             # 1. Full Attention (Bidirectional, Global)
             # 对应原来的 create_causal_mask + bidirectional
             full_attn_mask = create_4d_mask(
                 is_causal=False                    # <--- 关键：双向注意力
             )
             max_len = max(seq_len, encoder_seq_len)
             encoder_attention_mask = create_4d_mask(
                 seq_len=max_len,
                 dtype=dtype,
                 # Extract the last element which is cross_attn_weights
                 if len(layer_outputs) >= 3:
                     all_cross_attentions += (layer_outputs[2],)
         if return_hidden_states:
             return hidden_states
         hidden_states = (self.norm_out(hidden_states) * (1 + scale) + shift).type_as(hidden_states)
         # Project output: de-patchify back to original sequence format
         hidden_states = self.proj_out(hidden_states)
         # Crop back to original sequence length to ensure exact length match (remove padding)
         hidden_states = hidden_states[:, :original_seq_len, :]
         outputs = (hidden_states, past_key_values)
         if output_attentions:
 class AceStepConditionEncoder(AceStepPreTrainedModel):
     """
     Condition encoder for AceStep model.
     Encodes multiple conditioning inputs (text, lyrics, timbre) and packs them
     into a single sequence for cross-attention in the diffusion model. Handles
     projection, encoding, and sequence packing.
         return encoder_hidden_states, encoder_attention_mask
+def _repaint_step_injection(xt, clean_src, mask, t_next, noise):
+    """Replace non-repaint regions of *xt* with noised source latents."""
+    zt = t_next * noise + (1.0 - t_next) * clean_src
+    m = mask.unsqueeze(-1).expand_as(xt)
+    return torch.where(m, xt, zt)
+def _repaint_boundary_blend(x_gen, clean_src, mask, cf_frames):
+    """Blend generated latents with source at repaint boundaries."""
+    soft = mask.float().clone()
+    if cf_frames <= 0:
+        m = soft.unsqueeze(-1).expand_as(x_gen)
+        return m * x_gen + (1.0 - m) * clean_src
+    B, T = mask.shape
+    for b in range(B):
+        row = mask[b]
+        if row.all() or not row.any():
+            continue
+        idx = torch.nonzero(row, as_tuple=False).squeeze(-1)
+        if idx.numel() == 0:
+            continue
+        left, right = idx[0].item(), idx[-1].item() + 1
+        fs = max(left - cf_frames, 0)
+        if left - fs > 0:
+            soft[b, fs:left] = torch.linspace(0, 1, left - fs + 2, device=soft.device)[1:-1]
+        fe = min(right + cf_frames, T)
+        if fe - right > 0:
+            soft[b, right:fe] = torch.linspace(1, 0, fe - right + 2, device=soft.device)[1:-1]
+    m = soft.unsqueeze(-1).expand_as(x_gen)
+    return m * x_gen + (1.0 - m) * clean_src
 class AceStepConditionGenerationModel(AceStepPreTrainedModel):
     """
     Main conditional generation model for AceStep.
     End-to-end model for generating audio conditioned on text, lyrics, and timbre.
     Combines encoder (for conditioning), decoder (diffusion model), tokenizer
     (for discrete tokenization), and detokenizer (for reconstruction).
         self.config = config
         # Diffusion model components
         self.decoder = AceStepDiTModel(config)  # Main diffusion transformer
+        # Build encoder config: use separate encoder_hidden_size when available
+        # (4B models have encoder_hidden_size=2048 != hidden_size=2560)
+        _enc_hs = getattr(config, "encoder_hidden_size", None) or config.hidden_size
+        if _enc_hs != config.hidden_size:
+            encoder_config = copy.deepcopy(config)
+            encoder_config.hidden_size = _enc_hs
+            encoder_config.intermediate_size = getattr(config, "encoder_intermediate_size", None) or config.intermediate_size
+            encoder_config.num_attention_heads = getattr(config, "encoder_num_attention_heads", None) or config.num_attention_heads
+            encoder_config.num_key_value_heads = getattr(config, "encoder_num_key_value_heads", None) or config.num_key_value_heads
+        else:
+            encoder_config = config
+        self.encoder = AceStepConditionEncoder(encoder_config)  # Condition encoder
+        self.tokenizer = AceStepAudioTokenizer(encoder_config)  # Audio tokenizer
+        self.detokenizer = AudioTokenDetokenizer(encoder_config)  # Audio detokenizer
         # Null condition embedding for classifier-free guidance
+        self.null_condition_emb = nn.Parameter(torch.randn(1, 1, encoder_config.hidden_size))
         # Initialize weights and apply final processing
         self.post_init()
         precomputed_lm_hints_25Hz: Optional[torch.FloatTensor] = None,
         audio_codes: torch.FloatTensor = None,
     ):
         dtype = hidden_states.dtype
         encoder_hidden_states, encoder_attention_mask = self.encoder(
             text_hidden_states=text_hidden_states,
         t_ = t.unsqueeze(-1).unsqueeze(-1)
         # Interpolate: x_t = t * x1 + (1 - t) * x0
         xt = t_ * x1 + (1.0 - t_) * x0
         # Predict flow (velocity) from diffusion model
         decoder_outputs = self.decoder(
             hidden_states=xt,
         return {
             "diffusion_loss": diffusion_loss,
         }
     def training_losses(self, **kwargs):
         return self.forward(**kwargs)
     def prepare_noise(self, context_latents: torch.FloatTensor, seed: Union[int, List[int], None] = None):
         """
         Prepare noise tensor for generation with optional seeding.
         return noise
     def get_x0_from_noise(self, zt, vt, t):
+        if t.shape[0] != zt.shape[0]:
+            raise ValueError(f"Batch size mismatch: t has {t.shape[0]}, zt has {zt.shape[0]}")
         return zt - vt * t.unsqueeze(-1).unsqueeze(-1)
     def renoise(self, x, t, noise=None):
             t = t.unsqueeze(-1).unsqueeze(-1)
         xt = t * noise + (1 - t) * x
         return xt
     def generate_audio(
         self,
         text_hidden_states: torch.FloatTensor,
         infer_method: str = "ode",
         use_cache: bool = True,
         infer_steps: int = 30,
+        diffusion_guidance_scale: float = 7.0,
         audio_cover_strength: float = 1.0,
         non_cover_text_hidden_states: Optional[torch.FloatTensor] = None,
         non_cover_text_attention_mask: Optional[torch.FloatTensor] = None,
         use_adg: bool = False,
         shift: float = 1.0,
         cover_noise_strength: float = 0.0,
+        repaint_mask: Optional[torch.Tensor] = None,
+        clean_src_latents: Optional[torch.FloatTensor] = None,
+        repaint_crossfade_frames: int = 10,
+        repaint_injection_ratio: float = 0.5,
+        sampler_mode: str = "euler",
+        velocity_norm_threshold: float = 0.0,
+        velocity_ema_factor: float = 0.0,
         **kwargs,
     ):
+        # Backward-compat: accept the old misspelled key "diffusion_guidance_sale"
+        # so that callers that have not yet updated their code still work correctly.
+        # Note: if both keys are passed simultaneously, the old key wins because Python
+        # cannot distinguish "explicit new key" from "new key at its default value".
+        # In practice callers should only ever pass one of the two.
+        if "diffusion_guidance_sale" in kwargs:
+            logger.warning(
+                "generate_audio() received deprecated kwarg 'diffusion_guidance_sale'; "
+                "please rename it to 'diffusion_guidance_scale'."
+            )
+            diffusion_guidance_scale = kwargs.pop("diffusion_guidance_sale")
         if attention_mask is None:
             latent_length = src_latents.shape[1]
             attention_mask = torch.ones(src_latents.shape[0], latent_length, device=src_latents.device, dtype=src_latents.dtype)
         bsz, device, dtype = context_latents.shape[0], context_latents.device, context_latents.dtype
         past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
         momentum_buffer = MomentumBuffer()
         # Cover noise initialization: blend noise with src_latents
         if cover_noise_strength > 0.0:
             # cover_noise_strength=1 means closest to src, so noise_level should be low
             )
         else:
             xt = noise
         # main task condition
+        do_cfg_guidance = diffusion_guidance_scale > 1.0
         if do_cfg_guidance:
             encoder_hidden_states = torch.cat([encoder_hidden_states, self.null_condition_emb.expand_as(encoder_hidden_states)], dim=0)
             encoder_attention_mask = torch.cat([encoder_attention_mask, encoder_attention_mask], dim=0)
             # src_latents
             context_latents = torch.cat([context_latents, context_latents], dim=0)
             attention_mask = torch.cat([attention_mask, attention_mask], dim=0)
+        use_heun = sampler_mode == "heun"
+        use_norm_clamp = velocity_norm_threshold > 0.0
+        use_ema = velocity_ema_factor > 0.0
+        prev_vt = None
+        if use_heun and infer_method == "sde":
+            logger.warning("Heun sampler is not compatible with SDE; falling back to Euler.")
+            use_heun = False
         _switched_to_non_cover = False
         with torch.no_grad():
             for step_idx, (t_curr, t_prev) in enumerate(iterator):
                     encoder_attention_mask = encoder_attention_mask_non_cover
                     context_latents = context_latents_non_cover
                     past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
                 x = torch.cat([xt, xt], dim=0) if do_cfg_guidance else xt
                 t_curr_tensor = t_curr * torch.ones((x.shape[0],), device=device, dtype=dtype)
                 decoder_outputs = self.decoder(
                     use_cache=True,
                     past_key_values=past_key_values,
                 )
                 vt = decoder_outputs[0]
                 past_key_values = decoder_outputs[1]
                 apply_cfg_guidance = t_curr >= cfg_interval_start and t_curr <= cfg_interval_end
                             vt = apg_forward(
                                 pred_cond=pred_cond,
                                 pred_uncond=pred_null_cond,
+                                guidance_scale=diffusion_guidance_scale,
                                 momentum_buffer=momentum_buffer,
                                 dims=[1],
                             )
                                 noise_pred_cond=pred_cond,
                                 noise_pred_uncond=pred_null_cond,
                                 sigma=t_curr,
+                                guidance_scale=diffusion_guidance_scale,
                             )
                     else:
                         vt = pred_cond
+                # Velocity norm clamping — prevents outlier predictions
+                if use_norm_clamp:
+                    vt_norm = torch.norm(vt, dim=(1, 2), keepdim=True)
+                    xt_norm = torch.norm(xt, dim=(1, 2), keepdim=True) + 1e-10
+                    scale = torch.clamp(velocity_norm_threshold * xt_norm / (vt_norm + 1e-10), max=1.0)
+                    vt = vt * scale
+                # Velocity EMA smoothing — stabilises denoising trajectory
+                if use_ema and prev_vt is not None:
+                    vt = (1.0 - velocity_ema_factor) * vt + velocity_ema_factor * prev_vt
                 # Update x_t based on inference method
                 if infer_method == "sde":
                     # Stochastic Differential Equation: predict clean, then re-add noise
                     pred_clean = self.get_x0_from_noise(xt, vt, t_curr_bsz)
                     next_timestep = 1.0 - (float(step_idx + 1) / infer_steps)
                     xt = self.renoise(pred_clean, next_timestep)
+                    t_after_step = next_timestep
+                elif use_heun and infer_method == "ode":
+                    # Heun (second-order) ODE step via trapezoidal rule
+                    dt = t_curr - t_prev
+                    dt_tensor = dt * torch.ones((bsz,), device=device, dtype=dtype).unsqueeze(-1).unsqueeze(-1)
+                    # Predictor: Euler step to get xt_predicted at t_prev
+                    xt_predicted = xt - vt * dt_tensor
+                    # Corrector: evaluate model at the predicted point
+                    x2 = torch.cat([xt_predicted, xt_predicted], dim=0) if do_cfg_guidance else xt_predicted
+                    t_prev_tensor = t_prev * torch.ones((x2.shape[0],), device=device, dtype=dtype)
+                    # Reset KV cache for corrector pass (Heun needs fresh evaluation)
+                    corrector_kv = EncoderDecoderCache(DynamicCache(), DynamicCache())
+                    decoder_outputs2 = self.decoder(
+                        hidden_states=x2,
+                        timestep=t_prev_tensor,
+                        timestep_r=t_prev_tensor,
+                        attention_mask=attention_mask,
+                        encoder_hidden_states=encoder_hidden_states,
+                        encoder_attention_mask=encoder_attention_mask,
+                        context_latents=context_latents,
+                        use_cache=False,
+                        past_key_values=corrector_kv,
+                    )
+                    vt2 = decoder_outputs2[0]
+                    if do_cfg_guidance:
+                        pred_cond2, pred_null_cond2 = vt2.chunk(2)
+                        # Recompute CFG interval for corrector timestep (t_prev, not t_curr)
+                        apply_cfg_corrector = t_prev >= cfg_interval_start and t_prev <= cfg_interval_end
+                        if apply_cfg_corrector:
+                            if not use_adg:
+                                # Use basic CFG for corrector to avoid mutating APG momentum twice per step
+                                vt2 = cfg_forward(pred_cond2, pred_null_cond2, diffusion_guidance_scale)
+                            elif t_prev > 0:
+                                # Guard against sigma=0 which causes NaN in ADG division
+                                vt2 = adg_forward(
+                                    latents=xt_predicted,
+                                    noise_pred_cond=pred_cond2,
+                                    noise_pred_uncond=pred_null_cond2,
+                                    sigma=t_prev,
+                                    guidance_scale=diffusion_guidance_scale,
+                                )
+                            else:
+                                vt2 = cfg_forward(pred_cond2, pred_null_cond2, diffusion_guidance_scale)
+                        else:
+                            vt2 = pred_cond2
+                    if use_norm_clamp:
+                        vt2_norm = torch.norm(vt2, dim=(1, 2), keepdim=True)
+                        xt_pred_norm = torch.norm(xt_predicted, dim=(1, 2), keepdim=True) + 1e-10
+                        scale2 = torch.clamp(velocity_norm_threshold * xt_pred_norm / (vt2_norm + 1e-10), max=1.0)
+                        vt2 = vt2 * scale2
+                    if use_ema:
+                        vt2 = (1.0 - velocity_ema_factor) * vt2 + velocity_ema_factor * vt
+                    # Average the two velocity predictions (trapezoidal rule)
+                    vt_avg = 0.5 * (vt + vt2)
+                    xt = xt - vt_avg * dt_tensor
+                    vt = vt_avg
+                    t_after_step = t_prev
                 elif infer_method == "ode":
                     # Ordinary Differential Equation: Euler method
                     # dx/dt = -v, so x_{t+1} = x_t - v_t * dt
                     dt = t_curr - t_prev
                     dt_tensor = dt * torch.ones((bsz,), device=device, dtype=dtype).unsqueeze(-1).unsqueeze(-1)
                     xt = xt - vt * dt_tensor
+                    t_after_step = t_prev
+                prev_vt = vt
+                injection_cutoff = round(repaint_injection_ratio * infer_steps)
+                if repaint_mask is not None and clean_src_latents is not None and step_idx < injection_cutoff:
+                    xt = _repaint_step_injection(
+                        xt, clean_src_latents, repaint_mask, t_after_step, noise,
+                    )
         x_gen = xt
+        if repaint_mask is not None and clean_src_latents is not None and repaint_crossfade_frames > 0:
+            x_gen = _repaint_boundary_blend(
+                x_gen, clean_src_latents, repaint_mask, repaint_crossfade_frames,
+            )
         end_time = time.time()
         time_costs["diffusion_time_cost"] = end_time - start_time
         time_costs["diffusion_per_step_time_cost"] = time_costs["diffusion_time_cost"] / infer_steps
         torch.cuda.manual_seed_all(seed)
         torch.backends.cudnn.deterministic = True
         torch.backends.cudnn.benchmark = False
     # Get model dtype and device
     model_dtype = next(model.parameters()).dtype
     device = next(model.parameters()).device
     print(f"Testing with dtype: {model_dtype}, device: {device}, seed: {seed}")
     # Test data preparation with matching dtype
     text_hidden_states = torch.randn(2, 77, 1024, dtype=model_dtype, device=device)
     text_attention_mask = torch.ones(2, 77, dtype=model_dtype, device=device)
     # model = model.float()
     model = model.to("cuda")
     model = model.bfloat16()
+    test_forward(model)