potsawee committed on
Commit
d7a2a0f
·
verified ·
1 Parent(s): deb8802

Upload modeling_text_sync_mimi.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. modeling_text_sync_mimi.py +713 -0
modeling_text_sync_mimi.py ADDED
@@ -0,0 +1,713 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PyTorch TextSyncMimi model - Text-synchronous neural audio codec based on Mimi."""
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from typing import Optional, Dict, List, Union
6
+
7
+ from configuration_mimi import MimiConfig
8
+ from modeling_mimi_clean import MimiPreTrainedModel, MimiModel
9
+ from modeling_backbone_components import (
10
+ CrossAttentionTransformer,
11
+ CausalAttentionTransformer
12
+ )
13
+
14
+
15
class TextSyncMimi(MimiPreTrainedModel):
    """
    TextSyncMimi: Text-Synchronous Neural Audio Codec Model

    A neural audio codec model that combines text and speech representations for
    high-quality text-to-speech synthesis. Features:

    - Learnable text embeddings
    - Cross-attention transformer for text-speech alignment
    - Autoregressive transformer for causal speech generation
    - BCE-based end token prediction for dynamic duration control

    Architecture:
    - Text Embedding Layer: Maps token IDs to 4,096-dim embeddings
    - Mimi Encoder: Pre-trained audio encoder (see NOTE on freezing below)
    - Text Projection: Linear projection from 4,096 to 512 dimensions
    - Cross-Attention Transformer: Aligns text with speech features
    - Autoregressive Transformer: Generates speech representations
    - End Token Classifier: Predicts when to stop generating
    """

    def __init__(
        self,
        config: Optional[Union[MimiConfig, 'TextSyncMimiConfig']] = None,
        model_id: Optional[str] = None,
        token: Optional[str] = None,
        alpha: Optional[float] = None,
        cross_attention_layers: Optional[int] = None,
        causal_attention_layers: Optional[int] = None,
        bce_threshold: Optional[float] = None,
        vocab_size: Optional[int] = None,
    ):
        """
        Initialize TextSyncMimi model.

        Explicit keyword arguments always take precedence over the matching
        attribute on ``config``; unset arguments fall back to the config value
        or a hard-coded default.

        Args:
            config: Model configuration (TextSyncMimiConfig or MimiConfig)
            model_id: Mimi model ID (e.g., "kyutai/mimi"). If None, uses config.mimi_model_id
            token: Hugging Face authentication token
            alpha: Weight for BCE end token loss. If None, uses config.alpha
            cross_attention_layers: Number of cross-attention layers. If None, uses config
            causal_attention_layers: Number of autoregressive layers. If None, uses config
            bce_threshold: BCE loss margin (see forward()). If None, uses config.bce_threshold
            vocab_size: Text vocabulary size. If None, uses config.vocab_size

        Raises:
            ValueError: If neither ``config`` nor ``model_id`` is provided, or
                if no model id can be resolved from either source.
        """
        # Handle config initialization for both manual instantiation and from_pretrained
        if config is None:
            if model_id is None:
                raise ValueError("Either config or model_id must be provided")
            config = MimiConfig.from_pretrained(model_id, token=token)

        super().__init__(config)

        # Extract parameters from config if not explicitly provided
        if hasattr(config, 'mimi_model_id'):
            model_id = model_id or config.mimi_model_id
        if model_id is None:
            raise ValueError("model_id must be provided either as argument or in config.mimi_model_id")

        alpha = alpha if alpha is not None else getattr(config, 'alpha', 1.0)
        cross_attention_layers = cross_attention_layers if cross_attention_layers is not None else getattr(config, 'cross_attention_layers', 2)
        causal_attention_layers = causal_attention_layers if causal_attention_layers is not None else getattr(config, 'causal_attention_layers', 2)
        bce_threshold = bce_threshold if bce_threshold is not None else getattr(config, 'bce_threshold', 0.1)
        # 128256 matches the LLaMA-3 tokenizer vocabulary size -- presumably
        # chosen so LLaMA embeddings can be copied in; TODO confirm.
        vocab_size = vocab_size if vocab_size is not None else getattr(config, 'vocab_size', 128256)

        # load the mimi backbone (downloaded weights; requires network/cache access)
        self.config = config
        model = MimiModel.from_pretrained(model_id, token=token)

        # hyperparameters for auxiliary loss
        self.alpha = alpha
        self.bce_threshold = bce_threshold

        # Learnable text token embedding (4096-dim, LLaMA-compatible width)
        self.text_token_embedding = nn.Embedding(vocab_size, 4096)

        # Text projection: 4096 -> 512 (Mimi's latent width)
        self.text_proj = nn.Linear(4096, 512)

        # Cross-attention transformer: clones the Mimi config, then overrides
        # depth and width. NOTE(review): MimiConfig(**self.config.__dict__)
        # relies on every config attribute being a valid constructor kwarg.
        cross_attention_config = MimiConfig(**self.config.__dict__)
        cross_attention_config.num_hidden_layers = cross_attention_layers
        cross_attention_config.hidden_size = 512
        self.cross_attention_transformer = CrossAttentionTransformer(cross_attention_config)

        # decoder part (v1)
        # Auto-regressive decoder:
        # <|text_speech_latent|> [t_i] [s_i] <|time_speech_start|> [z_(i,1)] [z_(i,2)] ... [z_(i,K)] <|time_speech_end|>
        # masking (not computing loss for <|text_speech_latent|> [t_i] [s_i] <|time_speech_start|>
        # t_i already mapped from 4096 (e.g., llama embedding) -> 512
        # s_i already 512
        # z is mimi's decoder-input which is also 512
        causal_attention_config = MimiConfig(**self.config.__dict__)
        causal_attention_config.num_hidden_layers = causal_attention_layers
        causal_attention_config.hidden_size = 512
        self.ar_transformer = CausalAttentionTransformer(causal_attention_config)

        # embedding for special positions in the autoregressive decoder
        # (single-entry tables acting as learned special-token vectors)
        self.text_speech_latent_embed = nn.Embedding(1, 512)
        self.time_speech_start_embed = nn.Embedding(1, 512)
        self.time_speech_end_embed = nn.Embedding(1, 512)

        # Binary classification head for end token prediction
        self.end_token_classifier = nn.Linear(512, 1)

        self.post_init()

        # Mimi components attached AFTER post_init() -- presumably so weight
        # (re-)initialization does not overwrite the pre-trained Mimi weights;
        # TODO confirm this ordering is intentional.
        # NOTE(review): these are described as "frozen" but nothing here sets
        # requires_grad=False -- confirm freezing happens in the training setup.
        self.encoder = model.encoder
        self.encoder_transformer = model.encoder_transformer
        self.quantizer = model.quantizer
        self.downsample = model.downsample
        self.upsample = model.upsample

        # print the number of parameters for each sub network in Millions
        self._print_subnetwork_parameter_counts()
132
+ def initialize_text_embeddings_from_weights(self, embedding_weight: torch.Tensor) -> None:
133
+ """
134
+ Initialize text embeddings from a weight matrix.
135
+
136
+ Args:
137
+ embedding_weight: Weight matrix of shape (vocab_size, 4096)
138
+ """
139
+ if embedding_weight.dim() != 2 or embedding_weight.size(1) != 4096:
140
+ raise ValueError("embedding_weight must have shape (vocab_size, 4096)")
141
+ if embedding_weight.size(0) != self.text_token_embedding.num_embeddings:
142
+ raise ValueError("Provided vocab_size does not match model's text_token_embedding")
143
+ with torch.no_grad():
144
+ self.text_token_embedding.weight.copy_(embedding_weight)
145
+ for p in self.text_token_embedding.parameters():
146
+ p.requires_grad = True
147
+
148
+ def initialize_text_embeddings_from_llama(self, llama_embeddings_module: torch.nn.Module) -> None:
149
+ """
150
+ Initialize text embeddings from a LLaMA embedding module.
151
+
152
+ Args:
153
+ llama_embeddings_module: LLaMA embedding module with weight shape (vocab_size, 4096)
154
+ """
155
+ if not hasattr(llama_embeddings_module, 'weight'):
156
+ raise ValueError("llama_embeddings_module must have a 'weight' attribute")
157
+ weight = llama_embeddings_module.weight.data
158
+ self.initialize_text_embeddings_from_weights(weight)
159
+
160
+ def _print_subnetwork_parameter_counts(self) -> None:
161
+ """Print parameter counts for model subnetworks."""
162
+ print("=" * 70)
163
+ print("TextSyncMimi Parameter Counts")
164
+ print("=" * 70)
165
+ print(f"Encoder: {sum(p.numel() for p in self.encoder.parameters()) / 1e6:.2f}M")
166
+ print(f"Encoder Transformer: {sum(p.numel() for p in self.encoder_transformer.parameters()) / 1e6:.2f}M")
167
+ print(f"Cross-Attention Transformer: {sum(p.numel() for p in self.cross_attention_transformer.parameters()) / 1e6:.2f}M")
168
+ print(f"AR Transformer: {sum(p.numel() for p in self.ar_transformer.parameters()) / 1e6:.2f}M")
169
+ print(f"Quantizer: {sum(p.numel() for p in self.quantizer.parameters()) / 1e6:.2f}M")
170
+ print("=" * 70)
171
+
172
+ def encode_audio_to_representation(
173
+ self,
174
+ input_values: torch.Tensor,
175
+ audio_attention_mask: Optional[torch.Tensor] = None,
176
+ ) -> torch.Tensor:
177
+ """
178
+ Encode audio to speech representation.
179
+
180
+ Args:
181
+ input_values: Audio waveform (B, 1, audio_len)
182
+ audio_attention_mask: Attention mask (B, audio_len)
183
+
184
+ Returns:
185
+ Speech embeddings (B, 512, 12.5 * T)
186
+ """
187
+ batch_size = input_values.shape[0]
188
+ device = input_values.device
189
+
190
+ # Encode through Mimi encoder pipeline
191
+ embeddings = self.encoder(input_values)
192
+ encoder_outputs = self.encoder_transformer(embeddings.transpose(1, 2))
193
+ embeddings = encoder_outputs[0].transpose(1, 2)
194
+ embeddings = self.downsample(embeddings)
195
+
196
+ # Apply attention mask if provided
197
+ if audio_attention_mask is not None:
198
+ speech_seq_len = embeddings.shape[-1]
199
+ speech_attention_mask = torch.zeros(batch_size, speech_seq_len, device=device, dtype=torch.bool)
200
+
201
+ for b in range(batch_size):
202
+ actual_audio_len = audio_attention_mask[b].sum().item()
203
+ actual_speech_len = int(actual_audio_len * 12.5 / 24000)
204
+ actual_speech_len = min(actual_speech_len, speech_seq_len)
205
+ if actual_speech_len > 0:
206
+ speech_attention_mask[b, :actual_speech_len] = True
207
+
208
+ speech_mask_expanded = speech_attention_mask.unsqueeze(1)
209
+ embeddings = embeddings * speech_mask_expanded.float()
210
+
211
+ return embeddings
212
+
213
    def generate_autoregressive(
        self,
        text_token_ids: torch.LongTensor,
        input_values: Optional[torch.Tensor] = None,
        speech_embeddings: Optional[torch.Tensor] = None,
        audio_attention_mask: Optional[torch.Tensor] = None,
        speech_attention_mask: Optional[torch.Tensor] = None,
        text_attention_mask: Optional[torch.Tensor] = None,
        max_z_tokens: int = 50,
        end_token_threshold: float = 0.5,
        device: Optional[torch.device] = None,
    ) -> List[List[torch.Tensor]]:
        """
        Generate audio representations autoregressively.

        Puts the model in eval mode and runs entirely under ``torch.no_grad()``.
        For each text position the AR transformer emits continuous z tokens
        until the end-token classifier's sigmoid probability reaches
        ``end_token_threshold`` or ``max_z_tokens`` is hit.

        NOTE(review): each generation step re-runs the AR transformer over the
        whole sequence (no KV cache), so cost grows quadratically with length.

        Args:
            text_token_ids: Text token IDs (B, L)
            input_values: Audio input (B, 1, audio_len) - for normal mode
            speech_embeddings: Pre-computed speech embeddings (B, T, 512) - for cached mode
            audio_attention_mask: Audio mask (B, audio_seq_len) - for normal mode
            speech_attention_mask: Speech mask (B, speech_seq_len) - for cached mode
            text_attention_mask: Text mask (B, text_seq_len)
            max_z_tokens: Maximum z tokens per text position
            end_token_threshold: Probability threshold for stopping
            device: Device for computation (defaults to text_token_ids.device)

        Returns:
            List of z_tokens lists (one per batch item); each z token is a
            (512,) tensor.

        Raises:
            ValueError: If neither input_values nor speech_embeddings is given.
        """
        if device is None:
            device = text_token_ids.device

        self.eval()

        with torch.no_grad():
            # Get speech embeddings for cross-attention context
            if speech_embeddings is not None:
                # Use pre-computed speech embeddings (cached mode)
                # speech_embeddings should already be (B, T, 512)
                pass  # speech_embeddings is already provided
            else:
                # Compute speech embeddings from input_values (normal mode)
                if input_values is None:
                    raise ValueError("Either input_values or speech_embeddings must be provided")
                speech_embeddings = self.encode_audio_to_representation(
                    input_values,
                    audio_attention_mask=audio_attention_mask
                )
                speech_embeddings = speech_embeddings.transpose(1, 2)  # (B, T, 512)

            # Embed token ids then project to 512
            text_embeddings_4096 = self.text_token_embedding(text_token_ids)  # (B, L, 4096)
            text_embeddings_proj = self.text_proj(text_embeddings_4096)  # (B, L, 512)

            # Apply cross attention (same mask construction as in forward();
            # NOTE(review): this duplicates forward()'s mask-building code)
            formatted_text_attention_mask = None
            formatted_speech_attention_mask = None

            batch_size, text_seq_len = text_embeddings_proj.shape[:2]

            # Additive masks: 0.0 at attendable positions, -inf at masked ones.
            if text_attention_mask is not None:
                causal_mask = torch.tril(torch.ones(text_seq_len, text_seq_len, device=device, dtype=text_embeddings_proj.dtype))
                causal_mask = causal_mask.view(1, 1, text_seq_len, text_seq_len).expand(batch_size, -1, -1, -1)
                padding_mask = text_attention_mask.view(batch_size, 1, 1, text_seq_len)
                combined_mask = causal_mask * padding_mask
                formatted_text_attention_mask = torch.where(combined_mask.bool(), 0.0, float('-inf'))
            else:
                causal_mask = torch.tril(torch.ones(text_seq_len, text_seq_len, device=device, dtype=text_embeddings_proj.dtype))
                causal_mask = causal_mask.view(1, 1, text_seq_len, text_seq_len).expand(batch_size, -1, -1, -1)
                formatted_text_attention_mask = torch.where(causal_mask.bool(), 0.0, float('-inf'))

            # Handle speech attention mask (use speech_attention_mask if available, otherwise audio_attention_mask)
            if speech_attention_mask is not None:
                # For cached data, speech_attention_mask is already in the right format
                speech_seq_len = speech_embeddings.shape[1]
                speech_mask = speech_attention_mask.bool()
                formatted_speech_attention_mask = speech_mask.view(batch_size, 1, 1, speech_seq_len)
                formatted_speech_attention_mask = torch.where(formatted_speech_attention_mask, 0.0, float('-inf'))
            elif audio_attention_mask is not None:
                # For non-cached data, convert audio_attention_mask to speech_attention_mask
                # (sample counts -> frame counts at 12.5 frames/s, 24 kHz audio)
                speech_seq_len = speech_embeddings.shape[1]
                speech_mask = torch.zeros(batch_size, speech_seq_len, dtype=torch.bool, device=device)
                for b in range(batch_size):
                    audio_len = audio_attention_mask[b].sum().item()
                    speech_len = int(audio_len * 12.5 / 24000)
                    speech_len = min(speech_len, speech_seq_len)
                    speech_mask[b, :speech_len] = True
                formatted_speech_attention_mask = speech_mask.view(batch_size, 1, 1, speech_seq_len)
                formatted_speech_attention_mask = torch.where(formatted_speech_attention_mask, 0.0, float('-inf'))
            else:
                formatted_speech_attention_mask = None

            # Cross attention: text (decoder) attends to speech (encoder)
            cross_attention_outputs = self.cross_attention_transformer(
                hidden_states=text_embeddings_proj,
                encoder_hidden_states=speech_embeddings,
                attention_mask=formatted_text_attention_mask,
                encoder_attention_mask=formatted_speech_attention_mask,
                alignment_chunk_sizes=None,  # V1 learns alignment
            )
            cross_attention_outputs = cross_attention_outputs.last_hidden_state

            # Get special embeddings (index 0 of each single-entry table)
            text_speech_latent_emb = self.text_speech_latent_embed(torch.zeros(1, dtype=torch.long, device=device))
            time_speech_start_emb = self.time_speech_start_embed(torch.zeros(1, dtype=torch.long, device=device))
            time_speech_end_emb = self.time_speech_end_embed(torch.zeros(1, dtype=torch.long, device=device))

            generated_z_tokens = []

            # Generate for each batch item independently (no batched decoding)
            for b in range(batch_size):
                # Get valid text length for this sample
                if text_attention_mask is not None:
                    valid_text_len = text_attention_mask[b].sum().item()
                else:
                    valid_text_len = text_embeddings_proj.shape[1]

                # Start sequence with text_speech_latent for context
                sequence = [text_speech_latent_emb]  # (1, 512)
                batch_z_tokens = []  # Store z_tokens for this batch item

                # Generate for each text position, mirroring the training
                # layout: <latent> t_i s_i <start> z... <end>
                for i in range(valid_text_len):
                    # Add t_i and s_i
                    t_i = text_embeddings_proj[b, i:i+1]  # (1, 512)
                    s_i = cross_attention_outputs[b, i:i+1]  # (1, 512)
                    sequence.extend([t_i, s_i])

                    # Add time_speech_start
                    sequence.append(time_speech_start_emb)

                    # Generate z tokens autoregressively for this text position
                    z_count = 0
                    while z_count < max_z_tokens:
                        # Prepare current sequence for AR transformer
                        # (re-encodes the whole prefix every step)
                        current_sequence = torch.cat(sequence, dim=0).unsqueeze(0)  # (1, seq_len, 512)

                        # Create attention mask for current sequence (all valid)
                        seq_len = current_sequence.shape[1]
                        ar_attention_mask = torch.ones(1, seq_len, dtype=torch.bool, device=device)

                        # Get prediction from AR transformer
                        ar_outputs = self.ar_transformer(
                            hidden_states=current_sequence,
                            attention_mask=ar_attention_mask,
                        )

                        # Get the last prediction
                        last_prediction = ar_outputs.last_hidden_state[0, -1:, :]  # (1, 512)

                        # Check stopping condition using BCE classifier (v1.1)
                        end_token_logit = self.end_token_classifier(last_prediction).squeeze(-1)  # (1,)
                        end_token_prob = torch.sigmoid(end_token_logit).item()  # Convert to probability

                        # Stop if probability is high enough (>= threshold means stop)
                        if end_token_prob >= end_token_threshold:
                            # Stop generating z tokens
                            break
                        else:
                            # Add this prediction as next z token to both sequence (for context) and z_tokens (for output)
                            sequence.append(last_prediction)
                            batch_z_tokens.append(last_prediction.squeeze(0))  # Remove batch dimension for output
                            z_count += 1

                    # Add time_speech_end to sequence for context
                    sequence.append(time_speech_end_emb)

                # Store z_tokens for this batch item
                generated_z_tokens.append(batch_z_tokens)

        return generated_z_tokens
386
    def forward(
        self,
        text_token_ids: torch.LongTensor,
        input_values: Optional[torch.Tensor] = None,
        speech_embeddings: Optional[torch.Tensor] = None,
        # NOTE(review): default is None but the value is indexed unconditionally
        # below (alignment_chunk_sizes[b, i]) -- callers must always pass it.
        alignment_chunk_sizes: torch.Tensor = None,
        audio_attention_mask: Optional[torch.Tensor] = None,
        speech_attention_mask: Optional[torch.Tensor] = None,
        text_attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """
        Forward pass for training.

        Builds, per batch item, the interleaved teacher-forced sequence
        <latent> t_i s_i <start> z_1..z_k <end> (repeated per text token),
        pads to the batch max length, and trains the AR transformer with:
        - MSE on next-step z-token prediction (reconstruction_loss)
        - BCE on the end-token classifier over z and <end> positions

        Args:
            text_token_ids: Text token IDs (B, L)
            input_values: Audio input (B, 1, audio_len) - for normal mode
            speech_embeddings: Pre-computed speech embeddings (B, T, 512) - for cached mode
            alignment_chunk_sizes: Number of speech frames per text token (B, L); required.
            audio_attention_mask: Audio mask (B, audio_seq_len)
            speech_attention_mask: Speech mask (B, speech_seq_len)
            text_attention_mask: Text mask (B, text_seq_len)

        Returns:
            Dictionary with 'loss', 'reconstruction_loss', and 'bce_end_token_loss'

        Raises:
            ValueError: If neither input_values nor speech_embeddings is given.
        """
        # Get speech embeddings
        if speech_embeddings is not None:
            pass
        elif input_values is not None:
            # Normal mode: compute speech embeddings from input_values

            speech_embeddings_raw = self.encode_audio_to_representation(
                input_values,
                audio_attention_mask
            )
            # speech_embeddings_raw.shape = (B, 512, 12.5*T)
            # Transpose: [B, 512, 12.5*T] -> [B, 12.5*T, 512]
            speech_embeddings = speech_embeddings_raw.transpose(1, 2)
        else:
            raise ValueError("Either input_values or speech_embeddings must be provided")
        # Embed token ids and project to 512-dim
        text_embeddings_4096 = self.text_token_embedding(text_token_ids)  # (B, L, 4096)
        text_embeddings = self.text_proj(text_embeddings_4096)  # (B, L, 512)

        # Create proper attention masks for cross-attention
        # (additive masks: 0.0 = attend, -inf = masked;
        # NOTE(review): duplicated in generate_autoregressive)
        formatted_text_attention_mask = None
        formatted_speech_attention_mask = None

        # Handle text attention mask (causal mask for decoder)
        batch_size, text_seq_len = text_embeddings.shape[:2]

        if text_attention_mask is not None:
            # Create causal mask and apply padding mask
            causal_mask = torch.tril(torch.ones(text_seq_len, text_seq_len, device=text_embeddings.device, dtype=text_embeddings.dtype))
            causal_mask = causal_mask.view(1, 1, text_seq_len, text_seq_len).expand(batch_size, -1, -1, -1)

            # Apply padding mask to causal mask
            padding_mask = text_attention_mask.view(batch_size, 1, 1, text_seq_len)
            combined_mask = causal_mask * padding_mask

            # Convert to attention scores (-inf for masked positions)
            formatted_text_attention_mask = torch.where(combined_mask.bool(), 0.0, float('-inf'))
        else:
            # Create causal mask for all positions (no padding mask)
            causal_mask = torch.tril(torch.ones(text_seq_len, text_seq_len, device=text_embeddings.device, dtype=text_embeddings.dtype))
            causal_mask = causal_mask.view(1, 1, text_seq_len, text_seq_len).expand(batch_size, -1, -1, -1)
            formatted_text_attention_mask = torch.where(causal_mask.bool(), 0.0, float('-inf'))

        # Handle speech attention mask (encoder mask)
        # Use speech_attention_mask if available (cached mode), otherwise audio_attention_mask (normal mode)
        if speech_attention_mask is not None:
            # Cached mode: speech_attention_mask is already in the right format
            speech_seq_len = speech_embeddings.shape[1]
            speech_mask = speech_attention_mask.bool()

            # Convert to attention format: [batch_size, 1, 1, speech_seq_len]
            formatted_speech_attention_mask = speech_mask.view(batch_size, 1, 1, speech_seq_len)
            formatted_speech_attention_mask = torch.where(formatted_speech_attention_mask, 0.0, float('-inf'))
        elif audio_attention_mask is not None:
            # Normal mode: convert audio mask to speech embedding mask
            speech_seq_len = speech_embeddings.shape[1]

            # Create speech attention mask based on actual lengths
            # (sample counts -> frame counts at 12.5 frames/s, 24 kHz audio)
            speech_mask = torch.zeros(batch_size, speech_seq_len, dtype=torch.bool, device=speech_embeddings.device)

            for b in range(batch_size):
                audio_len = audio_attention_mask[b].sum().item()
                speech_len = int(audio_len * 12.5 / 24000)
                speech_len = min(speech_len, speech_seq_len)
                speech_mask[b, :speech_len] = True

            # Convert to attention format: [batch_size, 1, 1, speech_seq_len]
            formatted_speech_attention_mask = speech_mask.view(batch_size, 1, 1, speech_seq_len)
            formatted_speech_attention_mask = torch.where(formatted_speech_attention_mask, 0.0, float('-inf'))
        else:
            # No masking
            formatted_speech_attention_mask = None

        # Cross attention: text attends to speech (no alignment constraints in V1)
        # hidden_states (decoder) = text, encoder_hidden_states = speech
        cross_attention_outputs = self.cross_attention_transformer(
            hidden_states=text_embeddings,
            encoder_hidden_states=speech_embeddings,
            attention_mask=formatted_text_attention_mask,  # Causal mask for text (decoder)
            encoder_attention_mask=formatted_speech_attention_mask,  # Mask for speech (encoder)
            alignment_chunk_sizes=None,  # v1 doesn't use alignment_chunk_sizes -- the model should learn the alignment itself
        )
        cross_attention_outputs = cross_attention_outputs.last_hidden_state

        # Auto-regressive decoder part
        # Following v0.5 where the target is the dequantized Mimi decoder-input
        # Compute target representation = Mimi decoder-input (quantized->dequantized at 12.5*seconds)
        # 12.5*seconds => T
        # no_grad: the quantizer is used only to produce regression targets
        with torch.no_grad():
            embeddings_bct = speech_embeddings.transpose(1, 2)  # (B, 512, T)
            codes_kbt = self.quantizer.encode(embeddings_bct)  # [K, B, T]
            codes_bkt = codes_kbt.transpose(0, 1)  # [B, K, T]
            decoder_input_emb = self.quantizer.decode(codes_bkt)  # (B, 512, T)
            target_representation = decoder_input_emb.transpose(1, 2)  # (B, T, 512)

        # Build the interleaved sequence for the autoregressive decoder
        # as well as the mask for loss computation
        # Get special embeddings (all are single embeddings)
        device = text_embeddings.device
        text_speech_latent_emb = self.text_speech_latent_embed(torch.zeros(1, dtype=torch.long, device=device))  # (1, 512)
        time_speech_start_emb = self.time_speech_start_embed(torch.zeros(1, dtype=torch.long, device=device))  # (1, 512)
        time_speech_end_emb = self.time_speech_end_embed(torch.zeros(1, dtype=torch.long, device=device))  # (1, 512)

        batch_size = text_embeddings.shape[0]
        interleaved_sequences = []
        loss_masks = []
        bce_labels_batch = []  # BCE labels: 0 for z tokens, 1 for time_speech_end_emb
        bce_masks = []  # BCE mask: True for z tokens and time_speech_end_emb
        sequence_lengths = []  # Track actual sequence lengths before padding
        # NOTE(review): all_z_tokens is populated but never consumed in this
        # forward -- presumably a hook for a separation loss; confirm or remove.
        all_z_tokens = []  # Collect all valid z_tokens for separation loss
        max_total_length = 0

        for b in range(batch_size):
            # Start with text_speech_latent embedding
            sequence_parts = [text_speech_latent_emb]  # List to collect sequence parts
            loss_mask_parts = [False]  # Don't compute loss on special tokens
            bce_label_parts = [0]  # BCE labels (dummy for text_speech_latent_emb)
            bce_mask_parts = [False]  # BCE mask (False for text_speech_latent_emb)

            # Get valid text length for this batch item
            if text_attention_mask is not None:
                valid_text_len = text_attention_mask[b].sum().item()
            else:
                valid_text_len = text_embeddings.shape[1]

            # Track current position in target_representation
            speech_position = 0

            # For each text token
            for i in range(valid_text_len):
                # Add t_i (text embedding)
                t_i = text_embeddings[b, i:i+1]  # (1, 512)
                sequence_parts.append(t_i)
                loss_mask_parts.append(False)
                bce_label_parts.append(0)  # Dummy label for t_i
                bce_mask_parts.append(False)  # No BCE loss for t_i

                # Add s_i (cross attention output)
                s_i = cross_attention_outputs[b, i:i+1]  # (1, 512)
                sequence_parts.append(s_i)
                loss_mask_parts.append(False)
                bce_label_parts.append(0)  # Dummy label for s_i
                bce_mask_parts.append(False)  # No BCE loss for s_i

                # Add time_speech_start
                sequence_parts.append(time_speech_start_emb)
                loss_mask_parts.append(False)
                bce_label_parts.append(0)  # Dummy label for time_speech_start
                bce_mask_parts.append(False)  # No BCE loss for time_speech_start

                # Add z tokens for this chunk
                chunk_size = alignment_chunk_sizes[b, i].item()
                if chunk_size > 0:  # Only add if chunk size is positive
                    end_position = speech_position + chunk_size
                    # Make sure we don't exceed target_representation length
                    end_position = min(end_position, target_representation.shape[1])
                    actual_chunk_size = end_position - speech_position

                    if actual_chunk_size > 0:
                        z_tokens = target_representation[b, speech_position:end_position]  # (actual_chunk_size, 512)
                        sequence_parts.append(z_tokens)
                        loss_mask_parts.extend([True] * actual_chunk_size)  # Compute loss on z tokens
                        bce_label_parts.extend([0] * actual_chunk_size)  # Label 0 for z tokens
                        bce_mask_parts.extend([True] * actual_chunk_size)  # Compute BCE loss on z tokens

                        # Collect z_tokens for separation loss computation
                        all_z_tokens.append(z_tokens)

                    speech_position = end_position

                # Add time_speech_end (BCE positive class: "stop here")
                sequence_parts.append(time_speech_end_emb)
                loss_mask_parts.append(False)
                bce_label_parts.append(1)
                bce_mask_parts.append(True)

            # Concatenate all parts for this batch item
            full_sequence = torch.cat(sequence_parts, dim=0)  # (total_length, 512)
            loss_mask = torch.tensor(loss_mask_parts, dtype=torch.bool, device=device)
            bce_labels = torch.tensor(bce_label_parts, dtype=torch.float, device=device)
            bce_mask = torch.tensor(bce_mask_parts, dtype=torch.bool, device=device)

            interleaved_sequences.append(full_sequence)
            loss_masks.append(loss_mask)
            bce_labels_batch.append(bce_labels)
            bce_masks.append(bce_mask)
            sequence_lengths.append(full_sequence.shape[0])  # Track actual length before padding
            max_total_length = max(max_total_length, full_sequence.shape[0])

        # Pad sequences (right-pad with zeros; padded positions excluded from
        # every loss via the masks below)
        padded_sequences = []
        padded_loss_masks = []
        padded_bce_labels = []
        padded_bce_masks = []

        for sequence, loss_mask, bce_labels, bce_mask in zip(interleaved_sequences, loss_masks, bce_labels_batch, bce_masks):
            current_length = sequence.shape[0]
            if current_length < max_total_length:
                padding = torch.zeros(max_total_length - current_length, 512, device=device, dtype=sequence.dtype)
                padded_sequence = torch.cat([sequence, padding], dim=0)

                mask_padding = torch.zeros(max_total_length - current_length, dtype=torch.bool, device=device)
                padded_mask = torch.cat([loss_mask, mask_padding], dim=0)

                bce_label_padding = torch.zeros(max_total_length - current_length, dtype=torch.float, device=device)
                padded_bce_label = torch.cat([bce_labels, bce_label_padding], dim=0)

                bce_mask_padding = torch.zeros(max_total_length - current_length, dtype=torch.bool, device=device)
                padded_bce_mask = torch.cat([bce_mask, bce_mask_padding], dim=0)
            else:
                padded_sequence = sequence
                padded_mask = loss_mask
                padded_bce_label = bce_labels
                padded_bce_mask = bce_mask

            padded_sequences.append(padded_sequence)
            padded_loss_masks.append(padded_mask)
            padded_bce_labels.append(padded_bce_label)
            padded_bce_masks.append(padded_bce_mask)

        # Stack into batch tensors
        interleaved_batch = torch.stack(padded_sequences, dim=0)  # (batch_size, max_total_length, 512)
        loss_mask_batch = torch.stack(padded_loss_masks, dim=0)  # (batch_size, max_total_length)
        bce_labels_batch_tensor = torch.stack(padded_bce_labels, dim=0)  # (batch_size, max_total_length)
        bce_mask_batch = torch.stack(padded_bce_masks, dim=0)  # (batch_size, max_total_length)

        # Autoregressive prediction (standard shift-by-one teacher forcing)
        if max_total_length > 1:
            ar_input = interleaved_batch[:, :-1, :]  # (batch_size, max_total_length-1, 512)
            ar_targets = interleaved_batch[:, 1:, :]  # (batch_size, max_total_length-1, 512)
            ar_loss_mask = loss_mask_batch[:, 1:]  # (batch_size, max_total_length-1) - shift mask left
            ar_bce_labels = bce_labels_batch_tensor[:, 1:]  # (batch_size, max_total_length-1) - shift labels left
            ar_bce_mask = bce_mask_batch[:, 1:]  # (batch_size, max_total_length-1) - shift mask left

            # Create attention mask for autoregressive transformer
            # We need to mask padded positions while maintaining causal property
            ar_seq_len = ar_input.shape[1]
            ar_attention_mask = torch.zeros(batch_size, ar_seq_len, dtype=torch.bool, device=device)
            for b in range(batch_size):
                valid_len = min(ar_seq_len, sequence_lengths[b] - 1)
                if valid_len > 0:
                    ar_attention_mask[b, :valid_len] = True

            ar_outputs = self.ar_transformer(
                hidden_states=ar_input,
                attention_mask=ar_attention_mask,  # This will be combined with causal mask inside transformer
            )
            ar_predictions = ar_outputs.last_hidden_state  # (batch_size, max_total_length-1, 512)

            # Compute BCE predictions for end token classification
            bce_logits = self.end_token_classifier(ar_predictions).squeeze(-1)  # (batch_size, max_total_length-1)

            # Compute L2 loss only where ar_loss_mask is True (z tokens)
            if ar_loss_mask.any():
                # Extract valid positions for loss computation
                valid_predictions = ar_predictions[ar_loss_mask]  # (num_valid_positions, 512)
                valid_targets = ar_targets[ar_loss_mask]  # (num_valid_positions, 512)

                # Compute L2 loss (MSE)
                reconstruction_loss = nn.functional.mse_loss(
                    valid_predictions,
                    valid_targets,
                    reduction='mean'
                )
            else:
                # Fallback if no valid positions (shouldn't happen in practice)
                reconstruction_loss = torch.tensor(0.0, device=device, requires_grad=True)

            # Compute BCE loss for end token classification (v1.1)
            if ar_bce_mask.any():
                # Extract valid positions for BCE loss computation
                valid_bce_logits = bce_logits[ar_bce_mask]  # (num_valid_bce_positions,)
                valid_bce_labels = ar_bce_labels[ar_bce_mask]  # (num_valid_bce_positions,)

                # Compute BCE loss
                bce_end_token_loss = nn.functional.binary_cross_entropy_with_logits(
                    valid_bce_logits,
                    valid_bce_labels,
                    reduction='mean'
                )
            else:
                # Fallback if no valid BCE positions
                bce_end_token_loss = torch.tensor(0.0, device=device, requires_grad=True)

            # NOTE(review): despite its name, bce_threshold acts as a margin
            # ("free bits"): BCE below the threshold contributes zero gradient.
            if self.bce_threshold > 0.0:
                clamped_bce_loss = torch.clamp(bce_end_token_loss - self.bce_threshold, min=0.0)
                total_loss = reconstruction_loss + self.alpha * clamped_bce_loss
            else:
                total_loss = reconstruction_loss + self.alpha * bce_end_token_loss
        else:
            # Degenerate batch (sequence too short to shift) -> zero losses
            reconstruction_loss = torch.tensor(0.0, device=device, requires_grad=True)
            bce_end_token_loss = torch.tensor(0.0, device=device, requires_grad=True)
            total_loss = reconstruction_loss + torch.tensor(0.0, device=device, requires_grad=True)

        return {
            'loss': total_loss,
            'reconstruction_loss': reconstruction_loss,
            'bce_end_token_loss': bce_end_token_loss,
        }
712
+
713
# Public API of this module.
__all__ = ["TextSyncMimi"]