Upload configuration_text_sync_mimi.py with huggingface_hub
Browse files
configuration_text_sync_mimi.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""TextSyncMimi model configuration"""
|
| 2 |
+
|
| 3 |
+
from transformers.utils import logging
|
| 4 |
+
from configuration_mimi import MimiConfig
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Module-level logger, following the transformers naming convention.
logger = logging.get_logger(__name__)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TextSyncMimiConfig(MimiConfig):
    r"""
    Configuration class for a [`TextSyncMimi`] model.

    Instantiating a configuration with the defaults yields a TextSyncMimi
    architecture built on top of the Mimi audio codec. Every [`MimiConfig`]
    parameter is inherited; the arguments documented below are the
    TextSyncMimi-specific additions.

    Args:
        mimi_model_id (`str`, *optional*, defaults to `"kyutai/mimi"`):
            Hub ID of the Mimi model used as the audio codec backbone.
        vocab_size (`int`, *optional*, defaults to 128256):
            Size of the text tokenizer vocabulary (LLaMA-3 tokenizer).
        alpha (`float`, *optional*, defaults to 1.0):
            Weight applied to the BCE end-token loss within the total loss.
        cross_attention_layers (`int`, *optional*, defaults to 4):
            Number of cross-attention transformer layers used for
            text-speech alignment.
        causal_attention_layers (`int`, *optional*, defaults to 4):
            Number of causal-attention transformer layers used for
            autoregressive generation.
        bce_threshold (`float`, *optional*, defaults to 0.1):
            Stop optimizing the BCE term once it drops below this value.
        end_token_threshold (`float`, *optional*, defaults to 0.5):
            BCE probability threshold at which generation stops
            (probability >= threshold means stop).
        max_z_tokens (`int`, *optional*, defaults to 50):
            Upper bound on z tokens generated per text position during
            inference.
        text_embedding_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the text embeddings (LLaMA embedding size).
        **kwargs:
            Forwarded verbatim to [`MimiConfig`] (e.g. `hidden_size`,
            `sample_rate`).

    Example:

    ```python
    >>> from configuration_text_sync_mimi import TextSyncMimiConfig

    >>> # Build a configuration with default values
    >>> configuration = TextSyncMimiConfig()
    ```
    """

    model_type = "text_sync_mimi"

    def __init__(
        self,
        mimi_model_id: str = "kyutai/mimi",
        vocab_size: int = 128256,
        alpha: float = 1.0,
        cross_attention_layers: int = 4,
        causal_attention_layers: int = 4,
        bce_threshold: float = 0.1,
        end_token_threshold: float = 0.5,
        max_z_tokens: int = 50,
        text_embedding_dim: int = 4096,
        **kwargs,
    ):
        # Let MimiConfig consume its own keyword arguments first so every
        # inherited attribute is populated before the subclass fields.
        super().__init__(**kwargs)

        # Backbone and text-side wiring.
        self.mimi_model_id = mimi_model_id
        self.vocab_size = vocab_size
        self.text_embedding_dim = text_embedding_dim

        # Alignment / generation architecture.
        self.cross_attention_layers = cross_attention_layers
        self.causal_attention_layers = causal_attention_layers

        # Loss weighting and stopping criteria.
        self.alpha = alpha
        self.bce_threshold = bce_threshold
        self.end_token_threshold = end_token_threshold
        self.max_z_tokens = max_z_tokens
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# Explicit public API for `from <module> import *`.
__all__ = ["TextSyncMimiConfig"]
|
| 88 |
+
|