AstraMindAI
/

xttsv2

xtts

custom_code

Model card Files Files and versions

xet

Community

mlinmg committed on Oct 28, 2024

Commit

2ebc88c

verified ·

1 Parent(s): d195c37

Upload 2 files

Browse files

Files changed (2) hide show

config.json +139 -48
xtts2_config.py +152 -357

config.json CHANGED Viewed

@@ -1,61 +1,152 @@
 {
-  "_name_or_path": "AstraMindAI/xtts2",
   "architectures": [
-    "Xtts"
   ],
   "torch_dtype": "float32",
   "auto_map": {
-    "AutoConfig": "AstraMindAI/xtts2--xtts2_config.XTTSConfig",
-    "AutoModelForCausalLM": "AstraMindAI/xtts2--xtts2_modeling.Xtts"
   },
   "cond_d_vector_in_each_upsampling_layer": true,
   "d_vector_dim": 512,
   "decoder_input_dim": 1024,
   "input_sample_rate": 22050,
-  "model_type": "xtts_hifigan",
   "output_hop_length": 256,
   "output_sample_rate": 24000,
-  "resblock_dilation_sizes": [
-    [
-      1,
-      3,
-      5
-    ],
-    [
-      1,
-      3,
-      5
-    ],
-    [
-      1,
-      3,
-      5
-    ]
-  ],
-  "resblock_kernel_sizes": [
-    3,
-    7,
-    11
-  ],
-  "speaker_encoder_config": {
-    "model_config": null,
-    "model_name": "speaker_encoder",
-    "preprocess_config": null,
-    "speaker_embedding_dim": 512,
-    "use_torch_spec": true
-  },
-  "transformers_version": "4.45.1",
-  "upsample_initial_channel": 512,
-  "upsample_kernel_sizes": [
-    16,
-    16,
-    4,
-    4
-  ],
-  "upsample_rates": [
-    8,
-    8,
-    2,
-    2
-  ]
 }

 {
+     "_name_or_path": "AstraMindAI/xtts2-gpt",
   "architectures": [
+    "XttsGPT"
   ],
   "torch_dtype": "float32",
   "auto_map": {
+    "AutoConfig": "AstraMindAI/xtts2-gpt--gpt_config.XTTSGPTConfig",
+    "AutoModelForCausalLM": "AstraMindAI/xtts2-gpt--xtts2_gpt_modeling.XttsGPT",
+    "AutoTokenizer": "AstraMindAI/xtts2-gpt--tokenizer.XTTSTokenizerFast"
+  },
+  "audio_config": {
+    "fmax": 8000,
+    "fmin": 0,
+    "hop_length": 256,
+    "mel_channels": 80,
+    "mel_norms_file": null,
+    "n_fft": 1024,
+    "output_sample_rate": 24000,
+    "power": 1.0,
+    "sample_rate": 22050,
+    "win_length": 1024
   },
   "cond_d_vector_in_each_upsampling_layer": true,
   "d_vector_dim": 512,
   "decoder_input_dim": 1024,
+  "duration_const": 102400,
+  "gpt": {
+    "model_type": "xtts_gpt"
+  },
+  "gpt_code_stride_len": 1024,
+  "gpt_config": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "audio_config": {
+      "mel_channels": 80,
+      "output_sample_rate": 24000,
+      "sample_rate": 22050
+    },
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_input_dim": 1024,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "enable_redaction": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "gpt_batch_size": 1,
+    "gpt_max_audio_tokens": 605,
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "kv_cache": true,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_epsilon": 1e-05,
+    "length_penalty": 1.0,
+    "max_audio_tokens": 605,
+    "max_length": 20,
+    "max_prompt_tokens": 70,
+    "max_text_tokens": 402,
+    "min_length": 0,
+    "model_type": "xtts_gpt",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_audio_tokens": 1026,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 30,
+    "num_return_sequences": 1,
+    "number_text_tokens": 6681,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "reorder_and_upcast_attn": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_attn_by_inverse_layer_idx": false,
+    "sep_token_id": null,
+    "start_audio_token": 1024,
+    "start_text_token": null,
+    "stop_audio_token": 1025,
+    "stop_text_token": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.46.0",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_masking_gt_prompt_approach": true,
+    "use_perceiver_resampler": true,
+    "vocab_size": 6681
+  },
   "input_sample_rate": 22050,
+  "languages": [
+    "en",
+    "es",
+    "fr",
+    "de",
+    "it",
+    "pt",
+    "pl",
+    "tr",
+    "ru",
+    "nl",
+    "cs",
+    "ar",
+    "zh-cn",
+    "hu",
+    "ko",
+    "ja",
+    "hi"
+  ],
+  "model_type": "xtts",
+  "num_chars": 255,
   "output_hop_length": 256,
   "output_sample_rate": 24000,
+  "tokenizer_file": "",
+  "transformers_version": "4.46.0"
 }

xtts2_config.py CHANGED Viewed

@@ -1,17 +1,120 @@
 from dataclasses import asdict, dataclass
-from typing import Dict, List, Optional
 from transformers.configuration_utils import PretrainedConfig
 @dataclass
-class SpeakerEncoderConfig:
-    """Configuration for the speaker encoder component"""
-    model_name: str = "speaker_encoder"
-    preprocess_config: Optional[Dict] = None
-    model_config: Optional[Dict] = None
-    speaker_embedding_dim: int = 512
-    use_torch_spec: bool = True
 @dataclass
 class XTTSAudioConfig:
@@ -29,390 +132,82 @@ class XTTSAudioConfig:
 class XTTSConfig(PretrainedConfig):
-    """Combined configuration class for XTTS including both HifiGAN and GPT components"""
     model_type = "xtts"
     def __init__(
             self,
-            # HifiGAN Audio parameters
             input_sample_rate: int = 22050,
             output_sample_rate: int = 24000,
             output_hop_length: int = 256,
-            # HifiGAN Model architecture
             decoder_input_dim: int = 1024,
             d_vector_dim: int = 512,
             cond_d_vector_in_each_upsampling_layer: bool = True,
-            # HifiGAN Upsampling parameters
-            upsample_rates: List[int] = None,
-            upsample_kernel_sizes: List[int] = None,
-            upsample_initial_channel: int = 512,
-            # HifiGAN Resblock parameters
-            resblock_kernel_sizes: List[int] = None,
-            resblock_dilation_sizes: List[List[int]] = None,
-            # HifiGAN Speaker encoder
-            speaker_encoder_config: Optional[Dict] = None,
-            # GPT Model architecture
-            vocab_size: int = 256,
             num_chars: int = 255,
-            # GPT parameters
-            gpt_batch_size: int = 1,
-            gpt_max_audio_tokens: int = 605,
-            gpt_max_text_tokens: int = 402,
-            gpt_max_prompt_tokens: int = 70,
-            gpt_layers: int = 30,
-            gpt_n_model_channels: int = 1024,
-            gpt_n_heads: int = 16,
-            gpt_number_text_tokens: int = 6681,
-            gpt_start_text_token: Optional[int] = None,
-            gpt_stop_text_token: Optional[int] = None,
-            gpt_num_audio_tokens: int = 1026,
-            gpt_start_audio_token: int = 1024,
-            gpt_stop_audio_token: int = 1025,
-            gpt_code_stride_len: int = 1024,
-            gpt_use_masking_gt_prompt_approach: bool = True,
-            gpt_use_perceiver_resampler: bool = True,
-            gpt_checkpointing: bool = False,
-            gpt_train_solo_embeddings: bool = False,
-            # GPT Training parameters
-            enable_redaction: bool = False,
-            kv_cache: bool = True,
-            perceiver_cond_length_compression: int = 256,
-            label_smoothing: float = 0.0,
-            # GPT Generation parameters
-            temperature: float = 0.75,
-            length_penalty: float = 1.0,
-            repetition_penalty: float = 5.0,
-            top_k: int = 50,
-            top_p: float = 0.85,
-            gpt_cond_len: int = 30,
-            gpt_cond_chunk_len: int = 4,
-            max_ref_len: int = 30,
-            sound_norm_refs: bool = False,
-            # GPT Audio processing
-            audio_config: Optional[XTTSAudioConfig] = None,
-            # GPT Constants and limits
-            duration_const: int = 102400,
-            char_limits: Optional[Dict[str, int]] = None,
             languages: Optional[List[str]] = None,
-            # Base config parameters
-            pad_token_id: Optional[int] = None,
-            bos_token_id: Optional[int] = None,
-            eos_token_id: Optional[int] = None,
-            **kwargs,
-    ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            **kwargs
-        )
-        # Set default lists for HifiGAN
-        if upsample_rates is None:
-            upsample_rates = [8, 8, 2, 2]
-        if upsample_kernel_sizes is None:
-            upsample_kernel_sizes = [16, 16, 4, 4]
-        if resblock_kernel_sizes is None:
-            resblock_kernel_sizes = [3, 7, 11]
-        if resblock_dilation_sizes is None:
-            resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-        # Set default dicts for GPT
-        if char_limits is None:
-            char_limits = {
-                "en": 250, "de": 253, "fr": 273, "es": 239,
-                "it": 213, "pt": 203, "pl": 224, "zh": 82,
-                "ar": 166, "cs": 186, "ru": 182, "nl": 251,
-                "tr": 226, "ja": 71, "hu": 224, "ko": 95,
-            }
-        if languages is None:
-            languages = [
-                "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
-                "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"
-            ]
-        # Initialize HifiGAN parameters
-        # Audio parameters
         self.input_sample_rate = input_sample_rate
         self.output_sample_rate = output_sample_rate
         self.output_hop_length = output_hop_length
-        # Model architecture
         self.decoder_input_dim = decoder_input_dim
         self.d_vector_dim = d_vector_dim
         self.cond_d_vector_in_each_upsampling_layer = cond_d_vector_in_each_upsampling_layer
-        # Upsampling parameters
-        self.upsample_rates = upsample_rates
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.upsample_initial_channel = upsample_initial_channel
-        # Resblock parameters
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        # Speaker encoder - store as dictionary
-        if speaker_encoder_config is None:
-            self.speaker_encoder_config = asdict(SpeakerEncoderConfig())
-        elif isinstance(speaker_encoder_config, dict):
-            default_config = asdict(SpeakerEncoderConfig())
-            default_config.update(speaker_encoder_config)
-            self.speaker_encoder_config = default_config
-        elif isinstance(speaker_encoder_config, SpeakerEncoderConfig):
-            self.speaker_encoder_config = asdict(speaker_encoder_config)
-        else:
-            raise ValueError("speaker_encoder_config must be either a dictionary or SpeakerEncoderConfig instance")
-        # Initialize GPT parameters
-        self.vocab_size = vocab_size
         self.num_chars = num_chars
-        # GPT model parameters
-        self.gpt_batch_size = gpt_batch_size
-        self.gpt_max_audio_tokens = gpt_max_audio_tokens
-        self.gpt_max_text_tokens = gpt_max_text_tokens
-        self.gpt_max_prompt_tokens = gpt_max_prompt_tokens
-        self.gpt_layers = gpt_layers
-        self.gpt_n_model_channels = gpt_n_model_channels
-        self.gpt_n_heads = gpt_n_heads
-        self.gpt_number_text_tokens = gpt_number_text_tokens
-        self.gpt_start_text_token = gpt_start_text_token
-        self.gpt_stop_text_token = gpt_stop_text_token
-        self.gpt_num_audio_tokens = gpt_num_audio_tokens
-        self.gpt_start_audio_token = gpt_start_audio_token
-        self.gpt_stop_audio_token = gpt_stop_audio_token
-        self.gpt_code_stride_len = gpt_code_stride_len
-        self.gpt_use_masking_gt_prompt_approach = gpt_use_masking_gt_prompt_approach
-        self.gpt_use_perceiver_resampler = gpt_use_perceiver_resampler
-        self.gpt_checkpointing = gpt_checkpointing
-        self.gpt_train_solo_embeddings = gpt_train_solo_embeddings
-        # Training parameters
-        self.enable_redaction = enable_redaction
-        self.kv_cache = kv_cache
-        self.perceiver_cond_length_compression = perceiver_cond_length_compression
-        self.label_smoothing = label_smoothing
-        # Generation parameters
-        self.temperature = temperature
-        self.length_penalty = length_penalty
-        self.repetition_penalty = repetition_penalty
-        self.top_k = top_k
-        self.top_p = top_p
-        self.gpt_cond_len = gpt_cond_len
-        self.gpt_cond_chunk_len = gpt_cond_chunk_len
-        self.max_ref_len = max_ref_len
-        self.sound_norm_refs = sound_norm_refs
-        # Audio processing
-        if audio_config is None:
-            audio_config = XTTSAudioConfig()
-        elif isinstance(audio_config, dict):
-            audio_config = XTTSAudioConfig(**audio_config)
-        self.audio_config = audio_config
-        # Constants and limits
-        self.duration_const = duration_const
-        self.char_limits = char_limits
-        self.languages = languages
     def to_dict(self) -> Dict:
-        """Convert the config to a dictionary format."""
-        # Get parent class dict
         output = super().to_dict()
-        # Add all attributes
-        output.update({
-            # HifiGAN parameters
-            "input_sample_rate": self.input_sample_rate,
-            "output_sample_rate": self.output_sample_rate,
-            "output_hop_length": self.output_hop_length,
-            "decoder_input_dim": self.decoder_input_dim,
-            "d_vector_dim": self.d_vector_dim,
-            "cond_d_vector_in_each_upsampling_layer": self.cond_d_vector_in_each_upsampling_layer,
-            "upsample_rates": self.upsample_rates,
-            "upsample_kernel_sizes": self.upsample_kernel_sizes,
-            "upsample_initial_channel": self.upsample_initial_channel,
-            "resblock_kernel_sizes": self.resblock_kernel_sizes,
-            "resblock_dilation_sizes": self.resblock_dilation_sizes,
-            "speaker_encoder_config": self.speaker_encoder_config,
-            # GPT parameters
-            "vocab_size": self.vocab_size,
-            "num_chars": self.num_chars,
-            "gpt_batch_size": self.gpt_batch_size,
-            "gpt_max_audio_tokens": self.gpt_max_audio_tokens,
-            "gpt_max_text_tokens": self.gpt_max_text_tokens,
-            "gpt_max_prompt_tokens": self.gpt_max_prompt_tokens,
-            "gpt_layers": self.gpt_layers,
-            "gpt_n_model_channels": self.gpt_n_model_channels,
-            "gpt_n_heads": self.gpt_n_heads,
-            "gpt_number_text_tokens": self.gpt_number_text_tokens,
-            "gpt_start_text_token": self.gpt_start_text_token,
-            "gpt_stop_text_token": self.gpt_stop_text_token,
-            "gpt_num_audio_tokens": self.gpt_num_audio_tokens,
-            "gpt_start_audio_token": self.gpt_start_audio_token,
-            "gpt_stop_audio_token": self.gpt_stop_audio_token,
-            "gpt_code_stride_len": self.gpt_code_stride_len,
-            "gpt_use_masking_gt_prompt_approach": self.gpt_use_masking_gt_prompt_approach,
-            "gpt_use_perceiver_resampler": self.gpt_use_perceiver_resampler,
-            "gpt_checkpointing": self.gpt_checkpointing,
-            "gpt_train_solo_embeddings": self.gpt_train_solo_embeddings,
-            "enable_redaction": self.enable_redaction,
-            "kv_cache": self.kv_cache,
-            "perceiver_cond_length_compression": self.perceiver_cond_length_compression,
-            "label_smoothing": self.label_smoothing,
-            "temperature": self.temperature,
-            "length_penalty": self.length_penalty,
-            "repetition_penalty": self.repetition_penalty,
-            "top_k": self.top_k,
-            "top_p": self.top_p,
-            "gpt_cond_len": self.gpt_cond_len,
-            "gpt_cond_chunk_len": self.gpt_cond_chunk_len,
-            "max_ref_len": self.max_ref_len,
-            "sound_norm_refs": self.sound_norm_refs,
-            "audio_config": asdict(self.audio_config),
-            "duration_const": self.duration_const,
-            "char_limits": self.char_limits,
-            "languages": self.languages,
-        })
         return output
     @classmethod
-    def from_dict(cls, config_dict: Dict) -> "XTTSConfig":
-        """Create a config instance from a dictionary."""
-        config_copy = config_dict.copy()
-        # Handle special nested configs
-        if "audio_config" in config_copy:
-            config_copy["audio_config"] = XTTSAudioConfig(**config_copy["audio_config"])
-        return cls(**config_copy)
-    def get_speaker_encoder_config(self) -> SpeakerEncoderConfig:
-        """Get speaker encoder config as a SpeakerEncoderConfig instance"""
-        return SpeakerEncoderConfig(**self.speaker_encoder_config)
-    def update_with_tokenizer(self, tokenizer=None):
-        """Update configuration values based on tokenizer"""
-        if tokenizer is not None:
-            self.gpt_number_text_tokens = tokenizer.get_vocab_size()
-            self.gpt_start_text_token = tokenizer.bos_token_id
-            self.gpt_stop_text_token = tokenizer.eos_token_id
-            self.vocab_size = tokenizer.get_vocab_size()
-            self.pad_token_id = tokenizer.pad_token_id
-            self.bos_token_id = tokenizer.bos_token_id
-            self.eos_token_id = tokenizer.eos_token_id
-    def get_hifigan_config(self) -> Dict:
-        """Extract HiFiGAN-specific configuration"""
-        return {
-            "input_sample_rate": self.input_sample_rate,
-            "output_sample_rate": self.output_sample_rate,
-            "output_hop_length": self.output_hop_length,
-            "decoder_input_dim": self.decoder_input_dim,
-            "d_vector_dim": self.d_vector_dim,
-            "cond_d_vector_in_each_upsampling_layer": self.cond_d_vector_in_each_upsampling_layer,
-            "upsample_rates": self.upsample_rates,
-            "upsample_kernel_sizes": self.upsample_kernel_sizes,
-            "upsample_initial_channel": self.upsample_initial_channel,
-            "resblock_kernel_sizes": self.resblock_kernel_sizes,
-            "resblock_dilation_sizes": self.resblock_dilation_sizes,
-            "speaker_encoder_config": self.speaker_encoder_config
-        }
-    def get_gpt_config(self) -> Dict:
-        """Extract GPT-specific configuration"""
-        return {
-            "vocab_size": self.vocab_size,
-            "num_chars": self.num_chars,
-            "gpt_batch_size": self.gpt_batch_size,
-            "gpt_max_audio_tokens": self.gpt_max_audio_tokens,
-            "gpt_max_text_tokens": self.gpt_max_text_tokens,
-            "gpt_max_prompt_tokens": self.gpt_max_prompt_tokens,
-            "gpt_layers": self.gpt_layers,
-            "gpt_n_model_channels": self.gpt_n_model_channels,
-            "gpt_n_heads": self.gpt_n_heads,
-            "gpt_number_text_tokens": self.gpt_number_text_tokens,
-            "gpt_start_text_token": self.gpt_start_text_token,
-            "gpt_stop_text_token": self.gpt_stop_text_token,
-            "gpt_num_audio_tokens": self.gpt_num_audio_tokens,
-            "gpt_start_audio_token": self.gpt_start_audio_token,
-            "gpt_stop_audio_token": self.gpt_stop_audio_token,
-            "gpt_code_stride_len": self.gpt_code_stride_len,
-            "gpt_use_masking_gt_prompt_approach": self.gpt_use_masking_gt_prompt_approach,
-            "gpt_use_perceiver_resampler": self.gpt_use_perceiver_resampler,
-            "gpt_checkpointing": self.gpt_checkpointing,
-            "gpt_train_solo_embeddings": self.gpt_train_solo_embeddings,
-            "enable_redaction": self.enable_redaction,
-            "kv_cache": self.kv_cache,
-            "perceiver_cond_length_compression": self.perceiver_cond_length_compression,
-            "label_smoothing": self.label_smoothing,
-            "audio_config": self.audio_config,
-            "pad_token_id": self.pad_token_id,
-            "bos_token_id": self.bos_token_id,
-            "eos_token_id": self.eos_token_id
-        }
-    def get_generation_config(self) -> Dict:
-        """Extract generation-specific configuration"""
-        return {
-            "temperature": self.temperature,
-            "length_penalty": self.length_penalty,
-            "repetition_penalty": self.repetition_penalty,
-            "top_k": self.top_k,
-            "top_p": self.top_p,
-            "gpt_cond_len": self.gpt_cond_len,
-            "gpt_cond_chunk_len": self.gpt_cond_chunk_len,
-            "max_ref_len": self.max_ref_len,
-            "sound_norm_refs": self.sound_norm_refs
-        }
-    def validate(self):
-        """Validate configuration values"""
-        if self.gpt_max_text_tokens <= 0:
-            raise ValueError("gpt_max_text_tokens must be positive")
-        if self.gpt_max_audio_tokens <= 0:
-            raise ValueError("gpt_max_audio_tokens must be positive")
-        if self.gpt_layers <= 0:
-            raise ValueError("gpt_layers must be positive")
-        if self.gpt_n_heads <= 0:
-            raise ValueError("gpt_n_heads must be positive")
-        if self.gpt_n_model_channels <= 0:
-            raise ValueError("gpt_n_model_channels must be positive")
-        if len(self.upsample_rates) != len(self.upsample_kernel_sizes):
-            raise ValueError("upsample_rates and upsample_kernel_sizes must have same length")
-        if not all(isinstance(x, int) and x > 0 for x in self.upsample_rates):
-            raise ValueError("all upsample_rates must be positive integers")
-    def get_audio_config(self) -> XTTSAudioConfig:
-        """Get the audio configuration"""
-        return self.audio_config
-    @property
-    def num_hidden_layers(self) -> int:
-        """Get number of hidden layers (alias for gpt_layers)"""
-        return self.gpt_layers
-    @property
-    def hidden_size(self) -> int:
-        """Get hidden size (alias for gpt_n_model_channels)"""
-        return self.gpt_n_model_channels
-    @property
-    def num_attention_heads(self) -> int:
-        """Get number of attention heads (alias for gpt_n_heads)"""
-        return self.gpt_n_heads

 from dataclasses import asdict, dataclass
+from typing import Dict, Optional, List
 from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
 @dataclass
+class GPTAudioConfig:
+    """Configuration for GPT audio processing parameters"""
+    # Plain dataclass of three integer settings. XTTSGPTConfig builds one from
+    # its ``audio_config`` dict and serializes it back with ``asdict``.
+    # Number of mel channels (default 80).
+    mel_channels: int = 80
+    # Sample rate; presumably of the input audio -- TODO confirm against caller.
+    sample_rate: int = 22050
+    # Output sample rate (default 24000).
+    output_sample_rate: int = 24000
+class XTTSGPTConfig(PretrainedConfig):
+    """Configuration class for the GPT component of XTTS.
+
+    Every named keyword argument is stored on the instance under the same
+    name; the inline comments on the signature give the corresponding
+    argument names of the original XTTS configuration.
+
+    Args:
+        audio_config: ``None`` (use the ``GPTAudioConfig`` defaults), a dict of
+            ``GPTAudioConfig`` fields (the form produced by ``to_dict``), or an
+            already-built ``GPTAudioConfig`` instance.
+        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
+    """
+    model_type = "xtts_gpt"
+    def __init__(
+            self,
+            # Model architecture
+            hidden_size: int = 1024,  # gpt_n_model_channels in original
+            num_hidden_layers: int = 30,  # gpt_layers in original
+            num_attention_heads: int = 16,  # gpt_n_heads in original
+            # Tokenizer settings
+            vocab_size: int = 6681,  # gpt_number_text_tokens in original
+            number_text_tokens: int = 6681,  # Explicit text token vocabulary size
+            start_text_token: Optional[int] = None,
+            stop_text_token: Optional[int] = None,
+            # Audio token settings
+            num_audio_tokens: int = 1026,  # gpt_num_audio_tokens in original
+            start_audio_token: int = 1024,  # gpt_start_audio_token in original
+            stop_audio_token: int = 1025,  # gpt_stop_audio_token in original
+            # Sequence length settings
+            max_audio_tokens: int = 605,  # gpt_max_audio_tokens in original
+            max_text_tokens: int = 402,  # gpt_max_text_tokens in original
+            max_prompt_tokens: int = 70,  # gpt_max_prompt_tokens in original
+            gpt_max_audio_tokens: int = 605,  # Used for generation
+            # Model behavior settings
+            use_masking_gt_prompt_approach: bool = True,  # gpt_use_masking_gt_prompt_approach in original
+            use_perceiver_resampler: bool = True,  # gpt_use_perceiver_resampler in original
+            kv_cache: bool = True,
+            enable_redaction: bool = False,
+            # GPT batch settings
+            gpt_batch_size: int = 1,
+            # Audio processing
+            audio_config: Optional[Dict] = None,
+            # Architecture specifics
+            layer_norm_epsilon: float = 1e-5,
+            initializer_range: float = 0.02,
+            add_cross_attention: bool = False,
+            scale_attn_by_inverse_layer_idx: bool = False,
+            reorder_and_upcast_attn: bool = False,
+            # Size settings for the decoder
+            decoder_input_dim: int = 1024,
+            **kwargs
+    ):
+        super().__init__(**kwargs)
+        # Accept an already-built GPTAudioConfig as well as the dict form read
+        # back from config.json; unpacking an instance with ** would raise.
+        if isinstance(audio_config, GPTAudioConfig):
+            self.audio_config = audio_config
+        else:
+            self.audio_config = GPTAudioConfig(
+                **(audio_config if audio_config is not None else {})
+            )
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.vocab_size = vocab_size
+        self.number_text_tokens = number_text_tokens
+        self.start_text_token = start_text_token
+        self.stop_text_token = stop_text_token
+        self.num_audio_tokens = num_audio_tokens
+        self.start_audio_token = start_audio_token
+        self.stop_audio_token = stop_audio_token
+        self.max_audio_tokens = max_audio_tokens
+        self.max_text_tokens = max_text_tokens
+        self.max_prompt_tokens = max_prompt_tokens
+        self.gpt_max_audio_tokens = gpt_max_audio_tokens
+        self.use_masking_gt_prompt_approach = use_masking_gt_prompt_approach
+        self.use_perceiver_resampler = use_perceiver_resampler
+        self.kv_cache = kv_cache
+        self.enable_redaction = enable_redaction
+        self.gpt_batch_size = gpt_batch_size
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.add_cross_attention = add_cross_attention
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+        self.decoder_input_dim = decoder_input_dim
+    def to_dict(self) -> Dict:
+        """Convert the config to a dictionary.
+
+        Returns:
+            The ``PretrainedConfig.to_dict()`` output with ``audio_config``
+            replaced by a plain dict of the ``GPTAudioConfig`` fields.
+        """
+        output = super().to_dict()
+        output["audio_config"] = asdict(self.audio_config)
+        return output
+    @classmethod
+    def from_dict(cls, config_dict: Dict, *args, **kwargs) -> "XTTSGPTConfig":
+        """Create a config from a dictionary.
+
+        Delegates to ``PretrainedConfig.from_dict``, which builds the instance
+        with ``cls(**config_dict)`` and additionally applies keyword overrides
+        and honors ``return_unused_kwargs``. ``from_pretrained`` forwards those
+        keyword arguments here, so dropping them breaks callers that expect a
+        ``(config, unused_kwargs)`` tuple.
+
+        Args:
+            config_dict: Mapping of constructor arguments.
+            *args: Accepted for backward compatibility and ignored.
+            **kwargs: Forwarded to ``PretrainedConfig.from_dict``.
+
+        Returns:
+            The config, or ``(config, unused_kwargs)`` when
+            ``return_unused_kwargs=True`` is passed.
+        """
+        return super().from_dict(config_dict, **kwargs)
 @dataclass
 class XTTSAudioConfig:
 class XTTSConfig(PretrainedConfig):
+    """Configuration class for XTTS model components except GPT.
+
+    The GPT settings are held as a nested ``XTTSGPTConfig`` on ``self.gpt`` and
+    are written by ``to_dict`` under the ``gpt_config`` key, which is also the
+    constructor argument that reads them back.
+
+    Args:
+        audio_config: ``None`` (use the ``XTTSAudioConfig`` defaults), a dict of
+            ``XTTSAudioConfig`` fields, or an ``XTTSAudioConfig`` instance.
+        gpt_config: ``None`` (use the ``XTTSGPTConfig`` defaults), a dict of
+            ``XTTSGPTConfig`` arguments, or an ``XTTSGPTConfig`` instance.
+        languages: Language codes; ``None`` selects the built-in 17-entry list.
+        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
+
+    The remaining keyword arguments are stored on the instance under the same
+    name.
+    """
     model_type = "xtts"
     def __init__(
             self,
+            # Audio settings
+            audio_config: Optional[Dict] = None,
             input_sample_rate: int = 22050,
             output_sample_rate: int = 24000,
             output_hop_length: int = 256,
+            # Model architecture
             decoder_input_dim: int = 1024,
             d_vector_dim: int = 512,
             cond_d_vector_in_each_upsampling_layer: bool = True,
+            # Training settings
+            gpt_code_stride_len: int = 1024,
+            duration_const: int = 102400,
+            # Tokenizer settings
+            tokenizer_file: str = "",
             num_chars: int = 255,
+            # Language support
             languages: Optional[List[str]] = None,
+            # GPT configuration
+            gpt_config: Optional[Dict] = None,
+            **kwargs
+    ):
+        super().__init__(**kwargs)
+        # Initialize audio config. An existing XTTSAudioConfig is used as-is;
+        # unpacking an instance with ** would raise.
+        if isinstance(audio_config, XTTSAudioConfig):
+            self.audio_config = audio_config
+        else:
+            self.audio_config = XTTSAudioConfig(
+                **(audio_config if audio_config is not None else {})
+            )
         self.input_sample_rate = input_sample_rate
         self.output_sample_rate = output_sample_rate
         self.output_hop_length = output_hop_length
         self.decoder_input_dim = decoder_input_dim
         self.d_vector_dim = d_vector_dim
         self.cond_d_vector_in_each_upsampling_layer = cond_d_vector_in_each_upsampling_layer
+        self.gpt_code_stride_len = gpt_code_stride_len
+        self.duration_const = duration_const
+        self.tokenizer_file = tokenizer_file
         self.num_chars = num_chars
+        # Initialize GPT config. An existing XTTSGPTConfig is used as-is; the
+        # dict form is what to_dict() writes under "gpt_config".
+        if isinstance(gpt_config, XTTSGPTConfig):
+            self.gpt = gpt_config
+        else:
+            self.gpt = XTTSGPTConfig(**(gpt_config if gpt_config is not None else {}))
+        if languages is None:
+            self.languages = [
+                "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru",
+                "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"
+            ]
+        else:
+            self.languages = languages
     def to_dict(self) -> Dict:
+        """Convert the config to a dictionary.
+
+        Returns:
+            The ``PretrainedConfig.to_dict()`` output with ``audio_config`` as a
+            plain dict of the audio dataclass fields and ``gpt_config`` as the
+            nested GPT config's own ``to_dict()`` output.
+        """
         output = super().to_dict()
+        output["audio_config"] = asdict(self.audio_config)
+        output["gpt_config"] = self.gpt.to_dict()
         return output
     @classmethod
+    def from_dict(cls, config_dict: Dict, *args, **kwargs) -> "XTTSConfig":
+        """Create a config from a dictionary.
+
+        Delegates to ``PretrainedConfig.from_dict``, which builds the instance
+        with ``cls(**config_dict)`` (so a ``gpt_config`` entry reaches
+        ``__init__`` as the keyword of the same name) and additionally applies
+        keyword overrides and honors ``return_unused_kwargs``.
+        ``from_pretrained`` forwards those keyword arguments here, so dropping
+        them breaks callers that expect a ``(config, unused_kwargs)`` tuple.
+
+        Args:
+            config_dict: Mapping of constructor arguments.
+            *args: Accepted for backward compatibility and ignored.
+            **kwargs: Forwarded to ``PretrainedConfig.from_dict``.
+
+        Returns:
+            The config, or ``(config, unused_kwargs)`` when
+            ``return_unused_kwargs=True`` is passed.
+        """
+        return super().from_dict(config_dict, **kwargs)