mlinmg committed on
Commit
2ebc88c
·
verified ·
1 Parent(s): d195c37

Upload 2 files

Browse files
Files changed (2) hide show
  1. config.json +139 -48
  2. xtts2_config.py +152 -357
config.json CHANGED
@@ -1,61 +1,152 @@
1
  {
2
- "_name_or_path": "AstraMindAI/xtts2",
3
  "architectures": [
4
- "Xtts"
5
  ],
6
  "torch_dtype": "float32",
7
  "auto_map": {
8
- "AutoConfig": "AstraMindAI/xtts2--xtts2_config.XTTSConfig",
9
- "AutoModelForCausalLM": "AstraMindAI/xtts2--xtts2_modeling.Xtts"
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  },
11
  "cond_d_vector_in_each_upsampling_layer": true,
12
  "d_vector_dim": 512,
13
  "decoder_input_dim": 1024,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  "input_sample_rate": 22050,
15
- "model_type": "xtts_hifigan",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  "output_hop_length": 256,
17
  "output_sample_rate": 24000,
18
- "resblock_dilation_sizes": [
19
- [
20
- 1,
21
- 3,
22
- 5
23
- ],
24
- [
25
- 1,
26
- 3,
27
- 5
28
- ],
29
- [
30
- 1,
31
- 3,
32
- 5
33
- ]
34
- ],
35
- "resblock_kernel_sizes": [
36
- 3,
37
- 7,
38
- 11
39
- ],
40
- "speaker_encoder_config": {
41
- "model_config": null,
42
- "model_name": "speaker_encoder",
43
- "preprocess_config": null,
44
- "speaker_embedding_dim": 512,
45
- "use_torch_spec": true
46
- },
47
- "transformers_version": "4.45.1",
48
- "upsample_initial_channel": 512,
49
- "upsample_kernel_sizes": [
50
- 16,
51
- 16,
52
- 4,
53
- 4
54
- ],
55
- "upsample_rates": [
56
- 8,
57
- 8,
58
- 2,
59
- 2
60
- ]
61
  }
 
1
  {
2
+ "_name_or_path": "AstraMindAI/xtts2-gpt",
3
  "architectures": [
4
+ "XttsGPT"
5
  ],
6
  "torch_dtype": "float32",
7
  "auto_map": {
8
+ "AutoConfig": "AstraMindAI/xtts2-gpt--gpt_config.XTTSGPTConfig",
9
+ "AutoModelForCausalLM": "AstraMindAI/xtts2-gpt--xtts2_gpt_modeling.XttsGPT",
10
+ "AutoTokenizer": "AstraMindAI/xtts2-gpt--tokenizer.XTTSTokenizerFast"
11
+ },
12
+ "audio_config": {
13
+ "fmax": 8000,
14
+ "fmin": 0,
15
+ "hop_length": 256,
16
+ "mel_channels": 80,
17
+ "mel_norms_file": null,
18
+ "n_fft": 1024,
19
+ "output_sample_rate": 24000,
20
+ "power": 1.0,
21
+ "sample_rate": 22050,
22
+ "win_length": 1024
23
  },
24
  "cond_d_vector_in_each_upsampling_layer": true,
25
  "d_vector_dim": 512,
26
  "decoder_input_dim": 1024,
27
+ "duration_const": 102400,
28
+ "gpt": {
29
+ "model_type": "xtts_gpt"
30
+ },
31
+ "gpt_code_stride_len": 1024,
32
+ "gpt_config": {
33
+ "_attn_implementation_autoset": false,
34
+ "_name_or_path": "",
35
+ "add_cross_attention": false,
36
+ "architectures": null,
37
+ "audio_config": {
38
+ "mel_channels": 80,
39
+ "output_sample_rate": 24000,
40
+ "sample_rate": 22050
41
+ },
42
+ "bad_words_ids": null,
43
+ "begin_suppress_tokens": null,
44
+ "bos_token_id": null,
45
+ "chunk_size_feed_forward": 0,
46
+ "cross_attention_hidden_size": null,
47
+ "decoder_input_dim": 1024,
48
+ "decoder_start_token_id": null,
49
+ "diversity_penalty": 0.0,
50
+ "do_sample": false,
51
+ "early_stopping": false,
52
+ "enable_redaction": false,
53
+ "encoder_no_repeat_ngram_size": 0,
54
+ "eos_token_id": null,
55
+ "exponential_decay_length_penalty": null,
56
+ "finetuning_task": null,
57
+ "forced_bos_token_id": null,
58
+ "forced_eos_token_id": null,
59
+ "gpt_batch_size": 1,
60
+ "gpt_max_audio_tokens": 605,
61
+ "hidden_size": 1024,
62
+ "id2label": {
63
+ "0": "LABEL_0",
64
+ "1": "LABEL_1"
65
+ },
66
+ "initializer_range": 0.02,
67
+ "is_decoder": false,
68
+ "is_encoder_decoder": false,
69
+ "kv_cache": true,
70
+ "label2id": {
71
+ "LABEL_0": 0,
72
+ "LABEL_1": 1
73
+ },
74
+ "layer_norm_epsilon": 1e-05,
75
+ "length_penalty": 1.0,
76
+ "max_audio_tokens": 605,
77
+ "max_length": 20,
78
+ "max_prompt_tokens": 70,
79
+ "max_text_tokens": 402,
80
+ "min_length": 0,
81
+ "model_type": "xtts_gpt",
82
+ "no_repeat_ngram_size": 0,
83
+ "num_attention_heads": 16,
84
+ "num_audio_tokens": 1026,
85
+ "num_beam_groups": 1,
86
+ "num_beams": 1,
87
+ "num_hidden_layers": 30,
88
+ "num_return_sequences": 1,
89
+ "number_text_tokens": 6681,
90
+ "output_attentions": false,
91
+ "output_hidden_states": false,
92
+ "output_scores": false,
93
+ "pad_token_id": null,
94
+ "prefix": null,
95
+ "problem_type": null,
96
+ "pruned_heads": {},
97
+ "remove_invalid_values": false,
98
+ "reorder_and_upcast_attn": false,
99
+ "repetition_penalty": 1.0,
100
+ "return_dict": true,
101
+ "return_dict_in_generate": false,
102
+ "scale_attn_by_inverse_layer_idx": false,
103
+ "sep_token_id": null,
104
+ "start_audio_token": 1024,
105
+ "start_text_token": null,
106
+ "stop_audio_token": 1025,
107
+ "stop_text_token": null,
108
+ "suppress_tokens": null,
109
+ "task_specific_params": null,
110
+ "temperature": 1.0,
111
+ "tf_legacy_loss": false,
112
+ "tie_encoder_decoder": false,
113
+ "tie_word_embeddings": true,
114
+ "tokenizer_class": null,
115
+ "top_k": 50,
116
+ "top_p": 1.0,
117
+ "torch_dtype": null,
118
+ "torchscript": false,
119
+ "transformers_version": "4.46.0",
120
+ "typical_p": 1.0,
121
+ "use_bfloat16": false,
122
+ "use_masking_gt_prompt_approach": true,
123
+ "use_perceiver_resampler": true,
124
+ "vocab_size": 6681
125
+ },
126
  "input_sample_rate": 22050,
127
+ "languages": [
128
+ "en",
129
+ "es",
130
+ "fr",
131
+ "de",
132
+ "it",
133
+ "pt",
134
+ "pl",
135
+ "tr",
136
+ "ru",
137
+ "nl",
138
+ "cs",
139
+ "ar",
140
+ "zh-cn",
141
+ "hu",
142
+ "ko",
143
+ "ja",
144
+ "hi"
145
+ ],
146
+ "model_type": "xtts",
147
+ "num_chars": 255,
148
  "output_hop_length": 256,
149
  "output_sample_rate": 24000,
150
+ "tokenizer_file": "",
151
+ "transformers_version": "4.46.0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  }
xtts2_config.py CHANGED
@@ -1,17 +1,120 @@
1
  from dataclasses import asdict, dataclass
2
- from typing import Dict, List, Optional
3
  from transformers.configuration_utils import PretrainedConfig
 
 
 
4
 
5
 
6
  @dataclass
7
- class SpeakerEncoderConfig:
8
- """Configuration for the speaker encoder component"""
9
- model_name: str = "speaker_encoder"
10
- preprocess_config: Optional[Dict] = None
11
- model_config: Optional[Dict] = None
12
- speaker_embedding_dim: int = 512
13
- use_torch_spec: bool = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
 
 
 
 
15
 
16
  @dataclass
17
  class XTTSAudioConfig:
@@ -29,390 +132,82 @@ class XTTSAudioConfig:
29
 
30
 
31
  class XTTSConfig(PretrainedConfig):
32
- """Combined configuration class for XTTS including both HifiGAN and GPT components"""
33
  model_type = "xtts"
34
 
35
  def __init__(
36
  self,
37
- # HifiGAN Audio parameters
 
38
  input_sample_rate: int = 22050,
39
  output_sample_rate: int = 24000,
40
  output_hop_length: int = 256,
41
 
42
- # HifiGAN Model architecture
43
  decoder_input_dim: int = 1024,
44
  d_vector_dim: int = 512,
45
  cond_d_vector_in_each_upsampling_layer: bool = True,
46
 
47
- # HifiGAN Upsampling parameters
48
- upsample_rates: List[int] = None,
49
- upsample_kernel_sizes: List[int] = None,
50
- upsample_initial_channel: int = 512,
51
-
52
- # HifiGAN Resblock parameters
53
- resblock_kernel_sizes: List[int] = None,
54
- resblock_dilation_sizes: List[List[int]] = None,
55
-
56
- # HifiGAN Speaker encoder
57
- speaker_encoder_config: Optional[Dict] = None,
58
 
59
- # GPT Model architecture
60
- vocab_size: int = 256,
61
  num_chars: int = 255,
62
 
63
- # GPT parameters
64
- gpt_batch_size: int = 1,
65
- gpt_max_audio_tokens: int = 605,
66
- gpt_max_text_tokens: int = 402,
67
- gpt_max_prompt_tokens: int = 70,
68
- gpt_layers: int = 30,
69
- gpt_n_model_channels: int = 1024,
70
- gpt_n_heads: int = 16,
71
- gpt_number_text_tokens: int = 6681,
72
- gpt_start_text_token: Optional[int] = None,
73
- gpt_stop_text_token: Optional[int] = None,
74
- gpt_num_audio_tokens: int = 1026,
75
- gpt_start_audio_token: int = 1024,
76
- gpt_stop_audio_token: int = 1025,
77
- gpt_code_stride_len: int = 1024,
78
- gpt_use_masking_gt_prompt_approach: bool = True,
79
- gpt_use_perceiver_resampler: bool = True,
80
- gpt_checkpointing: bool = False,
81
- gpt_train_solo_embeddings: bool = False,
82
-
83
- # GPT Training parameters
84
- enable_redaction: bool = False,
85
- kv_cache: bool = True,
86
- perceiver_cond_length_compression: int = 256,
87
- label_smoothing: float = 0.0,
88
-
89
- # GPT Generation parameters
90
- temperature: float = 0.75,
91
- length_penalty: float = 1.0,
92
- repetition_penalty: float = 5.0,
93
- top_k: int = 50,
94
- top_p: float = 0.85,
95
- gpt_cond_len: int = 30,
96
- gpt_cond_chunk_len: int = 4,
97
- max_ref_len: int = 30,
98
- sound_norm_refs: bool = False,
99
-
100
- # GPT Audio processing
101
- audio_config: Optional[XTTSAudioConfig] = None,
102
-
103
- # GPT Constants and limits
104
- duration_const: int = 102400,
105
- char_limits: Optional[Dict[str, int]] = None,
106
  languages: Optional[List[str]] = None,
107
 
108
- # Base config parameters
109
- pad_token_id: Optional[int] = None,
110
- bos_token_id: Optional[int] = None,
111
- eos_token_id: Optional[int] = None,
112
- **kwargs,
113
- ):
114
- super().__init__(
115
- pad_token_id=pad_token_id,
116
- bos_token_id=bos_token_id,
117
- eos_token_id=eos_token_id,
118
- **kwargs
119
- )
120
 
121
- # Set default lists for HifiGAN
122
- if upsample_rates is None:
123
- upsample_rates = [8, 8, 2, 2]
124
- if upsample_kernel_sizes is None:
125
- upsample_kernel_sizes = [16, 16, 4, 4]
126
- if resblock_kernel_sizes is None:
127
- resblock_kernel_sizes = [3, 7, 11]
128
- if resblock_dilation_sizes is None:
129
- resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
130
-
131
- # Set default dicts for GPT
132
- if char_limits is None:
133
- char_limits = {
134
- "en": 250, "de": 253, "fr": 273, "es": 239,
135
- "it": 213, "pt": 203, "pl": 224, "zh": 82,
136
- "ar": 166, "cs": 186, "ru": 182, "nl": 251,
137
- "tr": 226, "ja": 71, "hu": 224, "ko": 95,
138
- }
139
 
140
- if languages is None:
141
- languages = [
142
- "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
143
- "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"
144
- ]
145
 
146
- # Initialize HifiGAN parameters
147
- # Audio parameters
148
  self.input_sample_rate = input_sample_rate
149
  self.output_sample_rate = output_sample_rate
150
  self.output_hop_length = output_hop_length
151
 
152
- # Model architecture
153
  self.decoder_input_dim = decoder_input_dim
154
  self.d_vector_dim = d_vector_dim
155
  self.cond_d_vector_in_each_upsampling_layer = cond_d_vector_in_each_upsampling_layer
156
 
157
- # Upsampling parameters
158
- self.upsample_rates = upsample_rates
159
- self.upsample_kernel_sizes = upsample_kernel_sizes
160
- self.upsample_initial_channel = upsample_initial_channel
161
-
162
- # Resblock parameters
163
- self.resblock_kernel_sizes = resblock_kernel_sizes
164
- self.resblock_dilation_sizes = resblock_dilation_sizes
165
-
166
- # Speaker encoder - store as dictionary
167
- if speaker_encoder_config is None:
168
- self.speaker_encoder_config = asdict(SpeakerEncoderConfig())
169
- elif isinstance(speaker_encoder_config, dict):
170
- default_config = asdict(SpeakerEncoderConfig())
171
- default_config.update(speaker_encoder_config)
172
- self.speaker_encoder_config = default_config
173
- elif isinstance(speaker_encoder_config, SpeakerEncoderConfig):
174
- self.speaker_encoder_config = asdict(speaker_encoder_config)
175
- else:
176
- raise ValueError("speaker_encoder_config must be either a dictionary or SpeakerEncoderConfig instance")
177
 
178
- # Initialize GPT parameters
179
- self.vocab_size = vocab_size
180
  self.num_chars = num_chars
181
 
182
- # GPT model parameters
183
- self.gpt_batch_size = gpt_batch_size
184
- self.gpt_max_audio_tokens = gpt_max_audio_tokens
185
- self.gpt_max_text_tokens = gpt_max_text_tokens
186
- self.gpt_max_prompt_tokens = gpt_max_prompt_tokens
187
- self.gpt_layers = gpt_layers
188
- self.gpt_n_model_channels = gpt_n_model_channels
189
- self.gpt_n_heads = gpt_n_heads
190
- self.gpt_number_text_tokens = gpt_number_text_tokens
191
- self.gpt_start_text_token = gpt_start_text_token
192
- self.gpt_stop_text_token = gpt_stop_text_token
193
- self.gpt_num_audio_tokens = gpt_num_audio_tokens
194
- self.gpt_start_audio_token = gpt_start_audio_token
195
- self.gpt_stop_audio_token = gpt_stop_audio_token
196
- self.gpt_code_stride_len = gpt_code_stride_len
197
- self.gpt_use_masking_gt_prompt_approach = gpt_use_masking_gt_prompt_approach
198
- self.gpt_use_perceiver_resampler = gpt_use_perceiver_resampler
199
- self.gpt_checkpointing = gpt_checkpointing
200
- self.gpt_train_solo_embeddings = gpt_train_solo_embeddings
201
 
202
- # Training parameters
203
- self.enable_redaction = enable_redaction
204
- self.kv_cache = kv_cache
205
- self.perceiver_cond_length_compression = perceiver_cond_length_compression
206
- self.label_smoothing = label_smoothing
207
-
208
- # Generation parameters
209
- self.temperature = temperature
210
- self.length_penalty = length_penalty
211
- self.repetition_penalty = repetition_penalty
212
- self.top_k = top_k
213
- self.top_p = top_p
214
- self.gpt_cond_len = gpt_cond_len
215
- self.gpt_cond_chunk_len = gpt_cond_chunk_len
216
- self.max_ref_len = max_ref_len
217
- self.sound_norm_refs = sound_norm_refs
218
-
219
- # Audio processing
220
- if audio_config is None:
221
- audio_config = XTTSAudioConfig()
222
- elif isinstance(audio_config, dict):
223
- audio_config = XTTSAudioConfig(**audio_config)
224
- self.audio_config = audio_config
225
-
226
- # Constants and limits
227
- self.duration_const = duration_const
228
- self.char_limits = char_limits
229
- self.languages = languages
230
 
231
  def to_dict(self) -> Dict:
232
- """Convert the config to a dictionary format."""
233
- # Get parent class dict
234
  output = super().to_dict()
235
-
236
- # Add all attributes
237
- output.update({
238
- # HifiGAN parameters
239
- "input_sample_rate": self.input_sample_rate,
240
- "output_sample_rate": self.output_sample_rate,
241
- "output_hop_length": self.output_hop_length,
242
- "decoder_input_dim": self.decoder_input_dim,
243
- "d_vector_dim": self.d_vector_dim,
244
- "cond_d_vector_in_each_upsampling_layer": self.cond_d_vector_in_each_upsampling_layer,
245
- "upsample_rates": self.upsample_rates,
246
- "upsample_kernel_sizes": self.upsample_kernel_sizes,
247
- "upsample_initial_channel": self.upsample_initial_channel,
248
- "resblock_kernel_sizes": self.resblock_kernel_sizes,
249
- "resblock_dilation_sizes": self.resblock_dilation_sizes,
250
- "speaker_encoder_config": self.speaker_encoder_config,
251
-
252
- # GPT parameters
253
- "vocab_size": self.vocab_size,
254
- "num_chars": self.num_chars,
255
- "gpt_batch_size": self.gpt_batch_size,
256
- "gpt_max_audio_tokens": self.gpt_max_audio_tokens,
257
- "gpt_max_text_tokens": self.gpt_max_text_tokens,
258
- "gpt_max_prompt_tokens": self.gpt_max_prompt_tokens,
259
- "gpt_layers": self.gpt_layers,
260
- "gpt_n_model_channels": self.gpt_n_model_channels,
261
- "gpt_n_heads": self.gpt_n_heads,
262
- "gpt_number_text_tokens": self.gpt_number_text_tokens,
263
- "gpt_start_text_token": self.gpt_start_text_token,
264
- "gpt_stop_text_token": self.gpt_stop_text_token,
265
- "gpt_num_audio_tokens": self.gpt_num_audio_tokens,
266
- "gpt_start_audio_token": self.gpt_start_audio_token,
267
- "gpt_stop_audio_token": self.gpt_stop_audio_token,
268
- "gpt_code_stride_len": self.gpt_code_stride_len,
269
- "gpt_use_masking_gt_prompt_approach": self.gpt_use_masking_gt_prompt_approach,
270
- "gpt_use_perceiver_resampler": self.gpt_use_perceiver_resampler,
271
- "gpt_checkpointing": self.gpt_checkpointing,
272
- "gpt_train_solo_embeddings": self.gpt_train_solo_embeddings,
273
- "enable_redaction": self.enable_redaction,
274
- "kv_cache": self.kv_cache,
275
- "perceiver_cond_length_compression": self.perceiver_cond_length_compression,
276
- "label_smoothing": self.label_smoothing,
277
- "temperature": self.temperature,
278
- "length_penalty": self.length_penalty,
279
- "repetition_penalty": self.repetition_penalty,
280
- "top_k": self.top_k,
281
- "top_p": self.top_p,
282
- "gpt_cond_len": self.gpt_cond_len,
283
- "gpt_cond_chunk_len": self.gpt_cond_chunk_len,
284
- "max_ref_len": self.max_ref_len,
285
- "sound_norm_refs": self.sound_norm_refs,
286
- "audio_config": asdict(self.audio_config),
287
- "duration_const": self.duration_const,
288
- "char_limits": self.char_limits,
289
- "languages": self.languages,
290
- })
291
-
292
  return output
293
 
294
  @classmethod
295
- def from_dict(cls, config_dict: Dict) -> "XTTSConfig":
296
- """Create a config instance from a dictionary."""
297
- config_copy = config_dict.copy()
298
-
299
- # Handle special nested configs
300
- if "audio_config" in config_copy:
301
- config_copy["audio_config"] = XTTSAudioConfig(**config_copy["audio_config"])
302
-
303
- return cls(**config_copy)
304
-
305
- def get_speaker_encoder_config(self) -> SpeakerEncoderConfig:
306
- """Get speaker encoder config as a SpeakerEncoderConfig instance"""
307
- return SpeakerEncoderConfig(**self.speaker_encoder_config)
308
-
309
- def update_with_tokenizer(self, tokenizer=None):
310
- """Update configuration values based on tokenizer"""
311
- if tokenizer is not None:
312
- self.gpt_number_text_tokens = tokenizer.get_vocab_size()
313
- self.gpt_start_text_token = tokenizer.bos_token_id
314
- self.gpt_stop_text_token = tokenizer.eos_token_id
315
- self.vocab_size = tokenizer.get_vocab_size()
316
- self.pad_token_id = tokenizer.pad_token_id
317
- self.bos_token_id = tokenizer.bos_token_id
318
- self.eos_token_id = tokenizer.eos_token_id
319
-
320
- def get_hifigan_config(self) -> Dict:
321
- """Extract HiFiGAN-specific configuration"""
322
- return {
323
- "input_sample_rate": self.input_sample_rate,
324
- "output_sample_rate": self.output_sample_rate,
325
- "output_hop_length": self.output_hop_length,
326
- "decoder_input_dim": self.decoder_input_dim,
327
- "d_vector_dim": self.d_vector_dim,
328
- "cond_d_vector_in_each_upsampling_layer": self.cond_d_vector_in_each_upsampling_layer,
329
- "upsample_rates": self.upsample_rates,
330
- "upsample_kernel_sizes": self.upsample_kernel_sizes,
331
- "upsample_initial_channel": self.upsample_initial_channel,
332
- "resblock_kernel_sizes": self.resblock_kernel_sizes,
333
- "resblock_dilation_sizes": self.resblock_dilation_sizes,
334
- "speaker_encoder_config": self.speaker_encoder_config
335
- }
336
-
337
- def get_gpt_config(self) -> Dict:
338
- """Extract GPT-specific configuration"""
339
- return {
340
- "vocab_size": self.vocab_size,
341
- "num_chars": self.num_chars,
342
- "gpt_batch_size": self.gpt_batch_size,
343
- "gpt_max_audio_tokens": self.gpt_max_audio_tokens,
344
- "gpt_max_text_tokens": self.gpt_max_text_tokens,
345
- "gpt_max_prompt_tokens": self.gpt_max_prompt_tokens,
346
- "gpt_layers": self.gpt_layers,
347
- "gpt_n_model_channels": self.gpt_n_model_channels,
348
- "gpt_n_heads": self.gpt_n_heads,
349
- "gpt_number_text_tokens": self.gpt_number_text_tokens,
350
- "gpt_start_text_token": self.gpt_start_text_token,
351
- "gpt_stop_text_token": self.gpt_stop_text_token,
352
- "gpt_num_audio_tokens": self.gpt_num_audio_tokens,
353
- "gpt_start_audio_token": self.gpt_start_audio_token,
354
- "gpt_stop_audio_token": self.gpt_stop_audio_token,
355
- "gpt_code_stride_len": self.gpt_code_stride_len,
356
- "gpt_use_masking_gt_prompt_approach": self.gpt_use_masking_gt_prompt_approach,
357
- "gpt_use_perceiver_resampler": self.gpt_use_perceiver_resampler,
358
- "gpt_checkpointing": self.gpt_checkpointing,
359
- "gpt_train_solo_embeddings": self.gpt_train_solo_embeddings,
360
- "enable_redaction": self.enable_redaction,
361
- "kv_cache": self.kv_cache,
362
- "perceiver_cond_length_compression": self.perceiver_cond_length_compression,
363
- "label_smoothing": self.label_smoothing,
364
- "audio_config": self.audio_config,
365
- "pad_token_id": self.pad_token_id,
366
- "bos_token_id": self.bos_token_id,
367
- "eos_token_id": self.eos_token_id
368
- }
369
-
370
- def get_generation_config(self) -> Dict:
371
- """Extract generation-specific configuration"""
372
- return {
373
- "temperature": self.temperature,
374
- "length_penalty": self.length_penalty,
375
- "repetition_penalty": self.repetition_penalty,
376
- "top_k": self.top_k,
377
- "top_p": self.top_p,
378
- "gpt_cond_len": self.gpt_cond_len,
379
- "gpt_cond_chunk_len": self.gpt_cond_chunk_len,
380
- "max_ref_len": self.max_ref_len,
381
- "sound_norm_refs": self.sound_norm_refs
382
- }
383
-
384
- def validate(self):
385
- """Validate configuration values"""
386
- if self.gpt_max_text_tokens <= 0:
387
- raise ValueError("gpt_max_text_tokens must be positive")
388
- if self.gpt_max_audio_tokens <= 0:
389
- raise ValueError("gpt_max_audio_tokens must be positive")
390
- if self.gpt_layers <= 0:
391
- raise ValueError("gpt_layers must be positive")
392
- if self.gpt_n_heads <= 0:
393
- raise ValueError("gpt_n_heads must be positive")
394
- if self.gpt_n_model_channels <= 0:
395
- raise ValueError("gpt_n_model_channels must be positive")
396
- if len(self.upsample_rates) != len(self.upsample_kernel_sizes):
397
- raise ValueError("upsample_rates and upsample_kernel_sizes must have same length")
398
- if not all(isinstance(x, int) and x > 0 for x in self.upsample_rates):
399
- raise ValueError("all upsample_rates must be positive integers")
400
-
401
- def get_audio_config(self) -> XTTSAudioConfig:
402
- """Get the audio configuration"""
403
- return self.audio_config
404
-
405
- @property
406
- def num_hidden_layers(self) -> int:
407
- """Get number of hidden layers (alias for gpt_layers)"""
408
- return self.gpt_layers
409
-
410
- @property
411
- def hidden_size(self) -> int:
412
- """Get hidden size (alias for gpt_n_model_channels)"""
413
- return self.gpt_n_model_channels
414
-
415
- @property
416
- def num_attention_heads(self) -> int:
417
- """Get number of attention heads (alias for gpt_n_heads)"""
418
- return self.gpt_n_heads
 
1
  from dataclasses import asdict, dataclass
2
+ from typing import Dict, Optional, List
3
  from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.utils import logging
5
+
6
+ logger = logging.get_logger(__name__)
7
 
8
 
9
  @dataclass
10
+ class GPTAudioConfig:
11
+ """Configuration for GPT audio processing parameters"""
12
+ mel_channels: int = 80
13
+ sample_rate: int = 22050
14
+ output_sample_rate: int = 24000
15
+
16
+
17
+ class XTTSGPTConfig(PretrainedConfig):
18
+ """Configuration class for the GPT component of XTTS."""
19
+ model_type = "xtts_gpt"
20
+
21
+ def __init__(
22
+ self,
23
+ # Model architecture
24
+ hidden_size: int = 1024, # gpt_n_model_channels in original
25
+ num_hidden_layers: int = 30, # gpt_layers in original
26
+ num_attention_heads: int = 16, # gpt_n_heads in original
27
+
28
+ # Tokenizer settings
29
+ vocab_size: int = 6681, # gpt_number_text_tokens in original
30
+ number_text_tokens: int = 6681, # Explicit text token vocabulary size
31
+ start_text_token: Optional[int] = None,
32
+ stop_text_token: Optional[int] = None,
33
+
34
+ # Audio token settings
35
+ num_audio_tokens: int = 1026, # gpt_num_audio_tokens in original
36
+ start_audio_token: int = 1024, # gpt_start_audio_token in original
37
+ stop_audio_token: int = 1025, # gpt_stop_audio_token in original
38
+
39
+ # Sequence length settings
40
+ max_audio_tokens: int = 605, # gpt_max_audio_tokens in original
41
+ max_text_tokens: int = 402, # gpt_max_text_tokens in original
42
+ max_prompt_tokens: int = 70, # gpt_max_prompt_tokens in original
43
+ gpt_max_audio_tokens: int = 605, # Used for generation
44
+
45
+ # Model behavior settings
46
+ use_masking_gt_prompt_approach: bool = True, # gpt_use_masking_gt_prompt_approach in original
47
+ use_perceiver_resampler: bool = True, # gpt_use_perceiver_resampler in original
48
+ kv_cache: bool = True,
49
+ enable_redaction: bool = False,
50
+
51
+ # GPT batch settings
52
+ gpt_batch_size: int = 1,
53
+
54
+ # Audio processing
55
+ audio_config: Optional[Dict] = None,
56
+
57
+ # Architecture specifics
58
+ layer_norm_epsilon: float = 1e-5,
59
+ initializer_range: float = 0.02,
60
+ add_cross_attention: bool = False,
61
+ scale_attn_by_inverse_layer_idx: bool = False,
62
+ reorder_and_upcast_attn: bool = False,
63
+
64
+ # Size settings for the decoder
65
+ decoder_input_dim: int = 1024,
66
+
67
+ **kwargs
68
+ ):
69
+ super().__init__(**kwargs)
70
+
71
+ self.audio_config = GPTAudioConfig(
72
+ **audio_config if audio_config is not None else {}
73
+ )
74
+
75
+ self.hidden_size = hidden_size
76
+ self.num_hidden_layers = num_hidden_layers
77
+ self.num_attention_heads = num_attention_heads
78
+
79
+ self.vocab_size = vocab_size
80
+ self.number_text_tokens = number_text_tokens
81
+ self.start_text_token = start_text_token
82
+ self.stop_text_token = stop_text_token
83
+
84
+ self.num_audio_tokens = num_audio_tokens
85
+ self.start_audio_token = start_audio_token
86
+ self.stop_audio_token = stop_audio_token
87
+
88
+ self.max_audio_tokens = max_audio_tokens
89
+ self.max_text_tokens = max_text_tokens
90
+ self.max_prompt_tokens = max_prompt_tokens
91
+ self.gpt_max_audio_tokens = gpt_max_audio_tokens
92
+
93
+ self.use_masking_gt_prompt_approach = use_masking_gt_prompt_approach
94
+ self.use_perceiver_resampler = use_perceiver_resampler
95
+ self.kv_cache = kv_cache
96
+ self.enable_redaction = enable_redaction
97
+
98
+ self.gpt_batch_size = gpt_batch_size
99
+
100
+ self.layer_norm_epsilon = layer_norm_epsilon
101
+ self.initializer_range = initializer_range
102
+ self.add_cross_attention = add_cross_attention
103
+ self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
104
+ self.reorder_and_upcast_attn = reorder_and_upcast_attn
105
+
106
+ self.decoder_input_dim = decoder_input_dim
107
+
108
+ def to_dict(self) -> Dict:
109
+ """Convert the config to a dictionary."""
110
+ output = super().to_dict()
111
+ output["audio_config"] = asdict(self.audio_config)
112
+ return output
113
 
114
+ @classmethod
115
+ def from_dict(cls, config_dict: Dict, *args, **kwargs) -> "XTTSGPTConfig":
116
+ """Create a config from a dictionary."""
117
+ return cls(**config_dict)
118
 
119
  @dataclass
120
  class XTTSAudioConfig:
 
132
 
133
 
134
  class XTTSConfig(PretrainedConfig):
135
+ """Configuration class for XTTS model components except GPT."""
136
  model_type = "xtts"
137
 
138
  def __init__(
139
  self,
140
+ # Audio settings
141
+ audio_config: Optional[Dict] = None,
142
  input_sample_rate: int = 22050,
143
  output_sample_rate: int = 24000,
144
  output_hop_length: int = 256,
145
 
146
+ # Model architecture
147
  decoder_input_dim: int = 1024,
148
  d_vector_dim: int = 512,
149
  cond_d_vector_in_each_upsampling_layer: bool = True,
150
 
151
+ # Training settings
152
+ gpt_code_stride_len: int = 1024,
153
+ duration_const: int = 102400,
 
 
 
 
 
 
 
 
154
 
155
+ # Tokenizer settings
156
+ tokenizer_file: str = "",
157
  num_chars: int = 255,
158
 
159
+ # Language support
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  languages: Optional[List[str]] = None,
161
 
162
+ # GPT configuration
163
+ gpt_config: Optional[Dict] = None,
 
 
 
 
 
 
 
 
 
 
164
 
165
+ **kwargs
166
+ ):
167
+ super().__init__(**kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
+ # Initialize audio config
170
+ self.audio_config = XTTSAudioConfig(
171
+ **audio_config if audio_config is not None else {}
172
+ )
 
173
 
 
 
174
  self.input_sample_rate = input_sample_rate
175
  self.output_sample_rate = output_sample_rate
176
  self.output_hop_length = output_hop_length
177
 
 
178
  self.decoder_input_dim = decoder_input_dim
179
  self.d_vector_dim = d_vector_dim
180
  self.cond_d_vector_in_each_upsampling_layer = cond_d_vector_in_each_upsampling_layer
181
 
182
+ self.gpt_code_stride_len = gpt_code_stride_len
183
+ self.duration_const = duration_const
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
+ self.tokenizer_file = tokenizer_file
 
186
  self.num_chars = num_chars
187
 
188
+ # Initialize GPT config
189
+ self.gpt = XTTSGPTConfig(**gpt_config if gpt_config is not None else {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
+ if languages is None:
192
+ self.languages = [
193
+ "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru",
194
+ "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"
195
+ ]
196
+ else:
197
+ self.languages = languages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
  def to_dict(self) -> Dict:
200
+ """Convert the config to a dictionary."""
 
201
  output = super().to_dict()
202
+ output["audio_config"] = asdict(self.audio_config)
203
+ output["gpt_config"] = self.gpt.to_dict()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  return output
205
 
206
  @classmethod
207
+ def from_dict(cls, config_dict: Dict, *args, **kwargs) -> "XTTSConfig":
208
+ """Create a config from a dictionary."""
209
+ if "gpt_config" in config_dict:
210
+ gpt_config = config_dict["gpt_config"]
211
+ config_dict = {k: v for k, v in config_dict.items() if k != "gpt_config"}
212
+ return cls(gpt_config=gpt_config, **config_dict)
213
+ return cls(**config_dict)