Update configuration_wavtokenizer.py
Browse files
configuration_wavtokenizer.py
CHANGED
|
@@ -85,20 +85,20 @@ class WavTokenizerConfig(PretrainedConfig):
|
|
| 85 |
|
| 86 |
# Feature dimensions
|
| 87 |
feature_dim: int = 512,
|
| 88 |
-
encoder_dim: int =
|
| 89 |
encoder_rates: list = None,
|
| 90 |
latent_dim: int = None,
|
| 91 |
|
| 92 |
# Quantizer parameters
|
| 93 |
codebook_size: int = 4096,
|
| 94 |
-
codebook_dim: int =
|
| 95 |
num_quantizers: int = 1,
|
| 96 |
|
| 97 |
# Backbone parameters
|
| 98 |
backbone_type: str = "vocos",
|
| 99 |
-
backbone_dim: int =
|
| 100 |
-
backbone_num_blocks: int =
|
| 101 |
-
backbone_intermediate_dim: int =
|
| 102 |
backbone_kernel_size: int = 7,
|
| 103 |
backbone_layer_scale_init_value: float = 1e-6,
|
| 104 |
|
|
@@ -126,7 +126,7 @@ class WavTokenizerConfig(PretrainedConfig):
|
|
| 126 |
# Feature dimensions
|
| 127 |
self.feature_dim = feature_dim
|
| 128 |
self.encoder_dim = encoder_dim
|
| 129 |
-
self.encoder_rates = encoder_rates if encoder_rates is not None else [
|
| 130 |
self.latent_dim = latent_dim if latent_dim is not None else feature_dim
|
| 131 |
|
| 132 |
# Quantizer
|
|
@@ -160,4 +160,4 @@ class WavTokenizerConfig(PretrainedConfig):
|
|
| 160 |
@property
|
| 161 |
def frame_rate(self) -> float:
|
| 162 |
"""Returns the frame rate (tokens per second)."""
|
| 163 |
-
return self.sample_rate / self.hop_length
|
|
|
|
| 85 |
|
| 86 |
# Feature dimensions
|
| 87 |
feature_dim: int = 512,
|
| 88 |
+
encoder_dim: int = 32,
|
| 89 |
encoder_rates: list = None,
|
| 90 |
latent_dim: int = None,
|
| 91 |
|
| 92 |
# Quantizer parameters
|
| 93 |
codebook_size: int = 4096,
|
| 94 |
+
codebook_dim: int = 512,
|
| 95 |
num_quantizers: int = 1,
|
| 96 |
|
| 97 |
# Backbone parameters
|
| 98 |
backbone_type: str = "vocos",
|
| 99 |
+
backbone_dim: int = 768,
|
| 100 |
+
backbone_num_blocks: int = 12,
|
| 101 |
+
backbone_intermediate_dim: int = 2304,
|
| 102 |
backbone_kernel_size: int = 7,
|
| 103 |
backbone_layer_scale_init_value: float = 1e-6,
|
| 104 |
|
|
|
|
| 126 |
# Feature dimensions
|
| 127 |
self.feature_dim = feature_dim
|
| 128 |
self.encoder_dim = encoder_dim
|
| 129 |
+
self.encoder_rates = encoder_rates if encoder_rates is not None else [2, 4, 5, 8]
|
| 130 |
self.latent_dim = latent_dim if latent_dim is not None else feature_dim
|
| 131 |
|
| 132 |
# Quantizer
|
|
|
|
| 160 |
@property
|
| 161 |
def frame_rate(self) -> float:
|
| 162 |
"""Returns the frame rate (tokens per second)."""
|
| 163 |
+
return self.sample_rate / self.hop_length
|