klemenk committed on
Commit
3592734
·
verified ·
1 Parent(s): bc6461b

Update configuration_wavtokenizer.py

Browse files
Files changed (1) hide show
  1. configuration_wavtokenizer.py +7 -7
configuration_wavtokenizer.py CHANGED
@@ -85,20 +85,20 @@ class WavTokenizerConfig(PretrainedConfig):
85
 
86
  # Feature dimensions
87
  feature_dim: int = 512,
88
- encoder_dim: int = 64,
89
  encoder_rates: list = None,
90
  latent_dim: int = None,
91
 
92
  # Quantizer parameters
93
  codebook_size: int = 4096,
94
- codebook_dim: int = 8,
95
  num_quantizers: int = 1,
96
 
97
  # Backbone parameters
98
  backbone_type: str = "vocos",
99
- backbone_dim: int = 512,
100
- backbone_num_blocks: int = 8,
101
- backbone_intermediate_dim: int = 1536,
102
  backbone_kernel_size: int = 7,
103
  backbone_layer_scale_init_value: float = 1e-6,
104
 
@@ -126,7 +126,7 @@ class WavTokenizerConfig(PretrainedConfig):
126
  # Feature dimensions
127
  self.feature_dim = feature_dim
128
  self.encoder_dim = encoder_dim
129
- self.encoder_rates = encoder_rates if encoder_rates is not None else [8, 5, 4, 2]
130
  self.latent_dim = latent_dim if latent_dim is not None else feature_dim
131
 
132
  # Quantizer
@@ -160,4 +160,4 @@ class WavTokenizerConfig(PretrainedConfig):
160
  @property
161
  def frame_rate(self) -> float:
162
  """Returns the frame rate (tokens per second)."""
163
- return self.sample_rate / self.hop_length
 
85
 
86
  # Feature dimensions
87
  feature_dim: int = 512,
88
+ encoder_dim: int = 32,
89
  encoder_rates: list = None,
90
  latent_dim: int = None,
91
 
92
  # Quantizer parameters
93
  codebook_size: int = 4096,
94
+ codebook_dim: int = 512,
95
  num_quantizers: int = 1,
96
 
97
  # Backbone parameters
98
  backbone_type: str = "vocos",
99
+ backbone_dim: int = 768,
100
+ backbone_num_blocks: int = 12,
101
+ backbone_intermediate_dim: int = 2304,
102
  backbone_kernel_size: int = 7,
103
  backbone_layer_scale_init_value: float = 1e-6,
104
 
 
126
  # Feature dimensions
127
  self.feature_dim = feature_dim
128
  self.encoder_dim = encoder_dim
129
+ self.encoder_rates = encoder_rates if encoder_rates is not None else [2, 4, 5, 8]
130
  self.latent_dim = latent_dim if latent_dim is not None else feature_dim
131
 
132
  # Quantizer
 
160
  @property
161
  def frame_rate(self) -> float:
162
  """Returns the frame rate (tokens per second)."""
163
+ return self.sample_rate / self.hop_length