Upload 3 files

Browse files

Files changed (3)

qwen3_tts_tokenizer_12hz/config.json +94 -0
qwen3_tts_tokenizer_12hz/configuration.json +1 -0
qwen3_tts_tokenizer_12hz/preprocessor_config.json +10 -0

qwen3_tts_tokenizer_12hz/config.json ADDED Viewed

	@@ -0,0 +1,94 @@

+{
+  "architectures": [
+    "Qwen3TTSTokenizerV2Model"
+  ],
+  "model_type": "qwen3_tts_tokenizer_12hz",
+  "encoder_valid_num_quantizers": 16,
+  "input_sample_rate": 24000,
+  "output_sample_rate": 24000,
+  "decode_upsample_rate": 1920,
+  "encode_downsample_rate": 1920,
+  "decoder_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "latent_dim": 1024,
+    "codebook_dim": 512,
+    "codebook_size": 2048,
+    "decoder_dim": 1536,
+    "hidden_act": "silu",
+    "hidden_size": 512,
+    "intermediate_size": 1024,
+    "layer_scale_initial_scale": 0.01,
+    "max_position_embeddings": 8000,
+    "head_dim": 64,
+    "num_attention_heads": 16,
+    "num_hidden_layers": 8,
+    "num_key_value_heads": 16,
+    "num_quantizers": 16,
+    "num_semantic_quantizers": 1,
+    "rms_norm_eps": 1e-05,
+    "rope_theta": 10000,
+    "semantic_codebook_size": 4096,
+    "sliding_window": 72,
+    "upsample_rates": [
+      8,
+      5,
+      4,
+      3
+    ],
+    "upsampling_ratios": [
+      2,
+      2
+    ],
+    "vector_quantization_hidden_dimension": 512
+  },
+  "encoder_config": {
+    "_frame_rate": 12.5,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "audio_channels": 1,
+    "codebook_dim": 256,
+    "codebook_size": 2048,
+    "compress": 2,
+    "dilation_growth_rate": 2,
+    "dtype": "float32",
+    "head_dim": 64,
+    "hidden_act": "gelu",
+    "hidden_size": 512,
+    "initializer_range": 0.02,
+    "intermediate_size": 2048,
+    "kernel_size": 7,
+    "last_kernel_size": 3,
+    "layer_scale_initial_scale": 0.01,
+    "max_position_embeddings": 8000,
+    "norm_eps": 1e-05,
+    "normalize": false,
+    "num_attention_heads": 8,
+    "num_filters": 64,
+    "num_hidden_layers": 8,
+    "num_key_value_heads": 8,
+    "num_quantizers": 32,
+    "num_residual_layers": 1,
+    "num_semantic_quantizers": 1,
+    "pad_mode": "constant",
+    "residual_kernel_size": 3,
+    "rope_theta": 10000.0,
+    "sampling_rate": 24000,
+    "sliding_window": 250,
+    "transformers_version": "4.57.0.dev0",
+    "trim_right_ratio": 1.0,
+    "upsample_groups": 512,
+    "upsampling_ratios": [
+      8,
+      6,
+      5,
+      4
+    ],
+    "use_cache": false,
+    "use_causal_conv": true,
+    "use_conv_shortcut": false,
+    "use_streaming": false,
+    "vector_quantization_hidden_dimension": 256
+  },
+  "transformers_version": "4.57.3"
+}

qwen3_tts_tokenizer_12hz/configuration.json ADDED Viewed

	@@ -0,0 +1 @@


+{"framework": "pytorch", "task": "feature-extraction", "allow_remote": true}

qwen3_tts_tokenizer_12hz/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "chunk_length_s": null,
+  "feature_extractor_type": "EncodecFeatureExtractor",
+  "feature_size": 1,
+  "overlap": null,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "return_attention_mask": true,
+  "sampling_rate": 24000
+}