DeepBeepMeep commited on
Commit
d8492b9
·
verified ·
1 Parent(s): 6889843

Upload 3 files

Browse files
qwen3_tts_tokenizer_12hz/config.json ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3TTSTokenizerV2Model"
4
+ ],
5
+ "model_type": "qwen3_tts_tokenizer_12hz",
6
+ "encoder_valid_num_quantizers": 16,
7
+ "input_sample_rate": 24000,
8
+ "output_sample_rate": 24000,
9
+ "decode_upsample_rate": 1920,
10
+ "encode_downsample_rate": 1920,
11
+ "decoder_config": {
12
+ "attention_bias": false,
13
+ "attention_dropout": 0.0,
14
+ "latent_dim": 1024,
15
+ "codebook_dim": 512,
16
+ "codebook_size": 2048,
17
+ "decoder_dim": 1536,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 512,
20
+ "intermediate_size": 1024,
21
+ "layer_scale_initial_scale": 0.01,
22
+ "max_position_embeddings": 8000,
23
+ "head_dim": 64,
24
+ "num_attention_heads": 16,
25
+ "num_hidden_layers": 8,
26
+ "num_key_value_heads": 16,
27
+ "num_quantizers": 16,
28
+ "num_semantic_quantizers": 1,
29
+ "rms_norm_eps": 1e-05,
30
+ "rope_theta": 10000,
31
+ "semantic_codebook_size": 4096,
32
+ "sliding_window": 72,
33
+ "upsample_rates": [
34
+ 8,
35
+ 5,
36
+ 4,
37
+ 3
38
+ ],
39
+ "upsampling_ratios": [
40
+ 2,
41
+ 2
42
+ ],
43
+ "vector_quantization_hidden_dimension": 512
44
+ },
45
+ "encoder_config": {
46
+ "_frame_rate": 12.5,
47
+ "attention_bias": false,
48
+ "attention_dropout": 0.0,
49
+ "audio_channels": 1,
50
+ "codebook_dim": 256,
51
+ "codebook_size": 2048,
52
+ "compress": 2,
53
+ "dilation_growth_rate": 2,
54
+ "dtype": "float32",
55
+ "head_dim": 64,
56
+ "hidden_act": "gelu",
57
+ "hidden_size": 512,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 2048,
60
+ "kernel_size": 7,
61
+ "last_kernel_size": 3,
62
+ "layer_scale_initial_scale": 0.01,
63
+ "max_position_embeddings": 8000,
64
+ "norm_eps": 1e-05,
65
+ "normalize": false,
66
+ "num_attention_heads": 8,
67
+ "num_filters": 64,
68
+ "num_hidden_layers": 8,
69
+ "num_key_value_heads": 8,
70
+ "num_quantizers": 32,
71
+ "num_residual_layers": 1,
72
+ "num_semantic_quantizers": 1,
73
+ "pad_mode": "constant",
74
+ "residual_kernel_size": 3,
75
+ "rope_theta": 10000.0,
76
+ "sampling_rate": 24000,
77
+ "sliding_window": 250,
78
+ "transformers_version": "4.57.0.dev0",
79
+ "trim_right_ratio": 1.0,
80
+ "upsample_groups": 512,
81
+ "upsampling_ratios": [
82
+ 8,
83
+ 6,
84
+ 5,
85
+ 4
86
+ ],
87
+ "use_cache": false,
88
+ "use_causal_conv": true,
89
+ "use_conv_shortcut": false,
90
+ "use_streaming": false,
91
+ "vector_quantization_hidden_dimension": 256
92
+ },
93
+ "transformers_version": "4.57.3"
94
+ }
qwen3_tts_tokenizer_12hz/configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework": "pytorch", "task": "feature-extraction", "allow_remote": true}
qwen3_tts_tokenizer_12hz/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length_s": null,
3
+ "feature_extractor_type": "EncodecFeatureExtractor",
4
+ "feature_size": 1,
5
+ "overlap": null,
6
+ "padding_side": "right",
7
+ "padding_value": 0.0,
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 24000
10
+ }