lainlives committed on
Commit
07d82db
·
verified ·
1 Parent(s): 1e6db37

Upload folder using huggingface_hub

Browse files
Stewart-0.6b/training_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "speaker_name": "Patrick Stewart",
3
+ "init_model": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
4
+ "model_source": "HuggingFace",
5
+ "batch_size": 32,
6
+ "lr": "1e-07",
7
+ "epochs": 30,
8
+ "grad_acc": 2,
9
+ "use_experimental_speedup": true,
10
+ "resume_from_checkpoint": "latest"
11
+ }
Stewart-1.7/training_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "speaker_name": "Patrick Stewart",
3
+ "init_model": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
4
+ "model_source": "HuggingFace",
5
+ "batch_size": 32,
6
+ "lr": "1e-7",
7
+ "epochs": 10,
8
+ "grad_acc": 2,
9
+ "use_experimental_speedup": true,
10
+ "resume_from_checkpoint": "latest"
11
+ }
Stewart/checkpoint-epoch-0/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Stewart/checkpoint-epoch-0/README.md ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: text-to-speech
4
+ language:
5
+ - zh
6
+ - en
7
+ - ja
8
+ - ko
9
+ - de
10
+ - fr
11
+ - ru
12
+ - pt
13
+ - es
14
+ - it
15
+ tags:
16
+ - audio
17
+ - tts
18
+ - voice-clone
19
+ ---
20
+
21
+ # Qwen3-TTS-12Hz-0.6B-Base
22
+
23
+ [**Qwen3-TTS Technical Report**](https://huggingface.co/papers/2601.15621) | [**GitHub Repository**](https://github.com/QwenLM/Qwen3-TTS) | [**Hugging Face Demo**](https://huggingface.co/spaces/Qwen/Qwen3-TTS)
24
+
25
+ Qwen3-TTS is a family of advanced multilingual, controllable, robust, and streaming text-to-speech models. Trained on over 5 million hours of speech data spanning 10 languages, Qwen3-TTS supports state-of-the-art 3-second voice cloning and description-based control.
26
+
27
+ This specific checkpoint is the **0.6B Base model**, which is capable of rapid voice cloning from a user-provided audio input.
28
+
29
+ ## Quickstart
30
+
31
+ ### Installation
32
+
33
+ ```bash
34
+ pip install -U qwen-tts
35
+ # Optional: for optimized performance
36
+ pip install -U flash-attn --no-build-isolation
37
+ ```
38
+
39
+ ### Sample Usage (Voice Clone)
40
+
41
+ To clone a voice and synthesize new content using the Base model, you can use the following code snippet:
42
+
43
+ ```python
44
+ import torch
45
+ import soundfile as sf
46
+ from qwen_tts import Qwen3TTSModel
47
+
48
+ # Load the model
49
+ model = Qwen3TTSModel.from_pretrained(
50
+ "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
51
+ device_map="cuda:0",
52
+ dtype=torch.bfloat16,
53
+ attn_implementation="flash_attention_2",
54
+ )
55
+
56
+ # Reference audio for cloning
57
+ ref_audio = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone.wav"
58
+ ref_text = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you."
59
+
60
+ # Generate speech
61
+ wavs, sr = model.generate_voice_clone(
62
+ text="I am solving the equation: x = [-b ± √(b²-4ac)] / 2a? Nobody can — it's a disaster (◍•͈⌔•͈◍), very sad!",
63
+ language="English",
64
+ ref_audio=ref_audio,
65
+ ref_text=ref_text,
66
+ )
67
+
68
+ # Save the resulting audio
69
+ sf.write("output_voice_clone.wav", wavs[0], sr)
70
+ ```
71
+
72
+ ## Overview
73
+ ### Introduction
74
+
75
+ <p align="center">
76
+ <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/qwen3_tts_introduction.png" width="90%"/>
77
+ </p>
78
+
79
+ Qwen3-TTS covers 10 major languages (Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, and Italian) as well as multiple dialectal voice profiles to meet global application needs. Key features:
80
+
81
+ * **Powerful Speech Representation**: Powered by the self-developed Qwen3-TTS-Tokenizer-12Hz, it achieves efficient acoustic compression and high-dimensional semantic modeling.
82
+ * **Universal End-to-End Architecture**: Utilizing a discrete multi-codebook LM architecture, it realizes full-information end-to-end speech modeling.
83
+ * **Extreme Low-Latency Streaming Generation**: End-to-end synthesis latency as low as 97ms, meeting the rigorous demands of real-time interactive scenarios.
84
+ * **Intelligent Text Understanding and Voice Control**: Supports speech generation driven by natural language instructions, allowing for flexible control over multi-dimensional acoustic attributes.
85
+
86
+ ### Model Architecture
87
+
88
+ <p align="center">
89
+ <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/overview.png" width="80%"/>
90
+ </p>
91
+
92
+ ## Citation
93
+
94
+ If you find this work useful, please consider citing the technical report:
95
+
96
+ ```BibTeX
97
+ @article{Qwen3-TTS,
98
+ title={Qwen3-TTS Technical Report},
99
+ author={Hangrui Hu and Xinfa Zhu and Ting He and Dake Guo and Bin Zhang and Xiong Wang and Zhifang Guo and Ziyue Jiang and Hongkun Hao and Zishan Guo and Xinyu Zhang and Pei Zhang and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
100
+ journal={arXiv preprint arXiv:2601.15621},
101
+ year={2026}
102
+ }
103
+ ```
Stewart/checkpoint-epoch-0/accelerate_state/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f4ded20a82d1addd00c58992fd8d10894bab28cdbb2e07b9b80dcf980df80dc
3
+ size 1829344304
Stewart/checkpoint-epoch-0/accelerate_state/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7abc1ac6a9f297ce6c490744ff5b60e658d8b6ac432d3d4a03c4cd5387db012c
3
+ size 3623493273
Stewart/checkpoint-epoch-0/accelerate_state/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e7731d221a0a78c9505c61dec568e1b4098493a34dc8d0aceaf108f1334b1b2
3
+ size 14757
Stewart/checkpoint-epoch-0/config.json ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3TTSForConditionalGeneration"
4
+ ],
5
+ "assistant_token_id": 77091,
6
+ "im_end_token_id": 151645,
7
+ "im_start_token_id": 151644,
8
+ "tts_bos_token_id": 151672,
9
+ "tts_eos_token_id": 151673,
10
+ "tts_pad_token_id": 151671,
11
+ "model_type": "qwen3_tts",
12
+ "tokenizer_type": "qwen3_tts_tokenizer_12hz",
13
+ "tts_model_size": "0b6",
14
+ "tts_model_type": "custom_voice",
15
+ "speaker_encoder_config": {
16
+ "enc_dim": 1024,
17
+ "sample_rate": 24000
18
+ },
19
+ "talker_config": {
20
+ "attention_bias": false,
21
+ "attention_dropout": 0,
22
+ "code_predictor_config": {
23
+ "_name_or_path": "",
24
+ "add_cross_attention": false,
25
+ "architectures": null,
26
+ "attention_bias": false,
27
+ "attention_dropout": 0,
28
+ "bad_words_ids": null,
29
+ "begin_suppress_tokens": null,
30
+ "bos_token_id": null,
31
+ "chunk_size_feed_forward": 0,
32
+ "cross_attention_hidden_size": null,
33
+ "decoder_start_token_id": null,
34
+ "diversity_penalty": 0.0,
35
+ "do_sample": false,
36
+ "early_stopping": false,
37
+ "encoder_no_repeat_ngram_size": 0,
38
+ "eos_token_id": null,
39
+ "exponential_decay_length_penalty": null,
40
+ "finetuning_task": null,
41
+ "forced_bos_token_id": null,
42
+ "forced_eos_token_id": null,
43
+ "head_dim": 128,
44
+ "hidden_act": "silu",
45
+ "hidden_size": 1024,
46
+ "id2label": {
47
+ "0": "LABEL_0",
48
+ "1": "LABEL_1"
49
+ },
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 3072,
52
+ "is_decoder": false,
53
+ "is_encoder_decoder": false,
54
+ "label2id": {
55
+ "LABEL_0": 0,
56
+ "LABEL_1": 1
57
+ },
58
+ "layer_types": [
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention"
64
+ ],
65
+ "length_penalty": 1.0,
66
+ "max_length": 20,
67
+ "max_position_embeddings": 65536,
68
+ "max_window_layers": 28,
69
+ "min_length": 0,
70
+ "model_type": "qwen3_tts_talker_code_predictor",
71
+ "no_repeat_ngram_size": 0,
72
+ "num_attention_heads": 16,
73
+ "num_beam_groups": 1,
74
+ "num_beams": 1,
75
+ "num_code_groups": 16,
76
+ "num_hidden_layers": 5,
77
+ "num_key_value_heads": 8,
78
+ "num_return_sequences": 1,
79
+ "output_attentions": false,
80
+ "output_hidden_states": false,
81
+ "output_scores": false,
82
+ "pad_token_id": null,
83
+ "prefix": null,
84
+ "problem_type": null,
85
+ "pruned_heads": {},
86
+ "remove_invalid_values": false,
87
+ "repetition_penalty": 1.0,
88
+ "return_dict": true,
89
+ "return_dict_in_generate": false,
90
+ "rms_norm_eps": 1e-06,
91
+ "rope_scaling": null,
92
+ "rope_theta": 1000000,
93
+ "sep_token_id": null,
94
+ "sliding_window": null,
95
+ "suppress_tokens": null,
96
+ "task_specific_params": null,
97
+ "temperature": 1.0,
98
+ "tf_legacy_loss": false,
99
+ "tie_encoder_decoder": false,
100
+ "tie_word_embeddings": false,
101
+ "tokenizer_class": null,
102
+ "top_k": 50,
103
+ "top_p": 1.0,
104
+ "dtype": null,
105
+ "torchscript": false,
106
+ "typical_p": 1.0,
107
+ "use_bfloat16": false,
108
+ "use_cache": true,
109
+ "use_sliding_window": false,
110
+ "vocab_size": 2048
111
+ },
112
+ "codec_bos_id": 2149,
113
+ "codec_eos_token_id": 2150,
114
+ "codec_think_id": 2154,
115
+ "codec_language_id": {
116
+ "chinese": 2055,
117
+ "english": 2050,
118
+ "german": 2053,
119
+ "italian": 2070,
120
+ "portuguese": 2071,
121
+ "spanish": 2054,
122
+ "japanese": 2058,
123
+ "korean": 2064,
124
+ "french": 2061,
125
+ "russian": 2069
126
+ },
127
+ "codec_nothink_id": 2155,
128
+ "codec_pad_id": 2148,
129
+ "codec_think_bos_id": 2156,
130
+ "codec_think_eos_id": 2157,
131
+ "spk_id": {
132
+ "Patrick Stewart": 3000
133
+ },
134
+ "spk_is_dialect": {
135
+ "Patrick Stewart": false
136
+ },
137
+ "head_dim": 128,
138
+ "hidden_act": "silu",
139
+ "hidden_size": 1024,
140
+ "initializer_range": 0.02,
141
+ "intermediate_size": 3072,
142
+ "max_position_embeddings": 32768,
143
+ "model_type": "qwen3_tts_talker",
144
+ "num_attention_heads": 16,
145
+ "num_code_groups": 16,
146
+ "num_hidden_layers": 28,
147
+ "num_key_value_heads": 8,
148
+ "position_id_per_seconds": 13,
149
+ "rms_norm_eps": 1e-06,
150
+ "rope_scaling": {
151
+ "interleaved": true,
152
+ "mrope_section": [
153
+ 24,
154
+ 20,
155
+ 20
156
+ ],
157
+ "rope_type": "default",
158
+ "type": "default"
159
+ },
160
+ "rope_theta": 1000000,
161
+ "sliding_window": null,
162
+ "text_hidden_size": 2048,
163
+ "text_vocab_size": 151936,
164
+ "use_cache": true,
165
+ "use_sliding_window": false,
166
+ "vocab_size": 3072
167
+ },
168
+ "transformers_version": "4.57.3"
169
+ }
Stewart/checkpoint-epoch-0/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "repetition_penalty": 1.05,
4
+ "temperature": 0.9,
5
+ "top_p": 1.0,
6
+ "top_k": 50,
7
+ "subtalker_dosample": true,
8
+ "subtalker_temperature": 0.9,
9
+ "subtalker_top_p": 1.0,
10
+ "subtalker_top_k": 50,
11
+ "max_new_tokens": 8192
12
+ }
Stewart/checkpoint-epoch-0/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
Stewart/checkpoint-epoch-0/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e60852809dd36bc8e6bcde610fcba6e6af2e6b31085c9657d9898b7ba2f106ba
3
+ size 1811626544
Stewart/checkpoint-epoch-0/preprocessor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "padding_side": "left",
3
+ "padding_value": 0.0,
4
+ "processor_class": "Qwen3TTSProcessor",
5
+ "return_attention_mask": true
6
+ }
Stewart/checkpoint-epoch-0/speech_tokenizer/config.json ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3TTSTokenizerV2Model"
4
+ ],
5
+ "model_type": "qwen3_tts_tokenizer_12hz",
6
+ "encoder_valid_num_quantizers": 16,
7
+ "input_sample_rate": 24000,
8
+ "output_sample_rate": 24000,
9
+ "decode_upsample_rate": 1920,
10
+ "encode_downsample_rate": 1920,
11
+ "decoder_config": {
12
+ "attention_bias": false,
13
+ "attention_dropout": 0.0,
14
+ "latent_dim": 1024,
15
+ "codebook_dim": 512,
16
+ "codebook_size": 2048,
17
+ "decoder_dim": 1536,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 512,
20
+ "intermediate_size": 1024,
21
+ "layer_scale_initial_scale": 0.01,
22
+ "max_position_embeddings": 8000,
23
+ "head_dim": 64,
24
+ "num_attention_heads": 16,
25
+ "num_hidden_layers": 8,
26
+ "num_key_value_heads": 16,
27
+ "num_quantizers": 16,
28
+ "num_semantic_quantizers": 1,
29
+ "rms_norm_eps": 1e-05,
30
+ "rope_theta": 10000,
31
+ "semantic_codebook_size": 4096,
32
+ "sliding_window": 72,
33
+ "upsample_rates": [
34
+ 8,
35
+ 5,
36
+ 4,
37
+ 3
38
+ ],
39
+ "upsampling_ratios": [
40
+ 2,
41
+ 2
42
+ ],
43
+ "vector_quantization_hidden_dimension": 512
44
+ },
45
+ "encoder_config": {
46
+ "_frame_rate": 12.5,
47
+ "attention_bias": false,
48
+ "attention_dropout": 0.0,
49
+ "audio_channels": 1,
50
+ "codebook_dim": 256,
51
+ "codebook_size": 2048,
52
+ "compress": 2,
53
+ "dilation_growth_rate": 2,
54
+ "dtype": "float32",
55
+ "head_dim": 64,
56
+ "hidden_act": "gelu",
57
+ "hidden_size": 512,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 2048,
60
+ "kernel_size": 7,
61
+ "last_kernel_size": 3,
62
+ "layer_scale_initial_scale": 0.01,
63
+ "max_position_embeddings": 8000,
64
+ "norm_eps": 1e-05,
65
+ "normalize": false,
66
+ "num_attention_heads": 8,
67
+ "num_filters": 64,
68
+ "num_hidden_layers": 8,
69
+ "num_key_value_heads": 8,
70
+ "num_quantizers": 32,
71
+ "num_residual_layers": 1,
72
+ "num_semantic_quantizers": 1,
73
+ "pad_mode": "constant",
74
+ "residual_kernel_size": 3,
75
+ "rope_theta": 10000.0,
76
+ "sampling_rate": 24000,
77
+ "sliding_window": 250,
78
+ "transformers_version": "4.57.0.dev0",
79
+ "trim_right_ratio": 1.0,
80
+ "upsample_groups": 512,
81
+ "upsampling_ratios": [
82
+ 8,
83
+ 6,
84
+ 5,
85
+ 4
86
+ ],
87
+ "use_cache": false,
88
+ "use_causal_conv": true,
89
+ "use_conv_shortcut": false,
90
+ "use_streaming": false,
91
+ "vector_quantization_hidden_dimension": 256
92
+ },
93
+ "transformers_version": "4.57.3"
94
+ }
Stewart/checkpoint-epoch-0/speech_tokenizer/configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework": "pytorch", "task": "feature-extraction", "allow_remote": true}
Stewart/checkpoint-epoch-0/speech_tokenizer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:836b7b357f5ea43e889936a3709af68dfe3751881acefe4ecf0dbd30ba571258
3
+ size 682293092
Stewart/checkpoint-epoch-0/speech_tokenizer/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length_s": null,
3
+ "feature_extractor_type": "EncodecFeatureExtractor",
4
+ "feature_size": 1,
5
+ "overlap": null,
6
+ "padding_side": "right",
7
+ "padding_value": 0.0,
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 24000
10
+ }
Stewart/checkpoint-epoch-0/tokenizer_config.json ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<|audio_start|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|audio_end|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<tts_pad>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<tts_text_bos>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<tts_text_eod>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<tts_text_bos_single>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<|audio_pad|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ }
269
+ },
270
+ "additional_special_tokens": [
271
+ "<|im_start|>",
272
+ "<|im_end|>",
273
+ "<|object_ref_start|>",
274
+ "<|object_ref_end|>",
275
+ "<|box_start|>",
276
+ "<|box_end|>",
277
+ "<|quad_start|>",
278
+ "<|quad_end|>",
279
+ "<|vision_start|>",
280
+ "<|vision_end|>",
281
+ "<|vision_pad|>",
282
+ "<|image_pad|>",
283
+ "<|video_pad|>",
284
+ "<|audio_start|>",
285
+ "<|audio_end|>",
286
+ "<tts_pad>",
287
+ "<tts_text_bos>",
288
+ "<tts_text_bos_single>",
289
+ "<|audio_pad|>"
290
+ ],
291
+ "extra_special_tokens": {
292
+ "image_token": "<|image_pad|>",
293
+ "audio_token": "<|audio_pad|>",
294
+ "video_token": "<|video_pad|>",
295
+ "vision_bos_token": "<|vision_start|>",
296
+ "vision_eos_token": "<|vision_end|>",
297
+ "audio_bos_token": "<|audio_start|>",
298
+ "audio_eos_token": "<|audio_end|>"
299
+ },
300
+ "bos_token": null,
301
+ "clean_up_tokenization_spaces": false,
302
+ "eos_token": "<|im_end|>",
303
+ "errors": "replace",
304
+ "model_max_length": 131072,
305
+ "pad_token": "<|endoftext|>",
306
+ "split_special_tokens": false,
307
+ "tokenizer_class": "Qwen2Tokenizer",
308
+ "unk_token": null,
309
+ "image_token": "<|image_pad|>",
310
+ "audio_token": "<|audio_pad|>",
311
+ "video_token": "<|video_pad|>",
312
+ "vision_bos_token": "<|vision_start|>",
313
+ "vision_eos_token": "<|vision_end|>",
314
+ "audio_bos_token": "<|audio_start|>",
315
+ "audio_eos_token": "<|audio_end|>"
316
+ }
Stewart/checkpoint-epoch-0/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
Stewart/training_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "speaker_name": "Patrick Stewart",
3
+ "init_model": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
4
+ "model_source": "HuggingFace",
5
+ "batch_size": 64,
6
+ "lr": "2e-06",
7
+ "epochs": 5,
8
+ "grad_acc": 1,
9
+ "use_experimental_speedup": true,
10
+ "resume_from_checkpoint": "latest"
11
+ }