Upload folder using huggingface_hub
Browse files
- Stewart-0.6b/training_config.json +11 -0
- Stewart-1.7/training_config.json +11 -0
- Stewart/checkpoint-epoch-0/.gitattributes +35 -0
- Stewart/checkpoint-epoch-0/README.md +103 -0
- Stewart/checkpoint-epoch-0/accelerate_state/model.safetensors +3 -0
- Stewart/checkpoint-epoch-0/accelerate_state/optimizer.bin +3 -0
- Stewart/checkpoint-epoch-0/accelerate_state/random_states_0.pkl +3 -0
- Stewart/checkpoint-epoch-0/config.json +169 -0
- Stewart/checkpoint-epoch-0/generation_config.json +12 -0
- Stewart/checkpoint-epoch-0/merges.txt +0 -0
- Stewart/checkpoint-epoch-0/model.safetensors +3 -0
- Stewart/checkpoint-epoch-0/preprocessor_config.json +6 -0
- Stewart/checkpoint-epoch-0/speech_tokenizer/config.json +94 -0
- Stewart/checkpoint-epoch-0/speech_tokenizer/configuration.json +1 -0
- Stewart/checkpoint-epoch-0/speech_tokenizer/model.safetensors +3 -0
- Stewart/checkpoint-epoch-0/speech_tokenizer/preprocessor_config.json +10 -0
- Stewart/checkpoint-epoch-0/tokenizer_config.json +316 -0
- Stewart/checkpoint-epoch-0/vocab.json +0 -0
- Stewart/training_config.json +11 -0
Stewart-0.6b/training_config.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"speaker_name": "Patrick Stewart",
|
| 3 |
+
"init_model": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
|
| 4 |
+
"model_source": "HuggingFace",
|
| 5 |
+
"batch_size": 32,
|
| 6 |
+
"lr": "1e-07",
|
| 7 |
+
"epochs": 30,
|
| 8 |
+
"grad_acc": 2,
|
| 9 |
+
"use_experimental_speedup": true,
|
| 10 |
+
"resume_from_checkpoint": "latest"
|
| 11 |
+
}
|
Stewart-1.7/training_config.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"speaker_name": "Patrick Stewart",
|
| 3 |
+
"init_model": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
|
| 4 |
+
"model_source": "HuggingFace",
|
| 5 |
+
"batch_size": 32,
|
| 6 |
+
"lr": "1e-7",
|
| 7 |
+
"epochs": 10,
|
| 8 |
+
"grad_acc": 2,
|
| 9 |
+
"use_experimental_speedup": true,
|
| 10 |
+
"resume_from_checkpoint": "latest"
|
| 11 |
+
}
|
Stewart/checkpoint-epoch-0/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Stewart/checkpoint-epoch-0/README.md
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
pipeline_tag: text-to-speech
|
| 4 |
+
language:
|
| 5 |
+
- zh
|
| 6 |
+
- en
|
| 7 |
+
- ja
|
| 8 |
+
- ko
|
| 9 |
+
- de
|
| 10 |
+
- fr
|
| 11 |
+
- ru
|
| 12 |
+
- pt
|
| 13 |
+
- es
|
| 14 |
+
- it
|
| 15 |
+
tags:
|
| 16 |
+
- audio
|
| 17 |
+
- tts
|
| 18 |
+
- voice-clone
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
# Qwen3-TTS-12Hz-0.6B-Base
|
| 22 |
+
|
| 23 |
+
[**Qwen3-TTS Technical Report**](https://huggingface.co/papers/2601.15621) | [**GitHub Repository**](https://github.com/QwenLM/Qwen3-TTS) | [**Hugging Face Demo**](https://huggingface.co/spaces/Qwen/Qwen3-TTS)
|
| 24 |
+
|
| 25 |
+
Qwen3-TTS is a family of advanced multilingual, controllable, robust, and streaming text-to-speech models. Trained on over 5 million hours of speech data spanning 10 languages, Qwen3-TTS supports state-of-the-art 3-second voice cloning and description-based control.
|
| 26 |
+
|
| 27 |
+
This specific checkpoint is the **0.6B Base model**, which is capable of rapid voice cloning from a user-provided audio input.
|
| 28 |
+
|
| 29 |
+
## Quickstart
|
| 30 |
+
|
| 31 |
+
### Installation
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
pip install -U qwen-tts
|
| 35 |
+
# Optional: for optimized performance
|
| 36 |
+
pip install -U flash-attn --no-build-isolation
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
### Sample Usage (Voice Clone)
|
| 40 |
+
|
| 41 |
+
To clone a voice and synthesize new content using the Base model, you can use the following code snippet:
|
| 42 |
+
|
| 43 |
+
```python
|
| 44 |
+
import torch
|
| 45 |
+
import soundfile as sf
|
| 46 |
+
from qwen_tts import Qwen3TTSModel
|
| 47 |
+
|
| 48 |
+
# Load the model
|
| 49 |
+
model = Qwen3TTSModel.from_pretrained(
|
| 50 |
+
"Qwen/Qwen3-TTS-12Hz-0.6B-Base",
|
| 51 |
+
device_map="cuda:0",
|
| 52 |
+
dtype=torch.bfloat16,
|
| 53 |
+
attn_implementation="flash_attention_2",
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
# Reference audio for cloning
|
| 57 |
+
ref_audio = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone.wav"
|
| 58 |
+
ref_text = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you."
|
| 59 |
+
|
| 60 |
+
# Generate speech
|
| 61 |
+
wavs, sr = model.generate_voice_clone(
|
| 62 |
+
text="I am solving the equation: x = [-b ± √(b²-4ac)] / 2a? Nobody can — it's a disaster (◍•͈⌔•͈◍), very sad!",
|
| 63 |
+
language="English",
|
| 64 |
+
ref_audio=ref_audio,
|
| 65 |
+
ref_text=ref_text,
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
# Save the resulting audio
|
| 69 |
+
sf.write("output_voice_clone.wav", wavs[0], sr)
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
## Overview
|
| 73 |
+
### Introduction
|
| 74 |
+
|
| 75 |
+
<p align="center">
|
| 76 |
+
<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/qwen3_tts_introduction.png" width="90%"/>
|
| 77 |
+
</p>
|
| 78 |
+
|
| 79 |
+
Qwen3-TTS covers 10 major languages (Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, and Italian) as well as multiple dialectal voice profiles to meet global application needs. Key features:
|
| 80 |
+
|
| 81 |
+
* **Powerful Speech Representation**: Powered by the self-developed Qwen3-TTS-Tokenizer-12Hz, it achieves efficient acoustic compression and high-dimensional semantic modeling.
|
| 82 |
+
* **Universal End-to-End Architecture**: Utilizing a discrete multi-codebook LM architecture, it realizes full-information end-to-end speech modeling.
|
| 83 |
+
* **Extreme Low-Latency Streaming Generation**: End-to-end synthesis latency as low as 97ms, meeting the rigorous demands of real-time interactive scenarios.
|
| 84 |
+
* **Intelligent Text Understanding and Voice Control**: Supports speech generation driven by natural language instructions, allowing for flexible control over multi-dimensional acoustic attributes.
|
| 85 |
+
|
| 86 |
+
### Model Architecture
|
| 87 |
+
|
| 88 |
+
<p align="center">
|
| 89 |
+
<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/overview.png" width="80%"/>
|
| 90 |
+
</p>
|
| 91 |
+
|
| 92 |
+
## Citation
|
| 93 |
+
|
| 94 |
+
If you find this work useful, please consider citing the technical report:
|
| 95 |
+
|
| 96 |
+
```BibTeX
|
| 97 |
+
@article{Qwen3-TTS,
|
| 98 |
+
title={Qwen3-TTS Technical Report},
|
| 99 |
+
author={Hangrui Hu and Xinfa Zhu and Ting He and Dake Guo and Bin Zhang and Xiong Wang and Zhifang Guo and Ziyue Jiang and Hongkun Hao and Zishan Guo and Xinyu Zhang and Pei Zhang and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
|
| 100 |
+
journal={arXiv preprint arXiv:2601.15621},
|
| 101 |
+
year={2026}
|
| 102 |
+
}
|
| 103 |
+
```
|
Stewart/checkpoint-epoch-0/accelerate_state/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f4ded20a82d1addd00c58992fd8d10894bab28cdbb2e07b9b80dcf980df80dc
|
| 3 |
+
size 1829344304
|
Stewart/checkpoint-epoch-0/accelerate_state/optimizer.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7abc1ac6a9f297ce6c490744ff5b60e658d8b6ac432d3d4a03c4cd5387db012c
|
| 3 |
+
size 3623493273
|
Stewart/checkpoint-epoch-0/accelerate_state/random_states_0.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e7731d221a0a78c9505c61dec568e1b4098493a34dc8d0aceaf108f1334b1b2
|
| 3 |
+
size 14757
|
Stewart/checkpoint-epoch-0/config.json
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3TTSForConditionalGeneration"
|
| 4 |
+
],
|
| 5 |
+
"assistant_token_id": 77091,
|
| 6 |
+
"im_end_token_id": 151645,
|
| 7 |
+
"im_start_token_id": 151644,
|
| 8 |
+
"tts_bos_token_id": 151672,
|
| 9 |
+
"tts_eos_token_id": 151673,
|
| 10 |
+
"tts_pad_token_id": 151671,
|
| 11 |
+
"model_type": "qwen3_tts",
|
| 12 |
+
"tokenizer_type": "qwen3_tts_tokenizer_12hz",
|
| 13 |
+
"tts_model_size": "0b6",
|
| 14 |
+
"tts_model_type": "custom_voice",
|
| 15 |
+
"speaker_encoder_config": {
|
| 16 |
+
"enc_dim": 1024,
|
| 17 |
+
"sample_rate": 24000
|
| 18 |
+
},
|
| 19 |
+
"talker_config": {
|
| 20 |
+
"attention_bias": false,
|
| 21 |
+
"attention_dropout": 0,
|
| 22 |
+
"code_predictor_config": {
|
| 23 |
+
"_name_or_path": "",
|
| 24 |
+
"add_cross_attention": false,
|
| 25 |
+
"architectures": null,
|
| 26 |
+
"attention_bias": false,
|
| 27 |
+
"attention_dropout": 0,
|
| 28 |
+
"bad_words_ids": null,
|
| 29 |
+
"begin_suppress_tokens": null,
|
| 30 |
+
"bos_token_id": null,
|
| 31 |
+
"chunk_size_feed_forward": 0,
|
| 32 |
+
"cross_attention_hidden_size": null,
|
| 33 |
+
"decoder_start_token_id": null,
|
| 34 |
+
"diversity_penalty": 0.0,
|
| 35 |
+
"do_sample": false,
|
| 36 |
+
"early_stopping": false,
|
| 37 |
+
"encoder_no_repeat_ngram_size": 0,
|
| 38 |
+
"eos_token_id": null,
|
| 39 |
+
"exponential_decay_length_penalty": null,
|
| 40 |
+
"finetuning_task": null,
|
| 41 |
+
"forced_bos_token_id": null,
|
| 42 |
+
"forced_eos_token_id": null,
|
| 43 |
+
"head_dim": 128,
|
| 44 |
+
"hidden_act": "silu",
|
| 45 |
+
"hidden_size": 1024,
|
| 46 |
+
"id2label": {
|
| 47 |
+
"0": "LABEL_0",
|
| 48 |
+
"1": "LABEL_1"
|
| 49 |
+
},
|
| 50 |
+
"initializer_range": 0.02,
|
| 51 |
+
"intermediate_size": 3072,
|
| 52 |
+
"is_decoder": false,
|
| 53 |
+
"is_encoder_decoder": false,
|
| 54 |
+
"label2id": {
|
| 55 |
+
"LABEL_0": 0,
|
| 56 |
+
"LABEL_1": 1
|
| 57 |
+
},
|
| 58 |
+
"layer_types": [
|
| 59 |
+
"full_attention",
|
| 60 |
+
"full_attention",
|
| 61 |
+
"full_attention",
|
| 62 |
+
"full_attention",
|
| 63 |
+
"full_attention"
|
| 64 |
+
],
|
| 65 |
+
"length_penalty": 1.0,
|
| 66 |
+
"max_length": 20,
|
| 67 |
+
"max_position_embeddings": 65536,
|
| 68 |
+
"max_window_layers": 28,
|
| 69 |
+
"min_length": 0,
|
| 70 |
+
"model_type": "qwen3_tts_talker_code_predictor",
|
| 71 |
+
"no_repeat_ngram_size": 0,
|
| 72 |
+
"num_attention_heads": 16,
|
| 73 |
+
"num_beam_groups": 1,
|
| 74 |
+
"num_beams": 1,
|
| 75 |
+
"num_code_groups": 16,
|
| 76 |
+
"num_hidden_layers": 5,
|
| 77 |
+
"num_key_value_heads": 8,
|
| 78 |
+
"num_return_sequences": 1,
|
| 79 |
+
"output_attentions": false,
|
| 80 |
+
"output_hidden_states": false,
|
| 81 |
+
"output_scores": false,
|
| 82 |
+
"pad_token_id": null,
|
| 83 |
+
"prefix": null,
|
| 84 |
+
"problem_type": null,
|
| 85 |
+
"pruned_heads": {},
|
| 86 |
+
"remove_invalid_values": false,
|
| 87 |
+
"repetition_penalty": 1.0,
|
| 88 |
+
"return_dict": true,
|
| 89 |
+
"return_dict_in_generate": false,
|
| 90 |
+
"rms_norm_eps": 1e-06,
|
| 91 |
+
"rope_scaling": null,
|
| 92 |
+
"rope_theta": 1000000,
|
| 93 |
+
"sep_token_id": null,
|
| 94 |
+
"sliding_window": null,
|
| 95 |
+
"suppress_tokens": null,
|
| 96 |
+
"task_specific_params": null,
|
| 97 |
+
"temperature": 1.0,
|
| 98 |
+
"tf_legacy_loss": false,
|
| 99 |
+
"tie_encoder_decoder": false,
|
| 100 |
+
"tie_word_embeddings": false,
|
| 101 |
+
"tokenizer_class": null,
|
| 102 |
+
"top_k": 50,
|
| 103 |
+
"top_p": 1.0,
|
| 104 |
+
"dtype": null,
|
| 105 |
+
"torchscript": false,
|
| 106 |
+
"typical_p": 1.0,
|
| 107 |
+
"use_bfloat16": false,
|
| 108 |
+
"use_cache": true,
|
| 109 |
+
"use_sliding_window": false,
|
| 110 |
+
"vocab_size": 2048
|
| 111 |
+
},
|
| 112 |
+
"codec_bos_id": 2149,
|
| 113 |
+
"codec_eos_token_id": 2150,
|
| 114 |
+
"codec_think_id": 2154,
|
| 115 |
+
"codec_language_id": {
|
| 116 |
+
"chinese": 2055,
|
| 117 |
+
"english": 2050,
|
| 118 |
+
"german": 2053,
|
| 119 |
+
"italian": 2070,
|
| 120 |
+
"portuguese": 2071,
|
| 121 |
+
"spanish": 2054,
|
| 122 |
+
"japanese": 2058,
|
| 123 |
+
"korean": 2064,
|
| 124 |
+
"french": 2061,
|
| 125 |
+
"russian": 2069
|
| 126 |
+
},
|
| 127 |
+
"codec_nothink_id": 2155,
|
| 128 |
+
"codec_pad_id": 2148,
|
| 129 |
+
"codec_think_bos_id": 2156,
|
| 130 |
+
"codec_think_eos_id": 2157,
|
| 131 |
+
"spk_id": {
|
| 132 |
+
"Patrick Stewart": 3000
|
| 133 |
+
},
|
| 134 |
+
"spk_is_dialect": {
|
| 135 |
+
"Patrick Stewart": false
|
| 136 |
+
},
|
| 137 |
+
"head_dim": 128,
|
| 138 |
+
"hidden_act": "silu",
|
| 139 |
+
"hidden_size": 1024,
|
| 140 |
+
"initializer_range": 0.02,
|
| 141 |
+
"intermediate_size": 3072,
|
| 142 |
+
"max_position_embeddings": 32768,
|
| 143 |
+
"model_type": "qwen3_tts_talker",
|
| 144 |
+
"num_attention_heads": 16,
|
| 145 |
+
"num_code_groups": 16,
|
| 146 |
+
"num_hidden_layers": 28,
|
| 147 |
+
"num_key_value_heads": 8,
|
| 148 |
+
"position_id_per_seconds": 13,
|
| 149 |
+
"rms_norm_eps": 1e-06,
|
| 150 |
+
"rope_scaling": {
|
| 151 |
+
"interleaved": true,
|
| 152 |
+
"mrope_section": [
|
| 153 |
+
24,
|
| 154 |
+
20,
|
| 155 |
+
20
|
| 156 |
+
],
|
| 157 |
+
"rope_type": "default",
|
| 158 |
+
"type": "default"
|
| 159 |
+
},
|
| 160 |
+
"rope_theta": 1000000,
|
| 161 |
+
"sliding_window": null,
|
| 162 |
+
"text_hidden_size": 2048,
|
| 163 |
+
"text_vocab_size": 151936,
|
| 164 |
+
"use_cache": true,
|
| 165 |
+
"use_sliding_window": false,
|
| 166 |
+
"vocab_size": 3072
|
| 167 |
+
},
|
| 168 |
+
"transformers_version": "4.57.3"
|
| 169 |
+
}
|
Stewart/checkpoint-epoch-0/generation_config.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_sample": true,
|
| 3 |
+
"repetition_penalty": 1.05,
|
| 4 |
+
"temperature": 0.9,
|
| 5 |
+
"top_p": 1.0,
|
| 6 |
+
"top_k": 50,
|
| 7 |
+
"subtalker_dosample": true,
|
| 8 |
+
"subtalker_temperature": 0.9,
|
| 9 |
+
"subtalker_top_p": 1.0,
|
| 10 |
+
"subtalker_top_k": 50,
|
| 11 |
+
"max_new_tokens": 8192
|
| 12 |
+
}
|
Stewart/checkpoint-epoch-0/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Stewart/checkpoint-epoch-0/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e60852809dd36bc8e6bcde610fcba6e6af2e6b31085c9657d9898b7ba2f106ba
|
| 3 |
+
size 1811626544
|
Stewart/checkpoint-epoch-0/preprocessor_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"padding_side": "left",
|
| 3 |
+
"padding_value": 0.0,
|
| 4 |
+
"processor_class": "Qwen3TTSProcessor",
|
| 5 |
+
"return_attention_mask": true
|
| 6 |
+
}
|
Stewart/checkpoint-epoch-0/speech_tokenizer/config.json
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3TTSTokenizerV2Model"
|
| 4 |
+
],
|
| 5 |
+
"model_type": "qwen3_tts_tokenizer_12hz",
|
| 6 |
+
"encoder_valid_num_quantizers": 16,
|
| 7 |
+
"input_sample_rate": 24000,
|
| 8 |
+
"output_sample_rate": 24000,
|
| 9 |
+
"decode_upsample_rate": 1920,
|
| 10 |
+
"encode_downsample_rate": 1920,
|
| 11 |
+
"decoder_config": {
|
| 12 |
+
"attention_bias": false,
|
| 13 |
+
"attention_dropout": 0.0,
|
| 14 |
+
"latent_dim": 1024,
|
| 15 |
+
"codebook_dim": 512,
|
| 16 |
+
"codebook_size": 2048,
|
| 17 |
+
"decoder_dim": 1536,
|
| 18 |
+
"hidden_act": "silu",
|
| 19 |
+
"hidden_size": 512,
|
| 20 |
+
"intermediate_size": 1024,
|
| 21 |
+
"layer_scale_initial_scale": 0.01,
|
| 22 |
+
"max_position_embeddings": 8000,
|
| 23 |
+
"head_dim": 64,
|
| 24 |
+
"num_attention_heads": 16,
|
| 25 |
+
"num_hidden_layers": 8,
|
| 26 |
+
"num_key_value_heads": 16,
|
| 27 |
+
"num_quantizers": 16,
|
| 28 |
+
"num_semantic_quantizers": 1,
|
| 29 |
+
"rms_norm_eps": 1e-05,
|
| 30 |
+
"rope_theta": 10000,
|
| 31 |
+
"semantic_codebook_size": 4096,
|
| 32 |
+
"sliding_window": 72,
|
| 33 |
+
"upsample_rates": [
|
| 34 |
+
8,
|
| 35 |
+
5,
|
| 36 |
+
4,
|
| 37 |
+
3
|
| 38 |
+
],
|
| 39 |
+
"upsampling_ratios": [
|
| 40 |
+
2,
|
| 41 |
+
2
|
| 42 |
+
],
|
| 43 |
+
"vector_quantization_hidden_dimension": 512
|
| 44 |
+
},
|
| 45 |
+
"encoder_config": {
|
| 46 |
+
"_frame_rate": 12.5,
|
| 47 |
+
"attention_bias": false,
|
| 48 |
+
"attention_dropout": 0.0,
|
| 49 |
+
"audio_channels": 1,
|
| 50 |
+
"codebook_dim": 256,
|
| 51 |
+
"codebook_size": 2048,
|
| 52 |
+
"compress": 2,
|
| 53 |
+
"dilation_growth_rate": 2,
|
| 54 |
+
"dtype": "float32",
|
| 55 |
+
"head_dim": 64,
|
| 56 |
+
"hidden_act": "gelu",
|
| 57 |
+
"hidden_size": 512,
|
| 58 |
+
"initializer_range": 0.02,
|
| 59 |
+
"intermediate_size": 2048,
|
| 60 |
+
"kernel_size": 7,
|
| 61 |
+
"last_kernel_size": 3,
|
| 62 |
+
"layer_scale_initial_scale": 0.01,
|
| 63 |
+
"max_position_embeddings": 8000,
|
| 64 |
+
"norm_eps": 1e-05,
|
| 65 |
+
"normalize": false,
|
| 66 |
+
"num_attention_heads": 8,
|
| 67 |
+
"num_filters": 64,
|
| 68 |
+
"num_hidden_layers": 8,
|
| 69 |
+
"num_key_value_heads": 8,
|
| 70 |
+
"num_quantizers": 32,
|
| 71 |
+
"num_residual_layers": 1,
|
| 72 |
+
"num_semantic_quantizers": 1,
|
| 73 |
+
"pad_mode": "constant",
|
| 74 |
+
"residual_kernel_size": 3,
|
| 75 |
+
"rope_theta": 10000.0,
|
| 76 |
+
"sampling_rate": 24000,
|
| 77 |
+
"sliding_window": 250,
|
| 78 |
+
"transformers_version": "4.57.0.dev0",
|
| 79 |
+
"trim_right_ratio": 1.0,
|
| 80 |
+
"upsample_groups": 512,
|
| 81 |
+
"upsampling_ratios": [
|
| 82 |
+
8,
|
| 83 |
+
6,
|
| 84 |
+
5,
|
| 85 |
+
4
|
| 86 |
+
],
|
| 87 |
+
"use_cache": false,
|
| 88 |
+
"use_causal_conv": true,
|
| 89 |
+
"use_conv_shortcut": false,
|
| 90 |
+
"use_streaming": false,
|
| 91 |
+
"vector_quantization_hidden_dimension": 256
|
| 92 |
+
},
|
| 93 |
+
"transformers_version": "4.57.3"
|
| 94 |
+
}
|
Stewart/checkpoint-epoch-0/speech_tokenizer/configuration.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"framework": "pytorch", "task": "feature-extraction", "allow_remote": true}
|
Stewart/checkpoint-epoch-0/speech_tokenizer/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:836b7b357f5ea43e889936a3709af68dfe3751881acefe4ecf0dbd30ba571258
|
| 3 |
+
size 682293092
|
Stewart/checkpoint-epoch-0/speech_tokenizer/preprocessor_config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"chunk_length_s": null,
|
| 3 |
+
"feature_extractor_type": "EncodecFeatureExtractor",
|
| 4 |
+
"feature_size": 1,
|
| 5 |
+
"overlap": null,
|
| 6 |
+
"padding_side": "right",
|
| 7 |
+
"padding_value": 0.0,
|
| 8 |
+
"return_attention_mask": true,
|
| 9 |
+
"sampling_rate": 24000
|
| 10 |
+
}
|
Stewart/checkpoint-epoch-0/tokenizer_config.json
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
},
|
| 181 |
+
"151665": {
|
| 182 |
+
"content": "<tool_response>",
|
| 183 |
+
"lstrip": false,
|
| 184 |
+
"normalized": false,
|
| 185 |
+
"rstrip": false,
|
| 186 |
+
"single_word": false,
|
| 187 |
+
"special": false
|
| 188 |
+
},
|
| 189 |
+
"151666": {
|
| 190 |
+
"content": "</tool_response>",
|
| 191 |
+
"lstrip": false,
|
| 192 |
+
"normalized": false,
|
| 193 |
+
"rstrip": false,
|
| 194 |
+
"single_word": false,
|
| 195 |
+
"special": false
|
| 196 |
+
},
|
| 197 |
+
"151667": {
|
| 198 |
+
"content": "<think>",
|
| 199 |
+
"lstrip": false,
|
| 200 |
+
"normalized": false,
|
| 201 |
+
"rstrip": false,
|
| 202 |
+
"single_word": false,
|
| 203 |
+
"special": false
|
| 204 |
+
},
|
| 205 |
+
"151668": {
|
| 206 |
+
"content": "</think>",
|
| 207 |
+
"lstrip": false,
|
| 208 |
+
"normalized": false,
|
| 209 |
+
"rstrip": false,
|
| 210 |
+
"single_word": false,
|
| 211 |
+
"special": false
|
| 212 |
+
},
|
| 213 |
+
"151669": {
|
| 214 |
+
"content": "<|audio_start|>",
|
| 215 |
+
"lstrip": false,
|
| 216 |
+
"normalized": false,
|
| 217 |
+
"rstrip": false,
|
| 218 |
+
"single_word": false,
|
| 219 |
+
"special": true
|
| 220 |
+
},
|
| 221 |
+
"151670": {
|
| 222 |
+
"content": "<|audio_end|>",
|
| 223 |
+
"lstrip": false,
|
| 224 |
+
"normalized": false,
|
| 225 |
+
"rstrip": false,
|
| 226 |
+
"single_word": false,
|
| 227 |
+
"special": true
|
| 228 |
+
},
|
| 229 |
+
"151671": {
|
| 230 |
+
"content": "<tts_pad>",
|
| 231 |
+
"lstrip": false,
|
| 232 |
+
"normalized": false,
|
| 233 |
+
"rstrip": false,
|
| 234 |
+
"single_word": false,
|
| 235 |
+
"special": true
|
| 236 |
+
},
|
| 237 |
+
"151672": {
|
| 238 |
+
"content": "<tts_text_bos>",
|
| 239 |
+
"lstrip": false,
|
| 240 |
+
"normalized": false,
|
| 241 |
+
"rstrip": false,
|
| 242 |
+
"single_word": false,
|
| 243 |
+
"special": true
|
| 244 |
+
},
|
| 245 |
+
"151673": {
|
| 246 |
+
"content": "<tts_text_eod>",
|
| 247 |
+
"lstrip": false,
|
| 248 |
+
"normalized": false,
|
| 249 |
+
"rstrip": false,
|
| 250 |
+
"single_word": false,
|
| 251 |
+
"special": true
|
| 252 |
+
},
|
| 253 |
+
"151674": {
|
| 254 |
+
"content": "<tts_text_bos_single>",
|
| 255 |
+
"lstrip": false,
|
| 256 |
+
"normalized": false,
|
| 257 |
+
"rstrip": false,
|
| 258 |
+
"single_word": false,
|
| 259 |
+
"special": true
|
| 260 |
+
},
|
| 261 |
+
"151675": {
|
| 262 |
+
"content": "<|audio_pad|>",
|
| 263 |
+
"lstrip": false,
|
| 264 |
+
"normalized": false,
|
| 265 |
+
"rstrip": false,
|
| 266 |
+
"single_word": false,
|
| 267 |
+
"special": true
|
| 268 |
+
}
|
| 269 |
+
},
|
| 270 |
+
"additional_special_tokens": [
|
| 271 |
+
"<|im_start|>",
|
| 272 |
+
"<|im_end|>",
|
| 273 |
+
"<|object_ref_start|>",
|
| 274 |
+
"<|object_ref_end|>",
|
| 275 |
+
"<|box_start|>",
|
| 276 |
+
"<|box_end|>",
|
| 277 |
+
"<|quad_start|>",
|
| 278 |
+
"<|quad_end|>",
|
| 279 |
+
"<|vision_start|>",
|
| 280 |
+
"<|vision_end|>",
|
| 281 |
+
"<|vision_pad|>",
|
| 282 |
+
"<|image_pad|>",
|
| 283 |
+
"<|video_pad|>",
|
| 284 |
+
"<|audio_start|>",
|
| 285 |
+
"<|audio_end|>",
|
| 286 |
+
"<tts_pad>",
|
| 287 |
+
"<tts_text_bos>",
|
| 288 |
+
"<tts_text_bos_single>",
|
| 289 |
+
"<|audio_pad|>"
|
| 290 |
+
],
|
| 291 |
+
"extra_special_tokens": {
|
| 292 |
+
"image_token": "<|image_pad|>",
|
| 293 |
+
"audio_token": "<|audio_pad|>",
|
| 294 |
+
"video_token": "<|video_pad|>",
|
| 295 |
+
"vision_bos_token": "<|vision_start|>",
|
| 296 |
+
"vision_eos_token": "<|vision_end|>",
|
| 297 |
+
"audio_bos_token": "<|audio_start|>",
|
| 298 |
+
"audio_eos_token": "<|audio_end|>"
|
| 299 |
+
},
|
| 300 |
+
"bos_token": null,
|
| 301 |
+
"clean_up_tokenization_spaces": false,
|
| 302 |
+
"eos_token": "<|im_end|>",
|
| 303 |
+
"errors": "replace",
|
| 304 |
+
"model_max_length": 131072,
|
| 305 |
+
"pad_token": "<|endoftext|>",
|
| 306 |
+
"split_special_tokens": false,
|
| 307 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 308 |
+
"unk_token": null,
|
| 309 |
+
"image_token": "<|image_pad|>",
|
| 310 |
+
"audio_token": "<|audio_pad|>",
|
| 311 |
+
"video_token": "<|video_pad|>",
|
| 312 |
+
"vision_bos_token": "<|vision_start|>",
|
| 313 |
+
"vision_eos_token": "<|vision_end|>",
|
| 314 |
+
"audio_bos_token": "<|audio_start|>",
|
| 315 |
+
"audio_eos_token": "<|audio_end|>"
|
| 316 |
+
}
|
Stewart/checkpoint-epoch-0/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Stewart/training_config.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"speaker_name": "Patrick Stewart",
|
| 3 |
+
"init_model": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
|
| 4 |
+
"model_source": "HuggingFace",
|
| 5 |
+
"batch_size": 64,
|
| 6 |
+
"lr": "2e-06",
|
| 7 |
+
"epochs": 5,
|
| 8 |
+
"grad_acc": 1,
|
| 9 |
+
"use_experimental_speedup": true,
|
| 10 |
+
"resume_from_checkpoint": "latest"
|
| 11 |
+
}
|