Step-Audio (models)
This view is limited to 50 files because it contains too many changes.
- .gitattributes +9 -0
- models/Step-Audio-EditX-bnb-4bit/.gitattributes +35 -0
- models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/FLOW_VERSION +2 -0
- models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/campplus.onnx +3 -0
- models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/cosyvoice.yaml +72 -0
- models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/flow.pt +3 -0
- models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/hift.pt +3 -0
- models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/speech_tokenizer_v1.onnx +3 -0
- models/Step-Audio-EditX-bnb-4bit/README.md +3 -0
- models/Step-Audio-EditX-bnb-4bit/config.json +39 -0
- models/Step-Audio-EditX-bnb-4bit/configuration_step1.py +43 -0
- models/Step-Audio-EditX-bnb-4bit/generation_config.json +7 -0
- models/Step-Audio-EditX-bnb-4bit/model.safetensors +3 -0
- models/Step-Audio-EditX-bnb-4bit/modeling_step1.py +414 -0
- models/Step-Audio-EditX-bnb-4bit/quantization_config.json +10 -0
- models/Step-Audio-EditX-bnb-4bit/source.txt +1 -0
- models/Step-Audio-EditX-bnb-4bit/special_tokens_map.json +30 -0
- models/Step-Audio-EditX-bnb-4bit/tokenizer.json +0 -0
- models/Step-Audio-EditX-bnb-4bit/tokenizer.model +3 -0
- models/Step-Audio-EditX-bnb-4bit/tokenizer_config.json +0 -0
- models/Step-Audio-EditX/.gitattributes +35 -0
- models/Step-Audio-EditX/CosyVoice-300M-25Hz/FLOW_VERSION +2 -0
- models/Step-Audio-EditX/CosyVoice-300M-25Hz/campplus.onnx +3 -0
- models/Step-Audio-EditX/CosyVoice-300M-25Hz/cosyvoice.yaml +72 -0
- models/Step-Audio-EditX/CosyVoice-300M-25Hz/flow.pt +3 -0
- models/Step-Audio-EditX/CosyVoice-300M-25Hz/hift.pt +3 -0
- models/Step-Audio-EditX/CosyVoice-300M-25Hz/speech_tokenizer_v1.onnx +3 -0
- models/Step-Audio-EditX/README.md +144 -0
- models/Step-Audio-EditX/config.json +22 -0
- models/Step-Audio-EditX/configuration_step1.py +41 -0
- models/Step-Audio-EditX/model-00001.safetensors +3 -0
- models/Step-Audio-EditX/model.safetensors.index.json +1 -0
- models/Step-Audio-EditX/modeling_step1.py +414 -0
- models/Step-Audio-EditX/source.txt +1 -0
- models/Step-Audio-EditX/tokenizer.model +3 -0
- models/Step-Audio-EditX/tokenizer_config.json +14 -0
- models/Step-Audio-R1-NVFP4A16/.gitattributes +36 -0
- models/Step-Audio-R1-NVFP4A16/README.md +216 -0
- models/Step-Audio-R1-NVFP4A16/added_tokens.json +0 -0
- models/Step-Audio-R1-NVFP4A16/chat_template.jinja +74 -0
- models/Step-Audio-R1-NVFP4A16/config.json +348 -0
- models/Step-Audio-R1-NVFP4A16/configuration_step_audio_2.py +128 -0
- models/Step-Audio-R1-NVFP4A16/generation_config.json +10 -0
- models/Step-Audio-R1-NVFP4A16/merges.txt +0 -0
- models/Step-Audio-R1-NVFP4A16/model-00001-of-00005.safetensors +3 -0
- models/Step-Audio-R1-NVFP4A16/model-00002-of-00005.safetensors +3 -0
- models/Step-Audio-R1-NVFP4A16/model-00003-of-00005.safetensors +3 -0
- models/Step-Audio-R1-NVFP4A16/model-00004-of-00005.safetensors +3 -0
- models/Step-Audio-R1-NVFP4A16/model-00005-of-00005.safetensors +3 -0
- models/Step-Audio-R1-NVFP4A16/model.safetensors.index.json +0 -0
.gitattributes
CHANGED
@@ -55,3 +55,12 @@ models/Step-Audio-TTS-3B/lib/liboptimus_ths-torch2.5-cu124.cpython-310-x86_64-li
 models/Step-Audio-Chat/lib/liboptimus_ths-torch2.2-cu121.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 models/Step-Audio-Chat/lib/liboptimus_ths-torch2.3-cu121.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 models/Step-Audio-Chat/lib/liboptimus_ths-torch2.5-cu124.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+models/Step-Audio-R1-NVFP4A16/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/StepAudio2mini_BPFT/checkpoint-12000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/StepAudio2mini_BPFT/checkpoint-15000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/StepAudio2mini_BPFT/checkpoint-18000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/StepAudio2mini_BPFT/checkpoint-18850/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/StepAudio2mini_BPFT/checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/StepAudio2mini_BPFT/checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/StepAudio2mini_BPFT/checkpoint-9000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/StepAudio2mini_BPFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
models/Step-Audio-EditX-bnb-4bit/.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/FLOW_VERSION
ADDED
@@ -0,0 +1,2 @@
+/mnt/wby-jfs/models/train/flow_matching/flow_v2_1node_vq0206_dit_v8_fullattn_exp0227_sft_exp0408_stepaudio_sft_exp0616/model_epoch_5_whole.pt
+fae53942e60310eb172b170396202069
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/campplus.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
+size 28303423
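The three lines above are a Git LFS pointer, not the model itself: the roughly 27 MB `campplus.onnx` blob lives in LFS storage and is materialized by `git lfs pull`. As a hedged aside (not part of this repository), a minimal Python sketch for checking a downloaded blob against the `oid sha256` recorded in such a pointer; the local path is illustrative:

```python
# Minimal sketch: verify a downloaded LFS object against the sha256 oid in its pointer file.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file in chunks so multi-GB checkpoints need not fit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

# oid copied from the pointer above; the path assumes a local checkout of this repo
expected = "a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73"
assert sha256_of("models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/campplus.onnx") == expected
```

The same check applies to every other pointer file in this commit (flow.pt, hift.pt, the safetensors shards, and so on).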
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/cosyvoice.yaml
ADDED
@@ -0,0 +1,72 @@
+mel_conf:
+  num_mels: 80
+  n_fft: 1920
+  hop_size: 480
+  win_size: 1920
+  sampling_rate: 24000
+  fmin: 0
+  fmax: 8000
+
+
+flow: !new:stepvocoder.cosyvoice2.flow.flow.CausalMaskedDiffWithXvec
+  input_size: 512
+  output_size: 80
+  spk_embed_dim: 192
+  output_type: 'mel'
+  vocab_size: 5121 # 1024(vq02) + 4096(vq06) + 1(vq02-pad)
+  input_embedding: !new:stepvocoder.cosyvoice2.embedding.dual_codebook.DualCodebookEmbedding
+    vocab_size: 5121 # 1024(vq02) + 4096(vq06) + 1(vq02-pad)
+    input_size: 512
+  encoder: !new:stepvocoder.cosyvoice2.transformer.upsample_encoder_v2.UpsampleConformerEncoderV2
+    input_size: 512
+    output_size: 512
+    input_layer: 'linear'
+    pre_lookahead_len: 3
+    num_blocks: 6
+    num_up_blocks: 4
+    up_stride: 2
+    up_scale_factor: 2
+    attention_heads: 8
+    pos_enc_layer_type: 'rel_pos_espnet'
+    selfattention_layer_type: 'rel_selfattn'
+    key_bias: true
+    linear_units: 2048
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    normalize_before: True
+  decoder: !new:stepvocoder.cosyvoice2.flow.flow_matching.CausalConditionalCFM
+    inference_cfg_rate: 0.7
+    estimator: !new:stepvocoder.cosyvoice2.flow.decoder_dit.DiT
+      in_channels: 320
+      out_channels: 80
+      mlp_ratio: 4.0
+      depth: 16
+      num_heads: 8
+      head_dim: 64
+      hidden_size: 512
+
+
+hift: !new:stepvocoder.cosyvoice2.hifigan.generator.HiFTGenerator
+  in_channels: 80
+  base_channels: 512
+  nb_harmonics: 8
+  sampling_rate: 24000
+  nsf_alpha: 0.1
+  nsf_sigma: 0.003
+  nsf_voiced_threshold: 10
+  upsample_rates: [8, 5, 3]
+  upsample_kernel_sizes: [16, 11, 7]
+  istft_params:
+    n_fft: 16
+    hop_len: 4
+  resblock_kernel_sizes: [3, 7, 11]
+  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+  source_resblock_kernel_sizes: [7, 7, 11]
+  source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+  lrelu_slope: 0.1
+  audio_limit: 0.99
+  f0_predictor: !new:stepvocoder.cosyvoice2.hifigan.f0_predictor.ConvRNNF0Predictor
+    num_class: 1
+    in_channels: 80
+    cond_channels: 512
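The `!new:` tags in this vocoder config follow the HyperPyYAML convention: loading the file instantiates the referenced `stepvocoder` classes (flow-matching decoder, HiFT generator, F0 predictor) with the listed arguments. A hedged sketch of that loading pattern, assuming the `hyperpyyaml` package is what resolves these tags (an inference from the tag syntax, not something this diff states):

```python
# Sketch: loading a HyperPyYAML-style config whose !new: tags build objects at load time.
# Assumes the hyperpyyaml package and an importable `stepvocoder` package on PYTHONPATH.
from hyperpyyaml import load_hyperpyyaml

with open("models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/cosyvoice.yaml") as f:
    cfg = load_hyperpyyaml(f)

flow = cfg["flow"]          # CausalMaskedDiffWithXvec: speech tokens -> 80-bin mel
hift = cfg["hift"]          # HiFTGenerator vocoder: mel -> 24 kHz waveform
mel_conf = cfg["mel_conf"]  # plain dict of mel/STFT settings
```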
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/flow.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37f18fcb9c374bb8d8ae229e2f7618b6effaa208609bd0407fc661234125531c
+size 615269316
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/hift.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3386cc880324d4e98e05987b99107f49e40ed925b8ecc87c1f4939432d429879
+size 83390254
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/speech_tokenizer_v1.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56285ddd4a83e883ee0cb9f8d69c1089b53a94b1f78ff7e4a0224a27eb4cb486
+size 522625011
models/Step-Audio-EditX-bnb-4bit/README.md
ADDED
@@ -0,0 +1,3 @@
+---
+license: apache-2.0
+---
models/Step-Audio-EditX-bnb-4bit/config.json
ADDED
@@ -0,0 +1,39 @@
+{
+  "architectures": [
+    "Step1ForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_step1.Step1Config",
+    "AutoModelForCausalLM": "modeling_step1.Step1ForCausalLM"
+  },
+  "bos_token_id": 1,
+  "eos_token_id": 3,
+  "hidden_size": 3072,
+  "intermediate_size": 8192,
+  "max_seq_len": 32768,
+  "model_type": "step1",
+  "num_attention_groups": 4,
+  "num_attention_heads": 48,
+  "num_hidden_layers": 32,
+  "pad_token_id": 0,
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "float16",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-05,
+  "torch_dtype": "float16",
+  "transformers_version": "4.53.3",
+  "use_cache": true,
+  "vocab_size": 74752
+}
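Because `quant_method` is `bitsandbytes` and `auto_map` points at the bundled `configuration_step1.py` / `modeling_step1.py`, this checkpoint is meant to be loaded with remote code enabled. A hedged sketch (the local path is illustrative; it assumes `bitsandbytes`, `einops`, and a CUDA-capable GPU are available, since the modeling code imports `einops` and 4-bit weights run on GPU):

```python
# Sketch: loading the pre-quantized 4-bit checkpoint with transformers.
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "models/Step-Audio-EditX-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path,
    trust_remote_code=True,   # resolves auto_map to the bundled Step1 classes
    device_map="auto",        # quantization_config embedded above is applied automatically
)
```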
models/Step-Audio-EditX-bnb-4bit/configuration_step1.py
ADDED
@@ -0,0 +1,43 @@
+from typing import Optional, List, Any, Dict
+from transformers.configuration_utils import PretrainedConfig
+
+
+
+class Step1Config(PretrainedConfig):
+    model_type = "step1"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        hidden_size: int = 5120,
+        intermediate_size: int = 13312,
+        num_attention_heads: int = 40,
+        num_attention_groups: int = 8,
+        num_hidden_layers: int = 48,
+        max_seq_len: int = 4096,
+        vocab_size: int = 65536,
+        rms_norm_eps: float = 1e-5,
+        bos_token_id: int = 1,
+        eos_token_id: int = 3,
+        pad_token_id: int = 0,
+        use_cache: bool = True,
+        **kwargs,
+    ) -> None:
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        super().__init__(
+            bos_token_id=bos_token_id,
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs
+        )
+
+
+__all__ = ["Step1Config"]
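Note that the defaults in `Step1Config.__init__` describe a larger model than this checkpoint; the values in `config.json` above override them at load time. A small sketch with the checkpoint's numbers (the import assumes the file is on the current path):

```python
# Sketch: Step1Config built with the values from config.json above.
from configuration_step1 import Step1Config

cfg = Step1Config(
    hidden_size=3072,
    intermediate_size=8192,
    num_attention_heads=48,
    num_attention_groups=4,   # grouped-query attention: 4 KV groups shared by 48 query heads
    num_hidden_layers=32,
    max_seq_len=32768,
    vocab_size=74752,
)
assert cfg.hidden_size // cfg.num_attention_heads == 64  # per-head dimension
```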
models/Step-Audio-EditX-bnb-4bit/generation_config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 3,
+  "pad_token_id": 0,
+  "transformers_version": "4.53.3"
+}
models/Step-Audio-EditX-bnb-4bit/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:535f2dd870385812ce2ef8170634c614f62a1b260d7f0f0028eb19196f4af688
+size 2503238347
models/Step-Audio-EditX-bnb-4bit/modeling_step1.py
ADDED
@@ -0,0 +1,414 @@
+import math
+from typing import Optional, Tuple, Union, List
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from transformers.generation import GenerationMixin
+
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from .configuration_step1 import Step1Config
+from transformers.cache_utils import Cache, DynamicCache
+from einops import rearrange
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+
+logger = logging.get_logger(__name__)
+
+
+def build_alibi_cache(block_size, n_heads, dtype, device):
+    # get slopes
+    n = 2 ** math.floor(math.log2(n_heads))  # nearest 2**n to n_heads
+    m0 = 2.0 ** (-8.0 / n)
+    # 2^(-8/n), 2^(-8*2/n), 2^(-8*3/n), ...
+    slopes = torch.pow(m0, torch.arange(1, n + 1))
+    if n < n_heads:
+        m1 = 2.0 ** (-4.0 / n)
+        # 2^(-8/(2n)), 2^(-8*3/(2n)), 2^(-8*5/(2n)), ...
+        mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2))
+        slopes = torch.cat([slopes, mm])
+    slopes = slopes.to(device)
+
+    tril = torch.tril(torch.ones(1, 1, block_size, block_size, device=device))
+
+    bias_rows = torch.arange(block_size, device=device).view(1, -1)
+    bias_cols = torch.arange(block_size, device=device).view(-1, 1)
+    bias = -torch.sqrt(bias_cols - bias_rows)
+    bias = bias.view(1, block_size, block_size) * slopes.view(-1, 1, 1)
+    bias = bias.masked_fill(tril == 0, float("-inf"))
+
+    return bias.type(dtype)
+
+
+class StepRMSNorm(torch.nn.Module):
+    def __init__(self, hidden_size, eps=1e-5):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor):
+        var = x.float().pow(2).mean(-1, keepdim=True)
+        x = x * torch.rsqrt(var + self.eps).to(x.dtype)
+        x = x * self.weight
+        return x
+
+
+class StepAttention(torch.nn.Module):
+    def __init__(self, hidden_size, num_heads, num_groups, layer_idx: int):
+        super().__init__()
+
+        self.num_heads = num_heads
+        self.num_groups = num_groups
+        self.hidden_size = hidden_size
+        self.head_dim = hidden_size // num_heads
+
+        self.q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False)
+        self.k_proj = torch.nn.Linear(
+            hidden_size, num_groups * self.head_dim, bias=False
+        )
+        self.v_proj = torch.nn.Linear(
+            hidden_size, num_groups * self.head_dim, bias=False
+        )
+        self.o_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False)
+
+        self.layer_idx = layer_idx
+
+    def flash_attn_func(self, q, k, v, dropout_p=0.0, softmax_scale=None, causal=True,
+                        return_attn_probs=False, tp_group_rank=0, tp_group_size=1):
+        softmax_scale = q.size(-1) ** (-0.5) if softmax_scale is None else softmax_scale
+        return torch.ops.Optimus.fwd(q, k, v, None, dropout_p, softmax_scale, causal, return_attn_probs, None, tp_group_rank, tp_group_size)[0]
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        past_key_value: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ):
+
+        q: torch.Tensor = self.q_proj(x)
+        k: torch.Tensor = self.k_proj(x)
+        v: torch.Tensor = self.v_proj(x)
+        if past_key_value is not None:
+            cache_kwargs = {"cache_position": cache_position}
+            k, v = past_key_value.update(k, v, self.layer_idx, cache_kwargs)
+
+        q = rearrange(q, "b s (h d) -> b s h d", h=self.num_heads)
+        k = rearrange(k, "b s (g d) -> b s g d", g=self.num_groups)
+        v = rearrange(v, "b s (g d) -> b s g d", g=self.num_groups)
+
+        try:
+            if self.head_dim not in (64, 128):
+                raise ValueError("head_dim must be 64 or 128")
+            attn_output = self.flash_attn_func(q, k, v)
+            attn_output = attn_output.flatten(-2, -1)
+        except:
+            k = k.repeat_interleave(self.num_heads // self.num_groups, dim=-2)
+            v = v.repeat_interleave(self.num_heads // self.num_groups, dim=-2)
+
+            attention_mask = build_alibi_cache(
+                k.size(1), self.num_heads, dtype=q.dtype, device=q.device
+            )[:, :, -q.size(1) :, :].contiguous()
+
+            q = q.transpose(1, 2)
+            k = k.transpose(1, 2)
+            v = v.transpose(1, 2)
+
+            attn_output: torch.Tensor = torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, attn_mask=attention_mask
+            )
+
+            attn_output = attn_output.transpose(1, 2).flatten(-2, -1)
+
+        out = self.o_proj(attn_output)
+        return out, None  # attn weights are not returned
+
+
+class StepMLP(torch.nn.Module):
+    def __init__(self, hidden_size, intermediate_size):
+        super().__init__()
+        self.gate_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.up_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False)
+
+    def forward(self, x):
+        gate = self.gate_proj(x)
+        up = self.up_proj(x)
+        x = torch.nn.functional.silu(gate) * up
+        x = self.down_proj(x)
+        return x
+
+
+class StepLayer(torch.nn.Module):
+    def __init__(self, config: Step1Config, layer_idx: int):
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.self_attn = StepAttention(
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_groups=config.num_attention_groups,
+            layer_idx=layer_idx,
+        )
+        self.mlp = StepMLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+        )
+        self.input_layernorm = StepRMSNorm(
+            hidden_size=config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = StepRMSNorm(
+            hidden_size=config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ):
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states, self_attn_weights = self.self_attn(hidden_states, past_key_value, attention_mask, cache_position)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states, )
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        return outputs
+
+
+class StepPreTrainedModel(PreTrainedModel):
+    config_class = Step1Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["StepLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_cache_class = True
+    _supports_static_cache = True
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+class Step1Model(StepPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+
+    Args:
+        config: Step1Config
+    """
+
+    def __init__(self, config: Step1Config):
+        super().__init__(config)
+        self.config = config
+        self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size)
+
+        self.layers = torch.nn.Sequential(
+            *[
+                StepLayer(config, layer_idx)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+
+        self.norm = StepRMSNorm(
+            hidden_size=config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache()
+
+        if cache_position is None:
+            past_seen_tokens = (
+                past_key_values.get_seq_length() if past_key_values is not None else 0
+            )
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
+            )
+
+        causal_mask = attention_mask
+
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                past_key_value=past_key_values,
+                cache_position=cache_position,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        output = BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=None,
+        )
+        return output if return_dict else output.to_tuple()
+
+
+class Step1ForCausalLM(StepPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Step1Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                vocab_size=self.config.vocab_size,
+            )
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
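`StepAttention.forward` above first tries the fused `torch.ops.Optimus.fwd` flash-attention kernel and falls back to `scaled_dot_product_attention` with an ALiBi-style bias from `build_alibi_cache` when that op is unavailable. A hedged sketch of the slope schedule that function produces for this model's 48 heads (48 is not a power of two, so 32 base slopes are extended with 16 interpolated ones, mirroring the code above):

```python
# Sketch: ALiBi slopes for 48 heads, computed exactly as in build_alibi_cache.
import math
import torch

n_heads = 48
n = 2 ** math.floor(math.log2(n_heads))                        # 32, nearest power of two
slopes = torch.pow(2.0 ** (-8.0 / n), torch.arange(1, n + 1))  # base slopes
extra = torch.pow(2.0 ** (-4.0 / n), torch.arange(1, 1 + 2 * (n_heads - n), 2))
slopes = torch.cat([slopes, extra])

print(slopes.shape)  # torch.Size([48]): one distance penalty per attention head
```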
models/Step-Audio-EditX-bnb-4bit/quantization_config.json
ADDED
@@ -0,0 +1,10 @@
+{
+  "bits": 4,
+  "compute_dtype": "float16",
+  "quantization_method": "bitsandbytes",
+  "load_in_4bit": true,
+  "load_in_8bit": false,
+  "bnb_4bit_compute_dtype": "float16",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_type": "nf4"
+}
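This standalone file repeats the 4-bit settings already embedded in `config.json`. For reference, a hedged sketch of the same settings expressed as a `transformers` `BitsAndBytesConfig`, which is only needed when re-quantizing a full-precision copy yourself rather than loading this pre-quantized checkpoint:

```python
# Sketch: the quantization settings above as a BitsAndBytesConfig.
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
```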
models/Step-Audio-EditX-bnb-4bit/source.txt
ADDED
@@ -0,0 +1 @@
+https://huggingface.co/SUP3RMASS1VE/Step-Audio-EditX-bnb-4bit
models/Step-Audio-EditX-bnb-4bit/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
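A short hedged check that the map above is what `AutoTokenizer` exposes once the repo is downloaded (path illustrative):

```python
# Sketch: the special tokens above as seen through AutoTokenizer.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("models/Step-Audio-EditX-bnb-4bit", trust_remote_code=True)
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)  # <s> </s> <unk> <unk>
```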
models/Step-Audio-EditX-bnb-4bit/tokenizer.json
ADDED
The diff for this file is too large to render.
models/Step-Audio-EditX-bnb-4bit/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25e122d9205d035033a9994c4d46a6a1b467a938654e4178fc0e5f4f5d610674
+size 1264044
models/Step-Audio-EditX-bnb-4bit/tokenizer_config.json
ADDED
The diff for this file is too large to render.
models/Step-Audio-EditX/.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
models/Step-Audio-EditX/CosyVoice-300M-25Hz/FLOW_VERSION
ADDED
@@ -0,0 +1,2 @@
+/mnt/wby-jfs/models/train/flow_matching/flow_v2_1node_vq0206_dit_v8_fullattn_exp0227_sft_exp0408_stepaudio_sft_exp0616/model_epoch_5_whole.pt
+fae53942e60310eb172b170396202069
models/Step-Audio-EditX/CosyVoice-300M-25Hz/campplus.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
+size 28303423
models/Step-Audio-EditX/CosyVoice-300M-25Hz/cosyvoice.yaml
ADDED
@@ -0,0 +1,72 @@
+mel_conf:
+  num_mels: 80
+  n_fft: 1920
+  hop_size: 480
+  win_size: 1920
+  sampling_rate: 24000
+  fmin: 0
+  fmax: 8000
+
+
+flow: !new:stepvocoder.cosyvoice2.flow.flow.CausalMaskedDiffWithXvec
+  input_size: 512
+  output_size: 80
+  spk_embed_dim: 192
+  output_type: 'mel'
+  vocab_size: 5121 # 1024(vq02) + 4096(vq06) + 1(vq02-pad)
+  input_embedding: !new:stepvocoder.cosyvoice2.embedding.dual_codebook.DualCodebookEmbedding
+    vocab_size: 5121 # 1024(vq02) + 4096(vq06) + 1(vq02-pad)
+    input_size: 512
+  encoder: !new:stepvocoder.cosyvoice2.transformer.upsample_encoder_v2.UpsampleConformerEncoderV2
+    input_size: 512
+    output_size: 512
+    input_layer: 'linear'
+    pre_lookahead_len: 3
+    num_blocks: 6
+    num_up_blocks: 4
+    up_stride: 2
+    up_scale_factor: 2
+    attention_heads: 8
+    pos_enc_layer_type: 'rel_pos_espnet'
+    selfattention_layer_type: 'rel_selfattn'
+    key_bias: true
+    linear_units: 2048
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    normalize_before: True
+  decoder: !new:stepvocoder.cosyvoice2.flow.flow_matching.CausalConditionalCFM
+    inference_cfg_rate: 0.7
+    estimator: !new:stepvocoder.cosyvoice2.flow.decoder_dit.DiT
+      in_channels: 320
+      out_channels: 80
+      mlp_ratio: 4.0
+      depth: 16
+      num_heads: 8
+      head_dim: 64
+      hidden_size: 512
+
+
+hift: !new:stepvocoder.cosyvoice2.hifigan.generator.HiFTGenerator
+  in_channels: 80
+  base_channels: 512
+  nb_harmonics: 8
+  sampling_rate: 24000
+  nsf_alpha: 0.1
+  nsf_sigma: 0.003
+  nsf_voiced_threshold: 10
+  upsample_rates: [8, 5, 3]
+  upsample_kernel_sizes: [16, 11, 7]
+  istft_params:
+    n_fft: 16
+    hop_len: 4
+  resblock_kernel_sizes: [3, 7, 11]
+  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+  source_resblock_kernel_sizes: [7, 7, 11]
+  source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+  lrelu_slope: 0.1
+  audio_limit: 0.99
+  f0_predictor: !new:stepvocoder.cosyvoice2.hifigan.f0_predictor.ConvRNNF0Predictor
+    num_class: 1
+    in_channels: 80
+    cond_channels: 512
models/Step-Audio-EditX/CosyVoice-300M-25Hz/flow.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37f18fcb9c374bb8d8ae229e2f7618b6effaa208609bd0407fc661234125531c
+size 615269316
models/Step-Audio-EditX/CosyVoice-300M-25Hz/hift.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3386cc880324d4e98e05987b99107f49e40ed925b8ecc87c1f4939432d429879
+size 83390254
models/Step-Audio-EditX/CosyVoice-300M-25Hz/speech_tokenizer_v1.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56285ddd4a83e883ee0cb9f8d69c1089b53a94b1f78ff7e4a0224a27eb4cb486
+size 522625011
models/Step-Audio-EditX/README.md
ADDED
@@ -0,0 +1,144 @@
+---
+license: apache-2.0
+pipeline_tag: text-to-speech
+library_name: transformers
+---
+## Step-Audio-EditX
+
+✨ [Demo Page](https://stepaudiollm.github.io/step-audio-editx/)
+| 🌟 [GitHub](https://github.com/stepfun-ai/Step-Audio-EditX)
+| 📑 [Paper](https://arxiv.org/abs/2511.03601)
+
+Check our open-source repository https://github.com/stepfun-ai/Step-Audio-EditX for more details!
+
+## 🔥🔥🔥 News!!!
+* Nov 28, 2025: 🚀 New Model Release: Now supporting **`Japanese`** and **`Korean`** languages.
+* Nov 23, 2025: 📊 [Step-Audio-Edit-Benchmark](https://github.com/stepfun-ai/Step-Audio-Edit-Benchmark) Released!
+* Nov 19, 2025: ⚙️ We release a **new version** of our model, which **supports polyphonic pronunciation control** and improves the performance of emotion, speaking style, and paralinguistic editing.
+
+We are open-sourcing **Step-Audio-EditX**, a powerful **3B-parameter** LLM-based audio model specialized in expressive and **iterative audio editing**.
+It excels at **editing emotion**, **speaking style**, and **paralinguistics**, and also features robust **zero-shot text-to-speech (TTS)** capabilities.
+
+## Features
+- **Zero-Shot TTS**
+  - Excellent zero-shot TTS cloning for `Mandarin`, `English`, `Sichuanese`, `Cantonese`, `Japanese` and `Korean`.
+  - To use a dialect, just add a **`[Sichuanese]`**, **`[Cantonese]`**, **`[Japanese]`**, or **`[Korean]`** tag before your text.
+
+- **Emotion and Speaking Style Editing**
+  - Remarkably effective iterative control over emotions and styles, supporting **dozens** of options for editing.
+  - Emotion Editing: [ *Angry*, *Happy*, *Sad*, *Excited*, *Fearful*, *Surprised*, *Disgusted*, etc. ]
+  - Speaking Style Editing: [ *Act_coy*, *Older*, *Child*, *Whisper*, *Serious*, *Generous*, *Exaggerated*, etc. ]
+  - Editing with more emotions and more speaking styles is on the way. **Get Ready!** 🚀
+
+- **Paralinguistic Editing**:
+  - Precise control over 10 types of paralinguistic features for more natural, human-like, and expressive synthetic audio.
+  - Supporting Tags:
+    - [ *Breathing*, *Laughter*, *Suprise-oh*, *Confirmation-en*, *Uhm*, *Suprise-ah*, *Suprise-wa*, *Sigh*, *Question-ei*, *Dissatisfaction-hnn* ]
+
+For more examples, see the [demo page](https://stepaudiollm.github.io/step-audio-editx/).
+
+## Model Usage
+### 📜 Requirements
+The following table shows the requirements for running the Step-Audio-EditX model:
+
+| Model | Parameters | Setting<br/>(sample frequency) | Optimal GPU Memory |
+|------------|------------|--------------------------------|----------------|
+| Step-Audio-EditX | 3B | 41.6Hz | 32 GB |
+
+* An NVIDIA GPU with CUDA support is required.
+* The model is tested on a single L40S GPU.
+* Tested operating system: Linux
+
+### 🔧 Dependencies and Installation
+- Python >= 3.10.0 (recommended: [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html))
+- [PyTorch >= 2.4.1-cu121](https://pytorch.org/)
+- [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads)
+
+```bash
+git clone https://github.com/stepfun-ai/Step-Audio-EditX.git
+conda create -n stepaudioedit python=3.10
+conda activate stepaudioedit
+
+cd Step-Audio-EditX
+pip install -r requirements.txt
+
+git lfs install
+git clone https://huggingface.co/stepfun-ai/Step-Audio-Tokenizer
+git clone https://huggingface.co/stepfun-ai/Step-Audio-EditX
+
+```
+
+After downloading the models, where_you_download_dir should have the following structure:
+```
+where_you_download_dir
+├── Step-Audio-Tokenizer
+├── Step-Audio-EditX
+```
+
+#### Run with Docker
+
+You can set up the environment required for running Step-Audio-EditX using the provided Dockerfile.
+
+```bash
+# build docker
+docker build . -t step-audio-editx
+
+# run docker
+docker run --rm --gpus all \
+    -v /your/code/path:/app \
+    -v /your/model/path:/model \
+    -p 7860:7860 \
+    step-audio-editx
+```
+
+
+#### Launch Web Demo
+Start a local server for online inference.
+Assume you have one GPU with at least 32 GB of memory available and have already downloaded all the models.
+
+```bash
+# Step-Audio-EditX demo
+python app.py --model-path where_you_download_dir --model-source local
+```
+
+#### Local Inference Demo
+> [!TIP]
+> For optimal performance, keep audio under 30 seconds per inference.
+
+```bash
+# zero-shot cloning
+python3 tts_infer.py \
+    --model-path where_you_download_dir \
+    --output-dir ./output \
+    --prompt-text "your prompt text" \
+    --prompt-audio your_prompt_audio_path \
+    --generated-text "your target text" \
+    --edit-type "clone"
+
+# edit (for paralinguistic editing, you need to specify the generated text)
+python3 tts_infer.py \
+    --model-path where_you_download_dir \
+    --output-dir ./output \
+    --prompt-text "your prompt text" \
+    --prompt-audio your_prompt_audio_path \
+    --generated-text "" \
+    --edit-type "emotion" \
+    --edit-info "sad" \
+    --n-edit-iter 2
+```
+
+
+## Citation
+
+```
+@misc{yan2025stepaudioeditxtechnicalreport,
+      title={Step-Audio-EditX Technical Report},
+      author={Chao Yan and Boyong Wu and Peng Yang and Pengfei Tan and Guoqiang Hu and Yuxin Zhang and Xiangyu and Zhang and Fei Tian and Xuerui Yang and Xiangyu Zhang and Daxin Jiang and Gang Yu},
+      year={2025},
+      eprint={2511.03601},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2511.03601},
+}
+
+```
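The README above fetches the two checkpoints with `git lfs` clones. As an alternative hedged sketch (not the documented workflow), the same directory layout can be produced with `huggingface_hub`:

```python
# Sketch: download the two repos the README expects under where_you_download_dir.
from huggingface_hub import snapshot_download

for repo_id in ("stepfun-ai/Step-Audio-Tokenizer", "stepfun-ai/Step-Audio-EditX"):
    snapshot_download(
        repo_id=repo_id,
        local_dir=f"where_you_download_dir/{repo_id.split('/')[-1]}",
    )
```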
models/Step-Audio-EditX/config.json
ADDED
@@ -0,0 +1,22 @@
+{
+  "architectures": [
+    "Step1ForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_step1.Step1Config",
+    "AutoModelForCausalLM": "modeling_step1.Step1ForCausalLM"
+  },
+  "model_type": "step1",
+  "bos_token_id": 1,
+  "pad_token_id": 0,
+  "eos_token_id": 3,
+  "hidden_size": 3072,
+  "intermediate_size": 8192,
+  "num_attention_heads": 48,
+  "num_attention_groups": 4,
+  "num_hidden_layers": 32,
+  "max_seq_len": 32768,
+  "vocab_size": 74752,
+  "rms_norm_eps": 1e-05,
+  "torch_dtype": "bfloat16"
+}
models/Step-Audio-EditX/configuration_step1.py
ADDED
@@ -0,0 +1,41 @@
+from typing import Optional, List, Any, Dict
+from transformers.configuration_utils import PretrainedConfig
+
+
+
+class Step1Config(PretrainedConfig):
+    model_type = "step1"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        hidden_size: int = 5120,
+        intermediate_size: int = 13312,
+        num_attention_heads: int = 40,
+        num_attention_groups: int = 8,
+        num_hidden_layers: int = 48,
+        max_seq_len: int = 4096,
+        vocab_size: int = 65536,
+        rms_norm_eps: float = 1e-5,
+        bos_token_id: int = 1,
+        eos_token_id: int = 3,
+        pad_token_id: int = 0,
+        **kwargs,
+    ) -> None:
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        super().__init__(
+            bos_token_id=bos_token_id,
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs
+        )
+
+
+__all__ = ["Step1Config"]
models/Step-Audio-EditX/model-00001.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b5e7e066a5efbad022a77d688f5c35031448c4edfb9b101e84624a2c593ce75
+size 7059446656
models/Step-Audio-EditX/model.safetensors.index.json
ADDED
@@ -0,0 +1 @@
{"metadata": {"total_size": 7059412992}, "weight_map": {"model.embed_tokens.weight": "model-00001.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.0.input_layernorm.weight": "model-00001.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.1.input_layernorm.weight": "model-00001.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.2.input_layernorm.weight": "model-00001.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.3.input_layernorm.weight": "model-00001.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.3.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.4.input_layernorm.weight": "model-00001.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.5.input_layernorm.weight": "model-00001.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00001.safetensors", 
"model.layers.5.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.6.input_layernorm.weight": "model-00001.safetensors", "model.layers.6.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.6.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.6.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.7.input_layernorm.weight": "model-00001.safetensors", "model.layers.7.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.7.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.7.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.8.input_layernorm.weight": "model-00001.safetensors", "model.layers.8.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.8.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.8.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.9.input_layernorm.weight": "model-00001.safetensors", "model.layers.9.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.9.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.9.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.10.input_layernorm.weight": "model-00001.safetensors", "model.layers.10.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.10.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.10.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.11.input_layernorm.weight": "model-00001.safetensors", "model.layers.11.mlp.down_proj.weight": 
"model-00001.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.11.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.11.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.12.input_layernorm.weight": "model-00001.safetensors", "model.layers.12.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.12.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.12.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.13.input_layernorm.weight": "model-00001.safetensors", "model.layers.13.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.13.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.13.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.14.input_layernorm.weight": "model-00001.safetensors", "model.layers.14.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.14.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.14.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.15.input_layernorm.weight": "model-00001.safetensors", "model.layers.15.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.15.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.15.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.16.input_layernorm.weight": "model-00001.safetensors", "model.layers.16.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.16.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.16.mlp.up_proj.weight": 
"model-00001.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.17.input_layernorm.weight": "model-00001.safetensors", "model.layers.17.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.17.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.17.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.18.input_layernorm.weight": "model-00001.safetensors", "model.layers.18.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.18.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.18.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.19.input_layernorm.weight": "model-00001.safetensors", "model.layers.19.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.19.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.19.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.20.input_layernorm.weight": "model-00001.safetensors", "model.layers.20.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.20.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.20.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.20.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.21.input_layernorm.weight": "model-00001.safetensors", "model.layers.21.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.21.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.21.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.22.input_layernorm.weight": "model-00001.safetensors", "model.layers.22.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.22.self_attn.k_proj.weight": 
"model-00001.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.22.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.22.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.23.input_layernorm.weight": "model-00001.safetensors", "model.layers.23.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.23.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.23.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.24.input_layernorm.weight": "model-00001.safetensors", "model.layers.24.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.24.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.24.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.25.input_layernorm.weight": "model-00001.safetensors", "model.layers.25.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.25.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.25.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.26.input_layernorm.weight": "model-00001.safetensors", "model.layers.26.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.26.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.26.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.27.input_layernorm.weight": "model-00001.safetensors", "model.layers.27.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.27.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.27.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.28.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.28.input_layernorm.weight": "model-00001.safetensors", "model.layers.28.mlp.down_proj.weight": 
"model-00001.safetensors", "model.layers.28.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.28.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.28.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.28.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.28.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.28.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.29.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.29.input_layernorm.weight": "model-00001.safetensors", "model.layers.29.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.29.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.29.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.29.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.29.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.29.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.29.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.30.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.30.input_layernorm.weight": "model-00001.safetensors", "model.layers.30.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.30.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.30.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.30.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.30.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.30.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.30.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.31.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.31.input_layernorm.weight": "model-00001.safetensors", "model.layers.31.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.31.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.31.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.31.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.31.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.31.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.31.mlp.up_proj.weight": "model-00001.safetensors", "model.norm.weight": "model-00001.safetensors", "lm_head.weight": "model-00001.safetensors"}}
models/Step-Audio-EditX/modeling_step1.py
ADDED
@@ -0,0 +1,414 @@
import math
from typing import Optional, Tuple, Union, List

import torch
import torch.utils.checkpoint
from torch import nn
from transformers.generation import GenerationMixin

from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from .configuration_step1 import Step1Config
from transformers.cache_utils import Cache, DynamicCache
from einops import rearrange
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
)

logger = logging.get_logger(__name__)


def build_alibi_cache(block_size, n_heads, dtype, device):
    # get slopes
    n = 2 ** math.floor(math.log2(n_heads))  # nearest 2**n to n_heads
    m0 = 2.0 ** (-8.0 / n)
    # 2^(-8/n), 2^(-8*2/n), 2^(-8*3/n), ...
    slopes = torch.pow(m0, torch.arange(1, n + 1))
    if n < n_heads:
        m1 = 2.0 ** (-4.0 / n)
        # 2^(-8/(2n)), 2^(-8*3/(2n)), 2^(-8*5/(2n)), ...
        mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2))
        slopes = torch.cat([slopes, mm])
    slopes = slopes.to(device)

    tril = torch.tril(torch.ones(1, 1, block_size, block_size, device=device))

    bias_rows = torch.arange(block_size, device=device).view(1, -1)
    bias_cols = torch.arange(block_size, device=device).view(-1, 1)
    bias = -torch.sqrt(bias_cols - bias_rows)
    bias = bias.view(1, block_size, block_size) * slopes.view(-1, 1, 1)
    bias = bias.masked_fill(tril == 0, float("-inf"))

    return bias.type(dtype)


class StepRMSNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps=1e-5):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor):
        var = x.float().pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(var + self.eps).to(x.dtype)
        x = x * self.weight
        return x


class StepAttention(torch.nn.Module):
    def __init__(self, hidden_size, num_heads, num_groups, layer_idx: int):
        super().__init__()

        self.num_heads = num_heads
        self.num_groups = num_groups
        self.hidden_size = hidden_size
        self.head_dim = hidden_size // num_heads

        self.q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False)
        self.k_proj = torch.nn.Linear(
            hidden_size, num_groups * self.head_dim, bias=False
        )
        self.v_proj = torch.nn.Linear(
            hidden_size, num_groups * self.head_dim, bias=False
        )
        self.o_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False)

        self.layer_idx = layer_idx

    def flash_attn_func(self, q, k, v, dropout_p=0.0, softmax_scale=None, causal=True,
                        return_attn_probs=False, tp_group_rank=0, tp_group_size=1):
        softmax_scale = q.size(-1) ** (-0.5) if softmax_scale is None else softmax_scale
        return torch.ops.Optimus.fwd(q, k, v, None, dropout_p, softmax_scale, causal, return_attn_probs, None, tp_group_rank, tp_group_size)[0]

    def forward(
        self,
        x: torch.Tensor,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ):

        q: torch.Tensor = self.q_proj(x)
        k: torch.Tensor = self.k_proj(x)
        v: torch.Tensor = self.v_proj(x)
        if past_key_value is not None:
            cache_kwargs = {"cache_position": cache_position}
            k, v = past_key_value.update(k, v, self.layer_idx, cache_kwargs)

        q = rearrange(q, "b s (h d) -> b s h d", h=self.num_heads)
        k = rearrange(k, "b s (g d) -> b s g d", g=self.num_groups)
        v = rearrange(v, "b s (g d) -> b s g d", g=self.num_groups)

        try:
            if self.head_dim not in (64, 128):
                raise ValueError("head_dim must be 64 or 128")
            attn_output = self.flash_attn_func(q, k, v)
            attn_output = attn_output.flatten(-2, -1)
        except:
            k = k.repeat_interleave(self.num_heads // self.num_groups, dim=-2)
            v = v.repeat_interleave(self.num_heads // self.num_groups, dim=-2)

            attention_mask = build_alibi_cache(
                k.size(1), self.num_heads, dtype=q.dtype, device=q.device
            )[:, :, -q.size(1) :, :].contiguous()

            q = q.transpose(1, 2)
            k = k.transpose(1, 2)
            v = v.transpose(1, 2)

            attn_output: torch.Tensor = torch.nn.functional.scaled_dot_product_attention(
                q, k, v, attn_mask=attention_mask
            )

            attn_output = attn_output.transpose(1, 2).flatten(-2, -1)

        out = self.o_proj(attn_output)
        return out, None  # attn weights are not returned


class StepMLP(torch.nn.Module):
    def __init__(self, hidden_size, intermediate_size):
        super().__init__()
        self.gate_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, x):
        gate = self.gate_proj(x)
        up = self.up_proj(x)
        x = torch.nn.functional.silu(gate) * up
        x = self.down_proj(x)
        return x


class StepLayer(torch.nn.Module):
    def __init__(self, config: Step1Config, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.self_attn = StepAttention(
            hidden_size=config.hidden_size,
            num_heads=config.num_attention_heads,
            num_groups=config.num_attention_groups,
            layer_idx=layer_idx,
        )
        self.mlp = StepMLP(
            hidden_size=config.hidden_size,
            intermediate_size=config.intermediate_size,
        )
        self.input_layernorm = StepRMSNorm(
            hidden_size=config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = StepRMSNorm(
            hidden_size=config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, self_attn_weights = self.self_attn(hidden_states, past_key_value, attention_mask, cache_position)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states, )
        if output_attentions:
            outputs += (self_attn_weights,)
        return outputs


class StepPreTrainedModel(PreTrainedModel):
    config_class = Step1Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["StepLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class Step1Model(StepPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`StepLayer`]

    Args:
        config: Step1Config
    """

    def __init__(self, config: Step1Config):
        super().__init__(config)
        self.config = config
        self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size)

        self.layers = torch.nn.Sequential(
            *[
                StepLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )

        self.norm = StepRMSNorm(
            hidden_size=config.hidden_size, eps=config.rms_norm_eps
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )

        causal_mask = attention_mask

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                past_key_value=past_key_values,
                cache_position=cache_position,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        output = BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=None,
        )
        return output if return_dict else output.to_tuple()


class Step1ForCausalLM(StepPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = Step1Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss

        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
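Checkpoints that ship custom modeling code like the file above are normally loaded through the Auto classes with `trust_remote_code=True` rather than by importing the module directly. A minimal sketch, not taken from the repository itself: the local path is hypothetical, it assumes the repo's config maps the Auto classes to this code via `auto_map` (as is typical for trust-remote-code checkpoints), and `einops` must be installed because the module imports it.

```python
# Hedged loading sketch for the custom Step1 architecture (hypothetical path).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "./models/Step-Audio-EditX"  # local clone of this repository
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=8)
print(tokenizer.decode(output[0]))
```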
models/Step-Audio-EditX/source.txt
ADDED
@@ -0,0 +1 @@
https://huggingface.co/stepfun-ai/Step-Audio-EditX
models/Step-Audio-EditX/tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:25e122d9205d035033a9994c4d46a6a1b467a938654e4178fc0e5f4f5d610674
size 1264044
models/Step-Audio-EditX/tokenizer_config.json
ADDED
@@ -0,0 +1,14 @@
{
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
  "legacy": false,
  "model_max_length": 65536,
  "pad_token": "<unk>",
  "padding_side": "left",
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
}
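This is a standard SentencePiece `LlamaTokenizer` configuration; the `padding_side: "left"` and `pad_token: "<unk>"` settings matter when batching prompts for decoder-only generation. A minimal sketch, assuming a hypothetical local clone of the repository and that `sentencepiece` is installed:

```python
# Hedged sketch: the config above left-pads batched prompts, which is what
# decoder-only generation expects.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./models/Step-Audio-EditX")
batch = tok(["short prompt", "a somewhat longer prompt"], padding=True, return_tensors="pt")
print(tok.padding_side, batch.input_ids.shape)  # "left", padded to the longest prompt
```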
models/Step-Audio-R1-NVFP4A16/.gitattributes
ADDED
@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text
models/Step-Audio-R1-NVFP4A16/README.md
ADDED
@@ -0,0 +1,216 @@
---
license: apache-2.0
pipeline_tag: audio-text-to-text
library_name: transformers
tags:
- audio-reasoning
- chain-of-thought
- multi-modal
- step-audio-r1
---
## Step-Audio-R1-NVFP4A16 (Quantized)

This is a **quantized version** of Step-Audio-R1 using NVFP4A16 quantization via [LLM Compressor](https://github.com/vllm-project/llm-compressor).

### Quantization Details

- **Scheme**: NVFP4A16 (FP4 weights with FP16 activations)
- **Target layers**: All Linear layers (except `encoder`, `adapter`, `lm_head`)
- **Group size**: 16
- **Method**: Post-Training Quantization (PTQ)

### Quantization Code
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

MODEL_ID = "stepfun-ai/Step-Audio-R1"

# Load model
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme
# Quantize weights to FP4 with group size 16 via PTQ
recipe = QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head", "re:encoder.*", "re:adapter.*"])

# Apply quantization
oneshot(model=model, recipe=recipe)

# Save to disk in compressed-tensors format
SAVE_DIR = "Step-Audio-R1-NVFP4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
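The saved checkpoint can be reloaded with the standard `transformers` API. A minimal sketch, not part of the original card: it assumes the `compressed-tensors` package is installed so the NVFP4A16 weights can be handled, and uses the local directory name from the recipe above. Serving through the customized vLLM image described below remains the documented path for audio input.

```python
# Hedged reload sketch: load the compressed checkpoint saved by the recipe above.
# Assumes `pip install compressed-tensors`; the custom Step-Audio code path
# still requires trust_remote_code=True.
from transformers import AutoModelForCausalLM, AutoTokenizer

SAVE_DIR = "Step-Audio-R1-NVFP4A16"
model = AutoModelForCausalLM.from_pretrained(
    SAVE_DIR, torch_dtype="auto", device_map="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR, trust_remote_code=True)
```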


## Step-Audio-R1

✨ [Demo Page](https://stepaudiollm.github.io/step-audio-r1/)
| 🎮 [Playground](https://huggingface.co/spaces/stepfun-ai/Step-Audio-R1)
| 🌟 [GitHub](https://github.com/stepfun-ai/Step-Audio-R1)
| 📑 [Paper](https://arxiv.org/abs/2511.15848)

Step-Audio-R1 is the **first audio language model to successfully unlock Chain-of-Thought (CoT) reasoning**.
It decisively solves the "inverted scaling" problem that plagues existing models, where performance degrades
with longer reasoning. Step-Audio-R1 is the first model to demonstrate that for audio, like text and vision,
allocating more compute at test time predictably improves performance.

We found the root cause of this anomaly: models were engaging in **textual surrogate reasoning**
(analyzing transcripts, not audio) due to a modality mismatch. To solve this, we introduce
**Modality-Grounded Reasoning Distillation (MGRD)**, an iterative training framework that shifts the model's
reasoning from textual abstractions to acoustic properties.

This new approach allows us to create **Step-Audio-R1**, which:
- Is the **first audio reasoning model** that successfully benefits from test-time compute scaling.
- Surpasses **Gemini 2.5 Pro** and is comparable to **Gemini 3** across major audio reasoning tasks.
- Transforms extended deliberation from a liability into a **powerful asset** for audio intelligence.

## Features
- **Chain-of-Thought (CoT) Reasoning**
  - First audio language model to successfully unlock Chain-of-Thought reasoning capabilities.
  - Generates audio-relevant reasoning chains that genuinely ground themselves in acoustic features.

- **Modality-Grounded Reasoning Distillation (MGRD)**
  - Innovative iterative training framework that shifts reasoning from textual abstractions to acoustic properties.
  - Solves the modality mismatch problem that caused textual surrogate reasoning in previous models.

- **Superior Performance**
  - Surpasses **Gemini 2.5 Pro** across comprehensive audio understanding and reasoning benchmarks.
  - Comparable to **Gemini 3** across major audio reasoning tasks.
  - Surpasses **Qwen3** in textual reasoning.
  - Covers speech, environmental sounds, and music domains.


For more examples, see the [demo page](https://stepaudiollm.github.io/step-audio-r1/).

## Model Usage
### 📜 Requirements
- **GPU**: NVIDIA GPUs with CUDA support (tested on 4×L40S/H100/H800/H20).
- **Operating System**: Linux.
- **Python**: >= 3.10.0.

### ⬇️ Download Model
First, download the Step-Audio-R1 model weights.

**Method A · Git LFS**
```bash
git lfs install
git clone https://huggingface.co/stepfun-ai/Step-Audio-R1
```

**Method B · Hugging Face CLI**
```bash
hf download stepfun-ai/Step-Audio-R1 --local-dir ./Step-Audio-R1
```

### 🚀 Deployment and Execution
We provide two ways to serve the model: Docker (recommended) or compiling the customized vLLM backend.

#### 🐳 Method 1 · Run with Docker (Recommended)

A customized vLLM image is required.

1. **Pull the image**:
```bash
docker pull stepfun2025/vllm:step-audio-2-v20250909
```
2. **Start the service**:
Assuming the model has been downloaded into the `Step-Audio-R1` folder in the current directory:

```bash
docker run --rm -ti --gpus all \
-v $(pwd)/Step-Audio-R1:/Step-Audio-R1 \
-p 9999:9999 \
stepfun2025/vllm:step-audio-2-v20250909 \
-- vllm serve /Step-Audio-R1 \
--served-model-name Step-Audio-R1 \
--port 9999 \
--max-model-len 16384 \
--max-num-seqs 32 \
--tensor-parallel-size 4 \
--chat-template '{%- macro render_content(content) -%}{%- if content is string -%}{{- content.replace("<audio_patch>\n", "<audio_patch>") -}}{%- elif content is mapping -%}{{- content['"'"'value'"'"'] if '"'"'value'"'"' in content else content['"'"'text'"'"'] -}}{%- elif content is iterable -%}{%- for item in content -%}{%- if item.type == '"'"'text'"'"' -%}{{- item['"'"'value'"'"'] if '"'"'value'"'"' in item else item['"'"'text'"'"'] -}}{%- elif item.type == '"'"'audio'"'"' -%}<audio_patch>{%- endif -%}{%- endfor -%}{%- endif -%}{%- endmacro -%}{%- if tools -%}{{- '"'"'<|BOT|>system\n'"'"' -}}{%- if messages[0]['"'"'role'"'"'] == '"'"'system'"'"' -%}{{- render_content(messages[0]['"'"'content'"'"']) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{{- '"'"'<|BOT|>tool_json_schemas\n'"'"' + tools|tojson + '"'"'<|EOT|>'"'"' -}}{%- else -%}{%- if messages[0]['"'"'role'"'"'] == '"'"'system'"'"' -%}{{- '"'"'<|BOT|>system\n'"'"' + render_content(messages[0]['"'"'content'"'"']) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message["role"] == "user" -%}{{- '"'"'<|BOT|>human\n'"'"' + render_content(message["content"]) + '"'"'<|EOT|>'"'"' -}}{%- elif message["role"] == "assistant" -%}{{- '"'"'<|BOT|>assistant\n'"'"' + (render_content(message["content"]) if message["content"] else '"'"''"'"') -}}{%- set is_last_assistant = true -%}{%- for m in messages[loop.index:] -%}{%- if m["role"] == "assistant" -%}{%- set is_last_assistant = false -%}{%- endif -%}{%- endfor -%}{%- if not is_last_assistant -%}{{- '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- elif message["role"] == "function_output" -%}{%- else -%}{%- if not (loop.first and message["role"] == "system") -%}{{- '"'"'<|BOT|>'"'"' + message["role"] + '"'"'\n'"'"' + render_content(message["content"]) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- '"'"'<|BOT|>assistant\n<think>\n'"'"' -}}{%- endif -%}' \
--enable-log-requests \
--interleave-mm-strings \
--trust-remote-code
```
After the service starts, it will listen on `localhost:9999`.

#### 🐳 Method 2 · Run from Source (Compile vLLM)
Step-Audio-R1 requires a customized vLLM backend.

1. **Download Source Code**:
```bash
git clone https://github.com/stepfun-ai/vllm.git
cd vllm
```

2. **Prepare Environment**:
```bash
python3 -m venv .venv
source .venv/bin/activate
```

3. **Install and Compile**:
vLLM contains both C++ and Python code. We mainly modified the Python code, so the C++ part can use the pre-compiled version to speed up the process.

```bash
# Use pre-compiled C++ extensions (Recommended)
VLLM_USE_PRECOMPILED=1 pip install -e .
```

4. **Switch Branch**:
After compilation, switch to the branch that supports Step-Audio.
```bash
git checkout step-audio-2-mini
```

5. **Start the Service**:
```bash
# Ensure you are in the vllm directory and the virtual environment is activated
source .venv/bin/activate

python3 -m vllm.entrypoints.openai.api_server \
--model ../Step-Audio-R1 \
--served-model-name Step-Audio-R1 \
--port 9999 \
--host 0.0.0.0 \
--max-model-len 65536 \
--max-num-seqs 128 \
--tensor-parallel-size 4 \
--gpu-memory-utilization 0.85 \
--trust-remote-code \
--enable-log-requests \
--interleave-mm-strings \
--chat-template '{%- macro render_content(content) -%}{%- if content is string -%}{{- content.replace("<audio_patch>\n", "<audio_patch>") -}}{%- elif content is mapping -%}{{- content['"'"'value'"'"'] if '"'"'value'"'"' in content else content['"'"'text'"'"'] -}}{%- elif content is iterable -%}{%- for item in content -%}{%- if item.type == '"'"'text'"'"' -%}{{- item['"'"'value'"'"'] if '"'"'value'"'"' in item else item['"'"'text'"'"'] -}}{%- elif item.type == '"'"'audio'"'"' -%}<audio_patch>{%- endif -%}{%- endfor -%}{%- endif -%}{%- endmacro -%}{%- if tools -%}{{- '"'"'<|BOT|>system\n'"'"' -}}{%- if messages[0]['"'"'role'"'"'] == '"'"'system'"'"' -%}{{- render_content(messages[0]['"'"'content'"'"']) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{{- '"'"'<|BOT|>tool_json_schemas\n'"'"' + tools|tojson + '"'"'<|EOT|>'"'"' -}}{%- else -%}{%- if messages[0]['"'"'role'"'"'] == '"'"'system'"'"' -%}{{- '"'"'<|BOT|>system\n'"'"' + render_content(messages[0]['"'"'content'"'"']) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message["role"] == "user" -%}{{- '"'"'<|BOT|>human\n'"'"' + render_content(message["content"]) + '"'"'<|EOT|>'"'"' -}}{%- elif message["role"] == "assistant" -%}{{- '"'"'<|BOT|>assistant\n'"'"' + (render_content(message["content"]) if message["content"] else '"'"''"'"') -}}{%- set is_last_assistant = true -%}{%- for m in messages[loop.index:] -%}{%- if m["role"] == "assistant" -%}{%- set is_last_assistant = false -%}{%- endif -%}{%- endfor -%}{%- if not is_last_assistant -%}{{- '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- elif message["role"] == "function_output" -%}{%- else -%}{%- if not (loop.first and message["role"] == "system") -%}{{- '"'"'<|BOT|>'"'"' + message["role"] + '"'"'\n'"'"' + render_content(message["content"]) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- '"'"'<|BOT|>assistant\n<think>\n'"'"' -}}{%- endif -%}'
```

After the service starts, it will listen on `localhost:9999`.


### 🧪 Client Examples

Get the example code and run it:
```bash
# Clone the repository containing example scripts
git clone https://github.com/stepfun-ai/Step-Audio-R1.git r1-scripts

# Run the example
cd r1-scripts
python examples-vllm_r1.py
```
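The server exposes an OpenAI-compatible endpoint, so it can also be queried directly. A minimal text-only sketch (not from the original card), assuming the `openai` Python package; the audio-input message format is defined by the example scripts above and is not shown here:

```python
# Hedged client sketch against the local vLLM server started above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9999/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="Step-Audio-R1",
    messages=[{"role": "user", "content": "Summarize what you can do in one sentence."}],
    max_tokens=256,
)
print(response.choices[0].message.content)
```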


## Citation

```
@article{tian2025step,
  title={Step-Audio-R1 Technical Report},
  author={Tian, Fei and Zhang, Xiangyu Tony and Zhang, Yuxin and Zhang, Haoyang and Li, Yuxin and Liu, Daijiao and Deng, Yayue and Wu, Donghang and Chen, Jun and Zhao, Liang and others},
  journal={arXiv preprint arXiv:2511.15848},
  year={2025}
}

```
models/Step-Audio-R1-NVFP4A16/added_tokens.json
ADDED
The diff for this file is too large to render.
See raw diff
models/Step-Audio-R1-NVFP4A16/chat_template.jinja
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{%- if tools %}
{{- '<|BOT|>system
' }}
{%- if messages[0]['role'] == 'system' %}
{{- messages[0]['content'] + '<|EOT|>' }}
{%- else %}
{{- 'You are a helpful assistant. Please think step by step and provide your reasoning process within <think> </think> tags, followed by your final answer. Format: <think>your reasoning here</think>your final answer<|EOT|>' }}
{%- endif %}
{{- '<|BOT|>' }}
{{- "tool_json_schemas
" }}
{{- tools | tojson }}
{{- '<|EOT|>' }}
{%- else %}
{%- if messages[0]['role'] == 'system' %}
{{- '<|BOT|>system
' + messages[0]['content'] + '<|EOT|>' }}
{%- else %}
{{- '<|BOT|>system
You are a helpful assistant. Please think step by step and provide your reasoning process within <think> </think> tags, followed by your final answer. Format: <think>your reasoning here</think>your final answer<|EOT|>' }}
{%- endif %}
{%- endif %}
{%- for message in messages %}
{%- if message["role"] == "user" %}
{{- '<|BOT|>human
' + message["content"] + '<|EOT|>' }}
{%- elif (message["role"] == "system" and not loop.first) or (message["role"] == "assistant" and not message["tool_calls"]) %}
{{- '<|BOT|>' + message["role"] + '
' + message["content"] + '<|EOT|>' }}
{%- elif message["role"] == "assistant" %}
{{- '<|BOT|>' + message["role"] + '
' }}
{%- if message["content"] %}
{{- message["content"] }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call["function"] is defined %}
{%- set tool_call = tool_call["function"] %}
{%- endif %}
{{- '<|CALL_START|>' + 'function
' + tool_call["name"] + '
' }}
{{- tool_call["arguments"] | tojson }}
{{- '<|CALL_END|>' }}
{%- endfor %}
{{- '<|EOT|>' }}
{%- elif message["role"] == "tool" %}
{{- '<|BOT|>' }}
{%- set ns = namespace(function_name="tool") %}
{%- if message["tool_call_id"] %}
{%- for prev_msg in messages %}
{%- if prev_msg["role"] == "assistant" and prev_msg["tool_calls"] %}
{%- for tool_call in prev_msg["tool_calls"] %}
{%- if tool_call["id"] == message["tool_call_id"] %}
{%- if tool_call["function"] is defined %}
{%- set ns.function_name = tool_call["function"]["name"] %}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- 'function_output
' + ns.function_name + '
' }}
{{- message["content"] }}
{{- '<|EOT|>' }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|BOT|>assistant
<think>
' }}
{%- endif %}

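For context on the template just added: it wraps every turn in `<|BOT|>role ... <|EOT|>` markers, renders user turns under the role name `human`, and with `add_generation_prompt=True` ends the prompt with `<|BOT|>assistant` followed by an opened `<think>` tag, so decoding starts inside the reasoning span. A minimal sketch of exercising it, assuming a local copy of this folder (the path below is a placeholder) and a `transformers` version recent enough to pick up `chat_template.jinja`:

```python
from transformers import AutoTokenizer

# Placeholder local path to this model folder; substitute your own checkout.
tok = AutoTokenizer.from_pretrained("models/Step-Audio-R1-NVFP4A16", trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

# add_generation_prompt=True appends '<|BOT|>assistant\n<think>\n',
# matching the tail of the template above.
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```
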
models/Step-Audio-R1-NVFP4A16/config.json
ADDED
@@ -0,0 +1,348 @@
{
  "architectures": [
    "StepAudio2ForCausalLM"
  ],
  "audio_encoder_config": {
    "adapter_stride": 2,
    "kernel_size": 3,
    "llm_dim": 5120,
    "model_type": "step_audio_2_encoder",
    "n_audio_ctx": 1500,
    "n_audio_head": 20,
    "n_audio_layer": 32,
    "n_audio_state": 1280,
    "n_codebook_size": 4096,
    "n_mels": 128
  },
  "auto_map": {
    "AutoConfig": "configuration_step_audio_2.StepAudio2Config",
    "AutoModelForCausalLM": "modeling_step_audio_2.StepAudio2ForCausalLM"
  },
  "dtype": "bfloat16",
  "max_window_layers": null,
  "model_type": "step_audio_2",
  "quantization_config": {
    "config_groups": {
      "group_0": {
        "format": "nvfp4-pack-quantized",
        "input_activations": null,
        "output_activations": null,
        "targets": [
          "Linear"
        ],
        "weights": {
          "actorder": null,
          "block_structure": null,
          "dynamic": false,
          "group_size": 16,
          "num_bits": 4,
          "observer": "minmax",
          "observer_kwargs": {},
          "strategy": "tensor_group",
          "symmetric": true,
          "type": "float"
        }
      }
    },
    "format": "nvfp4-pack-quantized",
    "global_compression_ratio": null,
    "ignore": [
      "encoder.blocks.0.attn.query", "encoder.blocks.0.attn.key", "encoder.blocks.0.attn.value", "encoder.blocks.0.attn.out", "encoder.blocks.0.mlp.0", "encoder.blocks.0.mlp.2",
      "encoder.blocks.1.attn.query", "encoder.blocks.1.attn.key", "encoder.blocks.1.attn.value", "encoder.blocks.1.attn.out", "encoder.blocks.1.mlp.0", "encoder.blocks.1.mlp.2",
      "encoder.blocks.2.attn.query", "encoder.blocks.2.attn.key", "encoder.blocks.2.attn.value", "encoder.blocks.2.attn.out", "encoder.blocks.2.mlp.0", "encoder.blocks.2.mlp.2",
      "encoder.blocks.3.attn.query", "encoder.blocks.3.attn.key", "encoder.blocks.3.attn.value", "encoder.blocks.3.attn.out", "encoder.blocks.3.mlp.0", "encoder.blocks.3.mlp.2",
      "encoder.blocks.4.attn.query", "encoder.blocks.4.attn.key", "encoder.blocks.4.attn.value", "encoder.blocks.4.attn.out", "encoder.blocks.4.mlp.0", "encoder.blocks.4.mlp.2",
      "encoder.blocks.5.attn.query", "encoder.blocks.5.attn.key", "encoder.blocks.5.attn.value", "encoder.blocks.5.attn.out", "encoder.blocks.5.mlp.0", "encoder.blocks.5.mlp.2",
      "encoder.blocks.6.attn.query", "encoder.blocks.6.attn.key", "encoder.blocks.6.attn.value", "encoder.blocks.6.attn.out", "encoder.blocks.6.mlp.0", "encoder.blocks.6.mlp.2",
      "encoder.blocks.7.attn.query", "encoder.blocks.7.attn.key", "encoder.blocks.7.attn.value", "encoder.blocks.7.attn.out", "encoder.blocks.7.mlp.0", "encoder.blocks.7.mlp.2",
      "encoder.blocks.8.attn.query", "encoder.blocks.8.attn.key", "encoder.blocks.8.attn.value", "encoder.blocks.8.attn.out", "encoder.blocks.8.mlp.0", "encoder.blocks.8.mlp.2",
      "encoder.blocks.9.attn.query", "encoder.blocks.9.attn.key", "encoder.blocks.9.attn.value", "encoder.blocks.9.attn.out", "encoder.blocks.9.mlp.0", "encoder.blocks.9.mlp.2",
      "encoder.blocks.10.attn.query", "encoder.blocks.10.attn.key", "encoder.blocks.10.attn.value", "encoder.blocks.10.attn.out", "encoder.blocks.10.mlp.0", "encoder.blocks.10.mlp.2",
      "encoder.blocks.11.attn.query", "encoder.blocks.11.attn.key", "encoder.blocks.11.attn.value", "encoder.blocks.11.attn.out", "encoder.blocks.11.mlp.0", "encoder.blocks.11.mlp.2",
      "encoder.blocks.12.attn.query", "encoder.blocks.12.attn.key", "encoder.blocks.12.attn.value", "encoder.blocks.12.attn.out", "encoder.blocks.12.mlp.0", "encoder.blocks.12.mlp.2",
      "encoder.blocks.13.attn.query", "encoder.blocks.13.attn.key", "encoder.blocks.13.attn.value", "encoder.blocks.13.attn.out", "encoder.blocks.13.mlp.0", "encoder.blocks.13.mlp.2",
      "encoder.blocks.14.attn.query", "encoder.blocks.14.attn.key", "encoder.blocks.14.attn.value", "encoder.blocks.14.attn.out", "encoder.blocks.14.mlp.0", "encoder.blocks.14.mlp.2",
      "encoder.blocks.15.attn.query", "encoder.blocks.15.attn.key", "encoder.blocks.15.attn.value", "encoder.blocks.15.attn.out", "encoder.blocks.15.mlp.0", "encoder.blocks.15.mlp.2",
      "encoder.blocks.16.attn.query", "encoder.blocks.16.attn.key", "encoder.blocks.16.attn.value", "encoder.blocks.16.attn.out", "encoder.blocks.16.mlp.0", "encoder.blocks.16.mlp.2",
      "encoder.blocks.17.attn.query", "encoder.blocks.17.attn.key", "encoder.blocks.17.attn.value", "encoder.blocks.17.attn.out", "encoder.blocks.17.mlp.0", "encoder.blocks.17.mlp.2",
      "encoder.blocks.18.attn.query", "encoder.blocks.18.attn.key", "encoder.blocks.18.attn.value", "encoder.blocks.18.attn.out", "encoder.blocks.18.mlp.0", "encoder.blocks.18.mlp.2",
      "encoder.blocks.19.attn.query", "encoder.blocks.19.attn.key", "encoder.blocks.19.attn.value", "encoder.blocks.19.attn.out", "encoder.blocks.19.mlp.0", "encoder.blocks.19.mlp.2",
      "encoder.blocks.20.attn.query", "encoder.blocks.20.attn.key", "encoder.blocks.20.attn.value", "encoder.blocks.20.attn.out", "encoder.blocks.20.mlp.0", "encoder.blocks.20.mlp.2",
      "encoder.blocks.21.attn.query", "encoder.blocks.21.attn.key", "encoder.blocks.21.attn.value", "encoder.blocks.21.attn.out", "encoder.blocks.21.mlp.0", "encoder.blocks.21.mlp.2",
      "encoder.blocks.22.attn.query", "encoder.blocks.22.attn.key", "encoder.blocks.22.attn.value", "encoder.blocks.22.attn.out", "encoder.blocks.22.mlp.0", "encoder.blocks.22.mlp.2",
      "encoder.blocks.23.attn.query", "encoder.blocks.23.attn.key", "encoder.blocks.23.attn.value", "encoder.blocks.23.attn.out", "encoder.blocks.23.mlp.0", "encoder.blocks.23.mlp.2",
      "encoder.blocks.24.attn.query", "encoder.blocks.24.attn.key", "encoder.blocks.24.attn.value", "encoder.blocks.24.attn.out", "encoder.blocks.24.mlp.0", "encoder.blocks.24.mlp.2",
      "encoder.blocks.25.attn.query", "encoder.blocks.25.attn.key", "encoder.blocks.25.attn.value", "encoder.blocks.25.attn.out", "encoder.blocks.25.mlp.0", "encoder.blocks.25.mlp.2",
      "encoder.blocks.26.attn.query", "encoder.blocks.26.attn.key", "encoder.blocks.26.attn.value", "encoder.blocks.26.attn.out", "encoder.blocks.26.mlp.0", "encoder.blocks.26.mlp.2",
      "encoder.blocks.27.attn.query", "encoder.blocks.27.attn.key", "encoder.blocks.27.attn.value", "encoder.blocks.27.attn.out", "encoder.blocks.27.mlp.0", "encoder.blocks.27.mlp.2",
      "encoder.blocks.28.attn.query", "encoder.blocks.28.attn.key", "encoder.blocks.28.attn.value", "encoder.blocks.28.attn.out", "encoder.blocks.28.mlp.0", "encoder.blocks.28.mlp.2",
      "encoder.blocks.29.attn.query", "encoder.blocks.29.attn.key", "encoder.blocks.29.attn.value", "encoder.blocks.29.attn.out", "encoder.blocks.29.mlp.0", "encoder.blocks.29.mlp.2",
      "encoder.blocks.30.attn.query", "encoder.blocks.30.attn.key", "encoder.blocks.30.attn.value", "encoder.blocks.30.attn.out", "encoder.blocks.30.mlp.0", "encoder.blocks.30.mlp.2",
      "encoder.blocks.31.attn.query", "encoder.blocks.31.attn.key", "encoder.blocks.31.attn.value", "encoder.blocks.31.attn.out", "encoder.blocks.31.mlp.0", "encoder.blocks.31.mlp.2",
      "adapter.linear1",
      "adapter.linear2",
      "lm_head"
    ],
    "kv_cache_scheme": null,
    "quant_method": "compressed-tensors",
    "quantization_status": "compressed",
    "sparsity_config": {},
    "transform_config": {},
    "version": "0.12.2"
  },
  "sliding_window": 2048,
  "text_config": {
    "architectures": [
      "Qwen2ForCausalLM"
    ],
    "attention_dropout": 0.0,
    "dtype": "bfloat16",
    "hidden_act": "silu",
    "hidden_size": 5120,
    "initializer_range": 0.02,
    "intermediate_size": 27648,
    "layer_types": [
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention",
      "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention"
    ],
    "max_position_embeddings": 65536,
    "max_window_layers": 28,
    "model_type": "qwen2",
    "num_attention_heads": 40,
    "num_hidden_layers": 64,
    "num_attention_groups": 8,
    "num_key_value_heads": 8,
    "rms_norm_eps": 1e-05,
    "rope_scaling": null,
    "rope_theta": 1000000.0,
    "sliding_window": null,
    "use_cache": true,
    "use_sliding_window": false,
    "vocab_size": 158720
  },
  "tie_word_embeddings": false,
  "transformers_version": "4.56.2",
  "use_sliding_window": false
}

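The `quantization_config` just added declares `compressed-tensors` NVFP4 weight quantization (4-bit float weights, `group_size` 16) for `Linear` layers, while the audio encoder blocks, the adapter, and `lm_head` are left unquantized; the text backbone is a 64-layer Qwen2 at hidden size 5120. A minimal loading sketch, assuming a local copy of this folder (placeholder path) and an installed `transformers` build with `compressed-tensors` support; the custom classes are resolved through the `auto_map` entries via `trust_remote_code`:

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM

path = "models/Step-Audio-R1-NVFP4A16"  # placeholder local path to this folder

# auto_map above routes AutoConfig / AutoModelForCausalLM to the custom
# StepAudio2Config / StepAudio2ForCausalLM classes shipped in the repo.
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
print(config.model_type)  # expected: "step_audio_2"

model = AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,  # non-quantized tensors stay in bf16
    device_map="auto",
    trust_remote_code=True,
)
```
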
models/Step-Audio-R1-NVFP4A16/configuration_step_audio_2.py
ADDED
@@ -0,0 +1,128 @@
from typing import Optional, Union

from transformers import Qwen2Config
from transformers.configuration_utils import PretrainedConfig


class StepAudio2EncoderConfig(PretrainedConfig):
    model_type = "step_audio_2_encoder"

    def __init__(
        self,
        n_mels=128,
        n_audio_ctx=1500,
        n_audio_state=512,
        n_audio_head=8,
        n_audio_layer=6,
        llm_dim=4096,
        kernel_size=3,
        adapter_stride=2,
        **kwargs,
    ):
        self.n_mels = n_mels
        self.n_audio_ctx = n_audio_ctx
        self.n_audio_state = n_audio_state
        self.n_audio_head = n_audio_head
        self.n_audio_layer = n_audio_layer
        self.llm_dim = llm_dim
        self.kernel_size = kernel_size
        self.adapter_stride = adapter_stride
        super().__init__(**kwargs)


class StepAudio2TextConfig(PretrainedConfig):
    model_type = "step_audio_2_text"

    def __init__(
        self,
        vocab_size=64012,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=48,
        num_attention_heads=32,
        num_attention_groups=4,
        num_key_value_heads=4,
        hidden_act="silu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        rope_theta=1000000.0,
        rope_scaling=None,
        eos_token_id=None,
        **kwargs
    ):
        if eos_token_id is not None:
            if isinstance(eos_token_id, list):
                eos_token_id = list(set([151643, 151645, 151665] + eos_token_id))
            else:
                eos_token_id = [151643, 151645, 151665, eos_token_id]
        else:
            eos_token_id = [151643, 151645, 151665]

        super().__init__(
            eos_token_id=eos_token_id,
            **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_key_value_heads = num_key_value_heads
        assert self.num_attention_groups == self.num_key_value_heads, "num_attention_groups must be equal to num_key_value_heads"
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        self.text_config = Qwen2Config(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            architectures=["Qwen2ForCausalLM"],
            torch_dtype=getattr(self, "torch_dtype", "bfloat16"),
        )


class StepAudio2Config(PretrainedConfig):
    model_type = "step_audio_2"
    architectures = ["StepAudio2ForCausalLM"]

    def __init__(
        self,
        audio_encoder_config: Optional[Union[dict, StepAudio2EncoderConfig]] = None,
        text_config: Optional[Union[dict, StepAudio2TextConfig]] = None,
        use_sliding_window: bool = False,
        sliding_window: Optional[int] = 2048,
        max_window_layers: Optional[int] = None,
        **kwargs
    ):
        kwargs.setdefault("use_sliding_window", use_sliding_window)
        kwargs.setdefault("sliding_window", sliding_window)
        if max_window_layers is None:
            max_window_layers = kwargs.get("num_hidden_layers", None)
        kwargs.setdefault("max_window_layers", max_window_layers)
        super().__init__(**kwargs)

        if text_config is None:
            text_config = StepAudio2TextConfig().text_config
        elif isinstance(text_config, dict):
            text_config = StepAudio2TextConfig(**text_config).text_config

        self.text_config = text_config

        if audio_encoder_config is None:
            self.audio_encoder_config = StepAudio2EncoderConfig()
        elif isinstance(audio_encoder_config, dict):
            self.audio_encoder_config = StepAudio2EncoderConfig(**audio_encoder_config)

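As a quick sanity check of the classes just added, the composite config can be built from plain dicts, the same way `from_pretrained` would construct it; the values below are taken from the `config.json` shown earlier rather than the constructor defaults:

```python
# Assumes configuration_step_audio_2.py from this folder is on the import path.
from configuration_step_audio_2 import StepAudio2Config, StepAudio2EncoderConfig

cfg = StepAudio2Config(
    audio_encoder_config={
        "n_audio_state": 1280, "n_audio_head": 20, "n_audio_layer": 32, "llm_dim": 5120,
    },
    text_config={
        "hidden_size": 5120, "intermediate_size": 27648, "num_hidden_layers": 64,
        "num_attention_heads": 40, "num_key_value_heads": 8, "num_attention_groups": 8,
        "vocab_size": 158720,
    },
)

assert isinstance(cfg.audio_encoder_config, StepAudio2EncoderConfig)
print(cfg.text_config.model_type)  # "qwen2": the text side is wrapped in a Qwen2Config
print(cfg.sliding_window)          # 2048, the constructor default
```
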
models/Step-Audio-R1-NVFP4A16/generation_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "_from_model_config": true,
  "bos_token_id": 151643,
  "eos_token_id": [
    151643,
    151665
  ],
  "pad_token_id": 151643,
  "transformers_version": "4.56.2"
}

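The generation defaults just added register two stop ids (151643 and 151665) alongside the pad id; decoding halts at whichever stop id appears first. A minimal check, assuming a local copy of this folder (placeholder path):

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("models/Step-Audio-R1-NVFP4A16")  # placeholder path
print(gen_cfg.eos_token_id)  # [151643, 151665]; generate() stops at whichever id comes first
print(gen_cfg.pad_token_id)  # 151643
```
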
models/Step-Audio-R1-NVFP4A16/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff

models/Step-Audio-R1-NVFP4A16/model-00001-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c6a7081330c6f8ab4c50e0f494f5bfbf97600b2e4cb6da7980ec451d2d25b6fc
size 4952370688

models/Step-Audio-R1-NVFP4A16/model-00002-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8163e602803002f0fec5eec86605dcffc1eb2a8d9c78ffff87efed340f3439f0
size 4937507688

models/Step-Audio-R1-NVFP4A16/model-00003-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:102f0220802e3c746c0c2fc4a2709f8efef538f7687e75bba2a6c6a6d86d1a24
size 4937507688

models/Step-Audio-R1-NVFP4A16/model-00004-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5242ab3703db9f7cac5239a6bbe79c771e1085706ae7886cf694f4a376341db6
size 4997822352

models/Step-Audio-R1-NVFP4A16/model-00005-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d812abb83c44af00a386341fc563d9c15cda01c08e1851ff1c904eb09d793a8
size 2291022848

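The five `.safetensors` entries above are Git LFS pointers (spec version, SHA-256 oid, byte size) rather than the weights themselves; their `size` fields sum to 22,116,231,264 bytes, roughly 22.1 GB for the sharded checkpoint. A small sketch that tallies such pointers, assuming a clone where LFS smudging is disabled so the pointer text is still on disk (placeholder path):

```python
from pathlib import Path

def parse_lfs_pointer(path: Path) -> dict:
    """Parse a Git LFS pointer file into {'version': ..., 'oid': ..., 'size': int}."""
    fields = {}
    for line in path.read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    fields["size"] = int(fields["size"])
    return fields

shard_dir = Path("models/Step-Audio-R1-NVFP4A16")  # placeholder local path
shards = sorted(shard_dir.glob("model-*-of-00005.safetensors"))
total_bytes = sum(parse_lfs_pointer(p)["size"] for p in shards)
print(f"{len(shards)} shards, {total_bytes} bytes (~{total_bytes / 1e9:.1f} GB)")
```
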
models/Step-Audio-R1-NVFP4A16/model.safetensors.index.json
ADDED
The diff for this file is too large to render.
See raw diff