niobures committed
Commit 18efded · verified · 1 Parent(s): de885ce

Step-Audio (models)

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .gitattributes +9 -0
  2. models/Step-Audio-EditX-bnb-4bit/.gitattributes +35 -0
  3. models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/FLOW_VERSION +2 -0
  4. models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/campplus.onnx +3 -0
  5. models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/cosyvoice.yaml +72 -0
  6. models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/flow.pt +3 -0
  7. models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/hift.pt +3 -0
  8. models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/speech_tokenizer_v1.onnx +3 -0
  9. models/Step-Audio-EditX-bnb-4bit/README.md +3 -0
  10. models/Step-Audio-EditX-bnb-4bit/config.json +39 -0
  11. models/Step-Audio-EditX-bnb-4bit/configuration_step1.py +43 -0
  12. models/Step-Audio-EditX-bnb-4bit/generation_config.json +7 -0
  13. models/Step-Audio-EditX-bnb-4bit/model.safetensors +3 -0
  14. models/Step-Audio-EditX-bnb-4bit/modeling_step1.py +414 -0
  15. models/Step-Audio-EditX-bnb-4bit/quantization_config.json +10 -0
  16. models/Step-Audio-EditX-bnb-4bit/source.txt +1 -0
  17. models/Step-Audio-EditX-bnb-4bit/special_tokens_map.json +30 -0
  18. models/Step-Audio-EditX-bnb-4bit/tokenizer.json +0 -0
  19. models/Step-Audio-EditX-bnb-4bit/tokenizer.model +3 -0
  20. models/Step-Audio-EditX-bnb-4bit/tokenizer_config.json +0 -0
  21. models/Step-Audio-EditX/.gitattributes +35 -0
  22. models/Step-Audio-EditX/CosyVoice-300M-25Hz/FLOW_VERSION +2 -0
  23. models/Step-Audio-EditX/CosyVoice-300M-25Hz/campplus.onnx +3 -0
  24. models/Step-Audio-EditX/CosyVoice-300M-25Hz/cosyvoice.yaml +72 -0
  25. models/Step-Audio-EditX/CosyVoice-300M-25Hz/flow.pt +3 -0
  26. models/Step-Audio-EditX/CosyVoice-300M-25Hz/hift.pt +3 -0
  27. models/Step-Audio-EditX/CosyVoice-300M-25Hz/speech_tokenizer_v1.onnx +3 -0
  28. models/Step-Audio-EditX/README.md +144 -0
  29. models/Step-Audio-EditX/config.json +22 -0
  30. models/Step-Audio-EditX/configuration_step1.py +41 -0
  31. models/Step-Audio-EditX/model-00001.safetensors +3 -0
  32. models/Step-Audio-EditX/model.safetensors.index.json +1 -0
  33. models/Step-Audio-EditX/modeling_step1.py +414 -0
  34. models/Step-Audio-EditX/source.txt +1 -0
  35. models/Step-Audio-EditX/tokenizer.model +3 -0
  36. models/Step-Audio-EditX/tokenizer_config.json +14 -0
  37. models/Step-Audio-R1-NVFP4A16/.gitattributes +36 -0
  38. models/Step-Audio-R1-NVFP4A16/README.md +216 -0
  39. models/Step-Audio-R1-NVFP4A16/added_tokens.json +0 -0
  40. models/Step-Audio-R1-NVFP4A16/chat_template.jinja +74 -0
  41. models/Step-Audio-R1-NVFP4A16/config.json +348 -0
  42. models/Step-Audio-R1-NVFP4A16/configuration_step_audio_2.py +128 -0
  43. models/Step-Audio-R1-NVFP4A16/generation_config.json +10 -0
  44. models/Step-Audio-R1-NVFP4A16/merges.txt +0 -0
  45. models/Step-Audio-R1-NVFP4A16/model-00001-of-00005.safetensors +3 -0
  46. models/Step-Audio-R1-NVFP4A16/model-00002-of-00005.safetensors +3 -0
  47. models/Step-Audio-R1-NVFP4A16/model-00003-of-00005.safetensors +3 -0
  48. models/Step-Audio-R1-NVFP4A16/model-00004-of-00005.safetensors +3 -0
  49. models/Step-Audio-R1-NVFP4A16/model-00005-of-00005.safetensors +3 -0
  50. models/Step-Audio-R1-NVFP4A16/model.safetensors.index.json +0 -0
.gitattributes CHANGED
@@ -55,3 +55,12 @@ models/Step-Audio-TTS-3B/lib/liboptimus_ths-torch2.5-cu124.cpython-310-x86_64-li
55
  models/Step-Audio-Chat/lib/liboptimus_ths-torch2.2-cu121.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
56
  models/Step-Audio-Chat/lib/liboptimus_ths-torch2.3-cu121.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
57
  models/Step-Audio-Chat/lib/liboptimus_ths-torch2.5-cu124.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
58
+ models/Step-Audio-R1-NVFP4A16/tokenizer.json filter=lfs diff=lfs merge=lfs -text
59
+ models/StepAudio2mini_BPFT/checkpoint-12000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
60
+ models/StepAudio2mini_BPFT/checkpoint-15000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
61
+ models/StepAudio2mini_BPFT/checkpoint-18000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
62
+ models/StepAudio2mini_BPFT/checkpoint-18850/tokenizer.json filter=lfs diff=lfs merge=lfs -text
63
+ models/StepAudio2mini_BPFT/checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
64
+ models/StepAudio2mini_BPFT/checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
65
+ models/StepAudio2mini_BPFT/checkpoint-9000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
66
+ models/StepAudio2mini_BPFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
models/Step-Audio-EditX-bnb-4bit/.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/FLOW_VERSION ADDED
@@ -0,0 +1,2 @@
1
+ /mnt/wby-jfs/models/train/flow_matching/flow_v2_1node_vq0206_dit_v8_fullattn_exp0227_sft_exp0408_stepaudio_sft_exp0616/model_epoch_5_whole.pt
2
+ fae53942e60310eb172b170396202069
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/campplus.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/cosyvoice.yaml ADDED
@@ -0,0 +1,72 @@
1
+ mel_conf:
2
+ num_mels: 80
3
+ n_fft: 1920
4
+ hop_size: 480
5
+ win_size: 1920
6
+ sampling_rate: 24000
7
+ fmin: 0
8
+ fmax: 8000
9
+
10
+
11
+ flow: !new:stepvocoder.cosyvoice2.flow.flow.CausalMaskedDiffWithXvec
12
+ input_size: 512
13
+ output_size: 80
14
+ spk_embed_dim: 192
15
+ output_type: 'mel'
16
+ vocab_size: 5121 # 1024(vq02) + 4096(vq06) + 1(vq02-pad)
17
+ input_embedding: !new:stepvocoder.cosyvoice2.embedding.dual_codebook.DualCodebookEmbedding
18
+ vocab_size: 5121 # 1024(vq02) + 4096(vq06) + 1(vq02-pad)
19
+ input_size: 512
20
+ encoder: !new:stepvocoder.cosyvoice2.transformer.upsample_encoder_v2.UpsampleConformerEncoderV2
21
+ input_size: 512
22
+ output_size: 512
23
+ input_layer: 'linear'
24
+ pre_lookahead_len: 3
25
+ num_blocks: 6
26
+ num_up_blocks: 4
27
+ up_stride: 2
28
+ up_scale_factor: 2
29
+ attention_heads: 8
30
+ pos_enc_layer_type: 'rel_pos_espnet'
31
+ selfattention_layer_type: 'rel_selfattn'
32
+ key_bias: true
33
+ linear_units: 2048
34
+ dropout_rate: 0.1
35
+ positional_dropout_rate: 0.1
36
+ attention_dropout_rate: 0.1
37
+ normalize_before: True
38
+ decoder: !new:stepvocoder.cosyvoice2.flow.flow_matching.CausalConditionalCFM
39
+ inference_cfg_rate: 0.7
40
+ estimator: !new:stepvocoder.cosyvoice2.flow.decoder_dit.DiT
41
+ in_channels: 320
42
+ out_channels: 80
43
+ mlp_ratio: 4.0
44
+ depth: 16
45
+ num_heads: 8
46
+ head_dim: 64
47
+ hidden_size: 512
48
+
49
+
50
+ hift: !new:stepvocoder.cosyvoice2.hifigan.generator.HiFTGenerator
51
+ in_channels: 80
52
+ base_channels: 512
53
+ nb_harmonics: 8
54
+ sampling_rate: 24000
55
+ nsf_alpha: 0.1
56
+ nsf_sigma: 0.003
57
+ nsf_voiced_threshold: 10
58
+ upsample_rates: [8, 5, 3]
59
+ upsample_kernel_sizes: [16, 11, 7]
60
+ istft_params:
61
+ n_fft: 16
62
+ hop_len: 4
63
+ resblock_kernel_sizes: [3, 7, 11]
64
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
65
+ source_resblock_kernel_sizes: [7, 7, 11]
66
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
67
+ lrelu_slope: 0.1
68
+ audio_limit: 0.99
69
+ f0_predictor: !new:stepvocoder.cosyvoice2.hifigan.f0_predictor.ConvRNNF0Predictor
70
+ num_class: 1
71
+ in_channels: 80
72
+ cond_channels: 512
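
The vocoder settings above hang together arithmetically, and it can be useful to see how. The following is a small sanity-check sketch (not part of the repository); it only restates numbers that already appear in cosyvoice.yaml.

```python
# Sanity checks for the CosyVoice-300M-25Hz settings above (illustrative only).
sampling_rate = 24000              # mel_conf.sampling_rate
hop_size = 480                     # mel_conf.hop_size
print(sampling_rate / hop_size)    # 50.0 mel frames per second

# HiFT turns each mel frame back into hop_size samples: conv upsampling of
# 8 * 5 * 3, then an iSTFT with hop_len 4.
upsample = 8 * 5 * 3               # hift.upsample_rates
istft_hop = 4                      # hift.istft_params.hop_len
assert upsample * istft_hop == hop_size   # 120 * 4 == 480

# Dual-codebook token vocabulary used by the flow model.
assert 1024 + 4096 + 1 == 5121     # vq02 + vq06 + vq02-pad, per the yaml comment
```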
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/flow.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f18fcb9c374bb8d8ae229e2f7618b6effaa208609bd0407fc661234125531c
3
+ size 615269316
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/hift.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3386cc880324d4e98e05987b99107f49e40ed925b8ecc87c1f4939432d429879
3
+ size 83390254
models/Step-Audio-EditX-bnb-4bit/CosyVoice-300M-25Hz/speech_tokenizer_v1.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56285ddd4a83e883ee0cb9f8d69c1089b53a94b1f78ff7e4a0224a27eb4cb486
3
+ size 522625011
models/Step-Audio-EditX-bnb-4bit/README.md ADDED
@@ -0,0 +1,3 @@
1
+ ---
2
+ license: apache-2.0
3
+ ---
models/Step-Audio-EditX-bnb-4bit/config.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "architectures": [
3
+ "Step1ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_step1.Step1Config",
7
+ "AutoModelForCausalLM": "modeling_step1.Step1ForCausalLM"
8
+ },
9
+ "bos_token_id": 1,
10
+ "eos_token_id": 3,
11
+ "hidden_size": 3072,
12
+ "intermediate_size": 8192,
13
+ "max_seq_len": 32768,
14
+ "model_type": "step1",
15
+ "num_attention_groups": 4,
16
+ "num_attention_heads": 48,
17
+ "num_hidden_layers": 32,
18
+ "pad_token_id": 0,
19
+ "quantization_config": {
20
+ "_load_in_4bit": true,
21
+ "_load_in_8bit": false,
22
+ "bnb_4bit_compute_dtype": "float16",
23
+ "bnb_4bit_quant_storage": "uint8",
24
+ "bnb_4bit_quant_type": "nf4",
25
+ "bnb_4bit_use_double_quant": true,
26
+ "llm_int8_enable_fp32_cpu_offload": false,
27
+ "llm_int8_has_fp16_weight": false,
28
+ "llm_int8_skip_modules": null,
29
+ "llm_int8_threshold": 6.0,
30
+ "load_in_4bit": true,
31
+ "load_in_8bit": false,
32
+ "quant_method": "bitsandbytes"
33
+ },
34
+ "rms_norm_eps": 1e-05,
35
+ "torch_dtype": "float16",
36
+ "transformers_version": "4.53.3",
37
+ "use_cache": true,
38
+ "vocab_size": 74752
39
+ }
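
Since config.json embeds a bitsandbytes `quantization_config` and maps `AutoConfig`/`AutoModelForCausalLM` onto the bundled `configuration_step1.py` and `modeling_step1.py`, the checkpoint should be loadable through transformers' `trust_remote_code` path. A minimal sketch, assuming transformers and bitsandbytes are installed, a CUDA GPU is available, and the local directory path is a placeholder:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "models/Step-Audio-EditX-bnb-4bit"  # placeholder local path

# trust_remote_code is needed because auto_map points at the custom Step1 classes;
# the embedded quantization_config makes transformers load the weights in 4-bit
# NF4 via bitsandbytes.
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    trust_remote_code=True,
    torch_dtype=torch.float16,   # matches "torch_dtype": "float16" above
    device_map="auto",
)
```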
models/Step-Audio-EditX-bnb-4bit/configuration_step1.py ADDED
@@ -0,0 +1,43 @@
1
+ from typing import Optional, List, Any, Dict
2
+ from transformers.configuration_utils import PretrainedConfig
3
+
4
+
5
+
6
+ class Step1Config(PretrainedConfig):
7
+ model_type = "step1"
8
+ keys_to_ignore_at_inference = ["past_key_values"]
9
+
10
+ def __init__(
11
+ self,
12
+ hidden_size: int = 5120,
13
+ intermediate_size: int = 13312,
14
+ num_attention_heads: int = 40,
15
+ num_attention_groups: int = 8,
16
+ num_hidden_layers: int = 48,
17
+ max_seq_len: int = 4096,
18
+ vocab_size: int = 65536,
19
+ rms_norm_eps: float = 1e-5,
20
+ bos_token_id: int = 1,
21
+ eos_token_id: int = 3,
22
+ pad_token_id: int = 0,
23
+ use_cache: bool = True,
24
+ **kwargs,
25
+ ) -> None:
26
+ self.hidden_size = hidden_size
27
+ self.intermediate_size = intermediate_size
28
+ self.num_attention_heads = num_attention_heads
29
+ self.num_attention_groups = num_attention_groups
30
+ self.num_hidden_layers = num_hidden_layers
31
+ self.max_seq_len = max_seq_len
32
+ self.vocab_size = vocab_size
33
+ self.rms_norm_eps = rms_norm_eps
34
+ self.use_cache = use_cache
35
+ super().__init__(
36
+ bos_token_id=bos_token_id,
37
+ pad_token_id=pad_token_id,
38
+ eos_token_id=eos_token_id,
39
+ **kwargs
40
+ )
41
+
42
+
43
+ __all__ = ["Step1Config"]
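
The defaults in `Step1Config` describe a larger Step1 variant; for this checkpoint they are overridden by the values in config.json above. A short illustrative check of the attention layout those values imply (head sizes follow the arithmetic in `modeling_step1.py`):

```python
# Grouped-query attention layout implied by config.json (illustrative only).
hidden_size = 3072
num_attention_heads = 48
num_attention_groups = 4           # shared key/value heads

head_dim = hidden_size // num_attention_heads                         # 64
queries_per_kv_group = num_attention_heads // num_attention_groups   # 12

print(head_dim)               # 64 -- passes the 64/128 check on the flash-attention path
print(queries_per_kv_group)   # 12 query heads share each key/value head
```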
models/Step-Audio-EditX-bnb-4bit/generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 3,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.53.3"
7
+ }
models/Step-Audio-EditX-bnb-4bit/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:535f2dd870385812ce2ef8170634c614f62a1b260d7f0f0028eb19196f4af688
3
+ size 2503238347
models/Step-Audio-EditX-bnb-4bit/modeling_step1.py ADDED
@@ -0,0 +1,414 @@
1
+ import math
2
+ from typing import Optional, Tuple, Union, List
3
+
4
+ import torch
5
+ import torch.utils.checkpoint
6
+ from torch import nn
7
+ from transformers.generation import GenerationMixin
8
+
9
+ from transformers.modeling_utils import PreTrainedModel
10
+ from transformers.utils import logging
11
+ from .configuration_step1 import Step1Config
12
+ from transformers.cache_utils import Cache, DynamicCache
13
+ from einops import rearrange
14
+ from transformers.modeling_outputs import (
15
+ BaseModelOutputWithPast,
16
+ CausalLMOutputWithPast,
17
+ )
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+
22
+ def build_alibi_cache(block_size, n_heads, dtype, device):
23
+ # get slopes
24
+ n = 2 ** math.floor(math.log2(n_heads)) # nearest 2**n to n_heads
25
+ m0 = 2.0 ** (-8.0 / n)
26
+ # 2^(-8/n), 2^(-8*2/n), 2^(-8*3/n), ...
27
+ slopes = torch.pow(m0, torch.arange(1, n + 1))
28
+ if n < n_heads:
29
+ m1 = 2.0 ** (-4.0 / n)
30
+ # 2^(-8/(2n)), 2^(-8*3/(2n)), 2^(-8*5/(2n)), ...
31
+ mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2))
32
+ slopes = torch.cat([slopes, mm])
33
+ slopes = slopes.to(device)
34
+
35
+ tril = torch.tril(torch.ones(1, 1, block_size, block_size, device=device))
36
+
37
+ bias_rows = torch.arange(block_size, device=device).view(1, -1)
38
+ bias_cols = torch.arange(block_size, device=device).view(-1, 1)
39
+ bias = -torch.sqrt(bias_cols - bias_rows)
40
+ bias = bias.view(1, block_size, block_size) * slopes.view(-1, 1, 1)
41
+ bias = bias.masked_fill(tril == 0, float("-inf"))
42
+
43
+ return bias.type(dtype)
44
+
45
+
46
+ class StepRMSNorm(torch.nn.Module):
47
+ def __init__(self, hidden_size, eps=1e-5):
48
+ super().__init__()
49
+ self.weight = torch.nn.Parameter(torch.ones(hidden_size))
50
+ self.eps = eps
51
+
52
+ def forward(self, x: torch.Tensor):
53
+ var = x.float().pow(2).mean(-1, keepdim=True)
54
+ x = x * torch.rsqrt(var + self.eps).to(x.dtype)
55
+ x = x * self.weight
56
+ return x
57
+
58
+
59
+ class StepAttention(torch.nn.Module):
60
+ def __init__(self, hidden_size, num_heads, num_groups, layer_idx: int):
61
+ super().__init__()
62
+
63
+ self.num_heads = num_heads
64
+ self.num_groups = num_groups
65
+ self.hidden_size = hidden_size
66
+ self.head_dim = hidden_size // num_heads
67
+
68
+ self.q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False)
69
+ self.k_proj = torch.nn.Linear(
70
+ hidden_size, num_groups * self.head_dim, bias=False
71
+ )
72
+ self.v_proj = torch.nn.Linear(
73
+ hidden_size, num_groups * self.head_dim, bias=False
74
+ )
75
+ self.o_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False)
76
+
77
+ self.layer_idx = layer_idx
78
+
79
+ def flash_attn_func(self, q, k, v, dropout_p=0.0, softmax_scale=None, causal=True,
80
+ return_attn_probs=False, tp_group_rank=0, tp_group_size=1):
81
+ softmax_scale = q.size(-1) ** (-0.5) if softmax_scale is None else softmax_scale
82
+ return torch.ops.Optimus.fwd(q, k, v, None, dropout_p, softmax_scale, causal, return_attn_probs, None, tp_group_rank, tp_group_size)[0]
83
+
84
+ def forward(
85
+ self,
86
+ x: torch.Tensor,
87
+ past_key_value: Optional[Cache] = None,
88
+ attention_mask: Optional[torch.Tensor] = None,
89
+ cache_position: Optional[torch.LongTensor] = None,
90
+ ):
91
+
92
+ q: torch.Tensor = self.q_proj(x)
93
+ k: torch.Tensor = self.k_proj(x)
94
+ v: torch.Tensor = self.v_proj(x)
95
+ if past_key_value is not None:
96
+ cache_kwargs = {"cache_position": cache_position}
97
+ k, v = past_key_value.update(k, v, self.layer_idx, cache_kwargs)
98
+
99
+ q = rearrange(q, "b s (h d) -> b s h d", h=self.num_heads)
100
+ k = rearrange(k, "b s (g d) -> b s g d", g=self.num_groups)
101
+ v = rearrange(v, "b s (g d) -> b s g d", g=self.num_groups)
102
+
103
+ try:
104
+ if self.head_dim not in (64, 128):
105
+ raise ValueError("head_dim must be 64 or 128")
106
+ attn_output = self.flash_attn_func(q, k, v)
107
+ attn_output = attn_output.flatten(-2, -1)
108
+ except Exception:
109
+ k = k.repeat_interleave(self.num_heads // self.num_groups, dim=-2)
110
+ v = v.repeat_interleave(self.num_heads // self.num_groups, dim=-2)
111
+
112
+ attention_mask = build_alibi_cache(
113
+ k.size(1), self.num_heads, dtype=q.dtype, device=q.device
114
+ )[:, :, -q.size(1) :, :].contiguous()
115
+
116
+ q = q.transpose(1, 2)
117
+ k = k.transpose(1, 2)
118
+ v = v.transpose(1, 2)
119
+
120
+ attn_output: torch.Tensor = torch.nn.functional.scaled_dot_product_attention(
121
+ q, k, v, attn_mask=attention_mask
122
+ )
123
+
124
+ attn_output = attn_output.transpose(1, 2).flatten(-2, -1)
125
+
126
+ out = self.o_proj(attn_output)
127
+ return out, None # attn weights are not returned
128
+
129
+
130
+ class StepMLP(torch.nn.Module):
131
+ def __init__(self, hidden_size, intermediate_size):
132
+ super().__init__()
133
+ self.gate_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
134
+ self.up_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
135
+ self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False)
136
+
137
+ def forward(self, x):
138
+ gate = self.gate_proj(x)
139
+ up = self.up_proj(x)
140
+ x = torch.nn.functional.silu(gate) * up
141
+ x = self.down_proj(x)
142
+ return x
143
+
144
+
145
+ class StepLayer(torch.nn.Module):
146
+ def __init__(self, config: Step1Config, layer_idx: int):
147
+ super().__init__()
148
+ self.layer_idx = layer_idx
149
+ self.self_attn = StepAttention(
150
+ hidden_size=config.hidden_size,
151
+ num_heads=config.num_attention_heads,
152
+ num_groups=config.num_attention_groups,
153
+ layer_idx=layer_idx,
154
+ )
155
+ self.mlp = StepMLP(
156
+ hidden_size=config.hidden_size,
157
+ intermediate_size=config.intermediate_size,
158
+ )
159
+ self.input_layernorm = StepRMSNorm(
160
+ hidden_size=config.hidden_size, eps=config.rms_norm_eps
161
+ )
162
+ self.post_attention_layernorm = StepRMSNorm(
163
+ hidden_size=config.hidden_size, eps=config.rms_norm_eps
164
+ )
165
+
166
+ def forward(
167
+ self,
168
+ hidden_states: torch.Tensor,
169
+ attention_mask: Optional[torch.Tensor] = None,
170
+ past_key_value: Optional[Cache] = None,
171
+ output_attentions: Optional[bool] = False,
172
+ cache_position: Optional[torch.LongTensor] = None,
173
+ ):
174
+ residual = hidden_states
175
+ hidden_states = self.input_layernorm(hidden_states)
176
+ hidden_states, self_attn_weights = self.self_attn(hidden_states, past_key_value, attention_mask, cache_position)
177
+ hidden_states = residual + hidden_states
178
+
179
+ residual = hidden_states
180
+ hidden_states = self.post_attention_layernorm(hidden_states)
181
+ hidden_states = self.mlp(hidden_states)
182
+ hidden_states = residual + hidden_states
183
+
184
+ outputs = (hidden_states, )
185
+ if output_attentions:
186
+ outputs += (self_attn_weights,)
187
+ return outputs
188
+
189
+
190
+ class StepPreTrainedModel(PreTrainedModel):
191
+ config_class = Step1Config
192
+ base_model_prefix = "model"
193
+ supports_gradient_checkpointing = True
194
+ _no_split_modules = ["StepLayer"]
195
+ _skip_keys_device_placement = ["past_key_values"]
196
+ _supports_cache_class = True
197
+ _supports_static_cache = True
198
+
199
+ def _init_weights(self, module):
200
+ std = self.config.initializer_range
201
+ if isinstance(module, nn.Linear):
202
+ module.weight.data.normal_(mean=0.0, std=std)
203
+ if module.bias is not None:
204
+ module.bias.data.zero_()
205
+ elif isinstance(module, nn.Embedding):
206
+ module.weight.data.normal_(mean=0.0, std=std)
207
+ if module.padding_idx is not None:
208
+ module.weight.data[module.padding_idx].zero_()
209
+
210
+
211
+ class Step1Model(StepPreTrainedModel):
212
+ """
213
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`StepLayer`]
214
+
215
+ Args:
216
+ config: Step1Config
217
+ """
218
+
219
+ def __init__(self, config: Step1Config):
220
+ super().__init__(config)
221
+ self.config = config
222
+ self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size)
223
+
224
+ self.layers = torch.nn.Sequential(
225
+ *[
226
+ StepLayer(config, layer_idx)
227
+ for layer_idx in range(config.num_hidden_layers)
228
+ ]
229
+ )
230
+
231
+ self.norm = StepRMSNorm(
232
+ hidden_size=config.hidden_size, eps=config.rms_norm_eps
233
+ )
234
+
235
+ # Initialize weights and apply final processing
236
+ self.post_init()
237
+
238
+ def get_input_embeddings(self):
239
+ return self.embed_tokens
240
+
241
+ def set_input_embeddings(self, value):
242
+ self.embed_tokens = value
243
+
244
+ def forward(
245
+ self,
246
+ input_ids: torch.LongTensor = None,
247
+ attention_mask: Optional[torch.Tensor] = None,
248
+ past_key_values: Optional[Cache] = None,
249
+ inputs_embeds: Optional[torch.FloatTensor] = None,
250
+ use_cache: Optional[bool] = None,
251
+ output_attentions: Optional[bool] = None,
252
+ output_hidden_states: Optional[bool] = None,
253
+ return_dict: Optional[bool] = None,
254
+ cache_position: Optional[torch.LongTensor] = None,
255
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
256
+ output_attentions = (
257
+ output_attentions
258
+ if output_attentions is not None
259
+ else self.config.output_attentions
260
+ )
261
+ output_hidden_states = (
262
+ output_hidden_states
263
+ if output_hidden_states is not None
264
+ else self.config.output_hidden_states
265
+ )
266
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
267
+ return_dict = (
268
+ return_dict if return_dict is not None else self.config.use_return_dict
269
+ )
270
+
271
+ if (input_ids is None) ^ (inputs_embeds is not None):
272
+ raise ValueError(
273
+ "You must specify exactly one of input_ids or inputs_embeds"
274
+ )
275
+
276
+ if inputs_embeds is None:
277
+ inputs_embeds = self.embed_tokens(input_ids)
278
+
279
+ if use_cache and past_key_values is None:
280
+ past_key_values = DynamicCache()
281
+
282
+ if cache_position is None:
283
+ past_seen_tokens = (
284
+ past_key_values.get_seq_length() if past_key_values is not None else 0
285
+ )
286
+ cache_position = torch.arange(
287
+ past_seen_tokens,
288
+ past_seen_tokens + inputs_embeds.shape[1],
289
+ device=inputs_embeds.device,
290
+ )
291
+
292
+ causal_mask = attention_mask
293
+
294
+ hidden_states = inputs_embeds
295
+
296
+ # decoder layers
297
+ all_hidden_states = () if output_hidden_states else None
298
+ all_self_attns = () if output_attentions else None
299
+
300
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
301
+ if output_hidden_states:
302
+ all_hidden_states += (hidden_states,)
303
+
304
+ layer_outputs = decoder_layer(
305
+ hidden_states,
306
+ attention_mask=causal_mask,
307
+ past_key_value=past_key_values,
308
+ cache_position=cache_position,
309
+ output_attentions=output_attentions,
310
+ )
311
+
312
+ hidden_states = layer_outputs[0]
313
+
314
+ if output_attentions:
315
+ all_self_attns += (layer_outputs[1],)
316
+
317
+ hidden_states = self.norm(hidden_states)
318
+
319
+ # add hidden states from the last decoder layer
320
+ if output_hidden_states:
321
+ all_hidden_states += (hidden_states,)
322
+
323
+ output = BaseModelOutputWithPast(
324
+ last_hidden_state=hidden_states,
325
+ past_key_values=past_key_values if use_cache else None,
326
+ hidden_states=all_hidden_states,
327
+ attentions=None,
328
+ )
329
+ return output if return_dict else output.to_tuple()
330
+
331
+
332
+ class Step1ForCausalLM(StepPreTrainedModel, GenerationMixin):
333
+ _tied_weights_keys = ["lm_head.weight"]
334
+
335
+ def __init__(self, config):
336
+ super().__init__(config)
337
+ self.model = Step1Model(config)
338
+ self.vocab_size = config.vocab_size
339
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
340
+
341
+ # Initialize weights and apply final processing
342
+ self.post_init()
343
+
344
+ def get_input_embeddings(self):
345
+ return self.model.embed_tokens
346
+
347
+ def set_input_embeddings(self, value):
348
+ self.model.embed_tokens = value
349
+
350
+ def set_decoder(self, decoder):
351
+ self.model = decoder
352
+
353
+ def get_decoder(self):
354
+ return self.model
355
+
356
+ def forward(
357
+ self,
358
+ input_ids: torch.LongTensor = None,
359
+ attention_mask: Optional[torch.Tensor] = None,
360
+ position_ids: Optional[torch.LongTensor] = None,
361
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
362
+ inputs_embeds: Optional[torch.FloatTensor] = None,
363
+ labels: Optional[torch.LongTensor] = None,
364
+ use_cache: Optional[bool] = None,
365
+ output_attentions: Optional[bool] = None,
366
+ output_hidden_states: Optional[bool] = None,
367
+ return_dict: Optional[bool] = None,
368
+ cache_position: Optional[torch.LongTensor] = None,
369
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
370
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
371
+ output_hidden_states = (
372
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
373
+ )
374
+ return_dict = (
375
+ return_dict if return_dict is not None else self.config.use_return_dict
376
+ )
377
+
378
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
379
+ outputs = self.model(
380
+ input_ids=input_ids,
381
+ attention_mask=attention_mask,
382
+ past_key_values=past_key_values,
383
+ inputs_embeds=inputs_embeds,
384
+ use_cache=use_cache,
385
+ output_attentions=output_attentions,
386
+ output_hidden_states=output_hidden_states,
387
+ return_dict=return_dict,
388
+ cache_position=cache_position,
389
+ )
390
+
391
+ hidden_states = outputs[0]
392
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
393
+
394
+ logits = self.lm_head(hidden_states)
395
+
396
+ loss = None
397
+ if labels is not None:
398
+ loss = self.loss_function(
399
+ logits=logits,
400
+ labels=labels,
401
+ vocab_size=self.config.vocab_size,
402
+ )
403
+
404
+ if not return_dict:
405
+ output = (logits,) + outputs[1:]
406
+ return (loss,) + output if loss is not None else output
407
+
408
+ return CausalLMOutputWithPast(
409
+ loss=loss,
410
+ logits=logits,
411
+ past_key_values=outputs.past_key_values,
412
+ hidden_states=outputs.hidden_states,
413
+ attentions=outputs.attentions,
414
+ )
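
On the fallback attention path above, position information comes from `build_alibi_cache`: an ALiBi-style distance penalty per head combined with the causal mask. A hedged sketch of what that bias looks like; it assumes the function has been pasted into a local session, since the file itself uses a package-relative import and is not importable on its own.

```python
import torch

# Inspect the bias produced by build_alibi_cache (assumes the function above
# has been copied into this session).
bias = build_alibi_cache(block_size=8, n_heads=4, dtype=torch.float32, device="cpu")

print(bias.shape)                     # torch.Size([1, 4, 8, 8]): one slope per head
print(torch.isinf(bias[0, 0, 0, 1]))  # tensor(True): future positions are masked to -inf
print(bias[0, 0, 3, :4])              # penalty grows more negative for more distant keys
```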
models/Step-Audio-EditX-bnb-4bit/quantization_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "bits": 4,
3
+ "compute_dtype": "float16",
4
+ "quantization_method": "bitsandbytes",
5
+ "load_in_4bit": true,
6
+ "load_in_8bit": false,
7
+ "bnb_4bit_compute_dtype": "float16",
8
+ "bnb_4bit_use_double_quant": true,
9
+ "bnb_4bit_quant_type": "nf4"
10
+ }
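
These fields mirror transformers' `BitsAndBytesConfig`, so the 4-bit variant can presumably be re-created from the full-precision Step-Audio-EditX checkpoint with an equivalent config. A hedged sketch (paths are placeholders; bitsandbytes and a CUDA GPU are assumed):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Matches quantization_config.json above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    "models/Step-Audio-EditX",        # placeholder path to the bf16 checkpoint
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)
```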
models/Step-Audio-EditX-bnb-4bit/source.txt ADDED
@@ -0,0 +1 @@
1
+ https://huggingface.co/SUP3RMASS1VE/Step-Audio-EditX-bnb-4bit
models/Step-Audio-EditX-bnb-4bit/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
models/Step-Audio-EditX-bnb-4bit/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/Step-Audio-EditX-bnb-4bit/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25e122d9205d035033a9994c4d46a6a1b467a938654e4178fc0e5f4f5d610674
3
+ size 1264044
models/Step-Audio-EditX-bnb-4bit/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
models/Step-Audio-EditX/.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/Step-Audio-EditX/CosyVoice-300M-25Hz/FLOW_VERSION ADDED
@@ -0,0 +1,2 @@
1
+ /mnt/wby-jfs/models/train/flow_matching/flow_v2_1node_vq0206_dit_v8_fullattn_exp0227_sft_exp0408_stepaudio_sft_exp0616/model_epoch_5_whole.pt
2
+ fae53942e60310eb172b170396202069
models/Step-Audio-EditX/CosyVoice-300M-25Hz/campplus.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
models/Step-Audio-EditX/CosyVoice-300M-25Hz/cosyvoice.yaml ADDED
@@ -0,0 +1,72 @@
1
+ mel_conf:
2
+ num_mels: 80
3
+ n_fft: 1920
4
+ hop_size: 480
5
+ win_size: 1920
6
+ sampling_rate: 24000
7
+ fmin: 0
8
+ fmax: 8000
9
+
10
+
11
+ flow: !new:stepvocoder.cosyvoice2.flow.flow.CausalMaskedDiffWithXvec
12
+ input_size: 512
13
+ output_size: 80
14
+ spk_embed_dim: 192
15
+ output_type: 'mel'
16
+ vocab_size: 5121 # 1024(vq02) + 4096(vq06) + 1(vq02-pad)
17
+ input_embedding: !new:stepvocoder.cosyvoice2.embedding.dual_codebook.DualCodebookEmbedding
18
+ vocab_size: 5121 # 1024(vq02) + 4096(vq06) + 1(vq02-pad)
19
+ input_size: 512
20
+ encoder: !new:stepvocoder.cosyvoice2.transformer.upsample_encoder_v2.UpsampleConformerEncoderV2
21
+ input_size: 512
22
+ output_size: 512
23
+ input_layer: 'linear'
24
+ pre_lookahead_len: 3
25
+ num_blocks: 6
26
+ num_up_blocks: 4
27
+ up_stride: 2
28
+ up_scale_factor: 2
29
+ attention_heads: 8
30
+ pos_enc_layer_type: 'rel_pos_espnet'
31
+ selfattention_layer_type: 'rel_selfattn'
32
+ key_bias: true
33
+ linear_units: 2048
34
+ dropout_rate: 0.1
35
+ positional_dropout_rate: 0.1
36
+ attention_dropout_rate: 0.1
37
+ normalize_before: True
38
+ decoder: !new:stepvocoder.cosyvoice2.flow.flow_matching.CausalConditionalCFM
39
+ inference_cfg_rate: 0.7
40
+ estimator: !new:stepvocoder.cosyvoice2.flow.decoder_dit.DiT
41
+ in_channels: 320
42
+ out_channels: 80
43
+ mlp_ratio: 4.0
44
+ depth: 16
45
+ num_heads: 8
46
+ head_dim: 64
47
+ hidden_size: 512
48
+
49
+
50
+ hift: !new:stepvocoder.cosyvoice2.hifigan.generator.HiFTGenerator
51
+ in_channels: 80
52
+ base_channels: 512
53
+ nb_harmonics: 8
54
+ sampling_rate: 24000
55
+ nsf_alpha: 0.1
56
+ nsf_sigma: 0.003
57
+ nsf_voiced_threshold: 10
58
+ upsample_rates: [8, 5, 3]
59
+ upsample_kernel_sizes: [16, 11, 7]
60
+ istft_params:
61
+ n_fft: 16
62
+ hop_len: 4
63
+ resblock_kernel_sizes: [3, 7, 11]
64
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
65
+ source_resblock_kernel_sizes: [7, 7, 11]
66
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
67
+ lrelu_slope: 0.1
68
+ audio_limit: 0.99
69
+ f0_predictor: !new:stepvocoder.cosyvoice2.hifigan.f0_predictor.ConvRNNF0Predictor
70
+ num_class: 1
71
+ in_channels: 80
72
+ cond_channels: 512
models/Step-Audio-EditX/CosyVoice-300M-25Hz/flow.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f18fcb9c374bb8d8ae229e2f7618b6effaa208609bd0407fc661234125531c
3
+ size 615269316
models/Step-Audio-EditX/CosyVoice-300M-25Hz/hift.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3386cc880324d4e98e05987b99107f49e40ed925b8ecc87c1f4939432d429879
3
+ size 83390254
models/Step-Audio-EditX/CosyVoice-300M-25Hz/speech_tokenizer_v1.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56285ddd4a83e883ee0cb9f8d69c1089b53a94b1f78ff7e4a0224a27eb4cb486
3
+ size 522625011
models/Step-Audio-EditX/README.md ADDED
@@ -0,0 +1,144 @@
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: text-to-speech
4
+ library_name: transformers
5
+ ---
6
+ ## Step-Audio-EditX
7
+
8
+ ✨ [Demo Page](https://stepaudiollm.github.io/step-audio-editx/)&nbsp;
9
+ | 🌟 [GitHub](https://github.com/stepfun-ai/Step-Audio-EditX)&nbsp;
10
+ | 📑 [Paper](https://arxiv.org/abs/2511.03601)&nbsp;
11
+
12
+ Check our open-source repository https://github.com/stepfun-ai/Step-Audio-EditX for more details!
13
+
14
+ ## 🔥🔥🔥 News!!!
15
+ * Nov 28, 2025: 🚀 New Model Release: Now supporting **`Japanese`** and **`Korean`** languages.
16
+ * Nov 23, 2025: 📊 [Step-Audio-Edit-Benchmark](https://github.com/stepfun-ai/Step-Audio-Edit-Benchmark) Released!
17
+ * Nov 19, 2025: ⚙️ We release a **new version** of our model, which **supports polyphonic pronunciation control** and improves the performance of emotion, speaking style, and paralinguistic editing.
18
+
19
+ We are open-sourcing **Step-Audio-EditX**, a powerful **3B-parameter** LLM-based audio model specialized in expressive and **iterative audio editing**.
20
+ It excels at **editing emotion**, **speaking style**, and **paralinguistics**, and also features robust **zero-shot text-to-speech (TTS)** capabilities.
21
+
22
+ ## Features
23
+ - **Zero-Shot TTS**
24
+ - Excellent zero-shot TTS cloning for `Mandarin`, `English`, `Sichuanese`, `Cantonese`, `Japanese` and `Korean`.
25
+ - To use a dialect, just add a **`[Sichuanese]`**, **`[Cantonese]`**, **`[Japanese]`**, or **`[Korean]`** tag before your text.
26
+
27
+ - **Emotion and Speaking Style Editing**
28
+ - Remarkably effective iterative control over emotions and styles, supporting **dozens** of options for editing.
29
+ - Emotion Editing : [ *Angry*, *Happy*, *Sad*, *Excited*, *Fearful*, *Surprised*, *Disgusted*, etc. ]
30
+ - Speaking Style Editing: [ *Act_coy*, *Older*, *Child*, *Whisper*, *Serious*, *Generous*, *Exaggerated*, etc.]
31
+ - Editing with more emotions and more speaking styles is on the way. **Get Ready!** 🚀
32
+
33
+ - **Paralinguistic Editing**:
34
+ - Precise control over 10 types of paralinguistic features for more natural, human-like, and expressive synthetic audio.
35
+ - Supported Tags:
36
+ - [ *Breathing*, *Laughter*, *Suprise-oh*, *Confirmation-en*, *Uhm*, *Suprise-ah*, *Suprise-wa*, *Sigh*, *Question-ei*, *Dissatisfaction-hnn* ]
37
+
38
+ For more examples, see [demo page](https://stepaudiollm.github.io/step-audio-editx/).
39
+
40
+ ## Model Usage
41
+ ### 📜 Requirements
42
+ The following table shows the requirements for running the Step-Audio-EditX model:
43
+
44
+ | Model | Parameters | Setting<br/>(sample frequency) | Optimal GPU Memory |
45
+ |------------|------------|--------------------------------|----------------|
46
+ | Step-Audio-EditX | 3B| 41.6Hz | 32 GB |
47
+
48
+ * An NVIDIA GPU with CUDA support is required.
49
+ * The model is tested on a single L40S GPU.
50
+ * Tested operating system: Linux
51
+
52
+ ### 🔧 Dependencies and Installation
53
+ - Python >= 3.10.0 (we recommend [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html))
54
+ - [PyTorch >= 2.4.1-cu121](https://pytorch.org/)
55
+ - [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads)
56
+
57
+ ```bash
58
+ git clone https://github.com/stepfun-ai/Step-Audio-EditX.git
59
+ conda create -n stepaudioedit python=3.10
60
+ conda activate stepaudioedit
61
+
62
+ cd Step-Audio-EditX
63
+ pip install -r requirements.txt
64
+
65
+ git lfs install
66
+ git clone https://huggingface.co/stepfun-ai/Step-Audio-Tokenizer
67
+ git clone https://huggingface.co/stepfun-ai/Step-Audio-EditX
68
+
69
+ ```
70
+
71
+ After downloading the models, `where_you_download_dir` should have the following structure:
72
+ ```
73
+ where_you_download_dir
74
+ ├── Step-Audio-Tokenizer
75
+ ├── Step-Audio-EditX
76
+ ```
77
+
78
+ #### Run with Docker
79
+
80
+ You can set up the environment required for running Step-Audio-EditX using the provided Dockerfile.
81
+
82
+ ```bash
83
+ # build docker
84
+ docker build . -t step-audio-editx
85
+
86
+ # run docker
87
+ docker run --rm --gpus all \
88
+ -v /your/code/path:/app \
89
+ -v /your/model/path:/model \
90
+ -p 7860:7860 \
91
+ step-audio-editx
92
+ ```
93
+
94
+
95
+ #### Launch Web Demo
96
+ Start a local server for online inference.
97
+ Assume you have one GPU with at least 32GB memory available and have already downloaded all the models.
98
+
99
+ ```bash
100
+ # Step-Audio-EditX demo
101
+ python app.py --model-path where_you_download_dir --model-source local
102
+ ```
103
+
104
+ #### Local Inference Demo
105
+ > [!TIP]
106
+ > For optimal performance, keep audio under 30 seconds per inference.
107
+
108
+ ```bash
109
+ # zero-shot cloning
110
+ python3 tts_infer.py \
111
+ --model-path where_you_download_dir \
112
+ --output-dir ./output \
113
+ --prompt-text "your prompt text" \
114
+ --prompt-audio your_prompt_audio_path \
115
+ --generated-text "your target text" \
116
+ --edit-type "clone"
117
+
118
+ # edit (for paralinguistic editing, you need to specify the generated text)
119
+ python3 tts_infer.py \
120
+ --model-path where_you_download_dir \
121
+ --output-dir ./output \
122
+ --prompt-text "your prompt text" \
123
+ --prompt-audio your_prompt_audio_path \
124
+ --generated-text "" \
125
+ --edit-type "emotion" \
126
+ --edit-info "sad" \
127
+ --n-edit-iter 2
128
+ ```
129
+
130
+
131
+ ## Citation
132
+
133
+ ```
134
+ @misc{yan2025stepaudioeditxtechnicalreport,
135
+ title={Step-Audio-EditX Technical Report},
136
+ author={Chao Yan and Boyong Wu and Peng Yang and Pengfei Tan and Guoqiang Hu and Yuxin Zhang and Xiangyu and Zhang and Fei Tian and Xuerui Yang and Xiangyu Zhang and Daxin Jiang and Gang Yu},
137
+ year={2025},
138
+ eprint={2511.03601},
139
+ archivePrefix={arXiv},
140
+ primaryClass={cs.CL},
141
+ url={https://arxiv.org/abs/2511.03601},
142
+ }
143
+
144
+ ```
models/Step-Audio-EditX/config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "architectures": [
3
+ "Step1ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_step1.Step1Config",
7
+ "AutoModelForCausalLM": "modeling_step1.Step1ForCausalLM"
8
+ },
9
+ "model_type": "step1",
10
+ "bos_token_id": 1,
11
+ "pad_token_id": 0,
12
+ "eos_token_id": 3,
13
+ "hidden_size": 3072,
14
+ "intermediate_size": 8192,
15
+ "num_attention_heads": 48,
16
+ "num_attention_groups": 4,
17
+ "num_hidden_layers": 32,
18
+ "max_seq_len": 32768,
19
+ "vocab_size": 74752,
20
+ "rms_norm_eps": 1e-05,
21
+ "torch_dtype": "bfloat16"
22
+ }
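
The sizes in this config are consistent with the single bf16 shard added below: a parameter count worked out from these fields lands exactly on the `total_size` recorded in model.safetensors.index.json. A small verification sketch (not part of the repository; the key/value width follows the k_proj/v_proj shapes in `modeling_step1.py`):

```python
# Parameter count implied by config.json (Step-Audio-EditX, bf16).
hidden, inter, layers = 3072, 8192, 32
heads, groups, vocab = 48, 4, 74752
head_dim = hidden // heads            # 64
kv_width = groups * head_dim          # 256, per k_proj / v_proj in modeling_step1.py

per_layer = (
    2 * hidden * hidden               # q_proj, o_proj
    + 2 * hidden * kv_width           # k_proj, v_proj
    + 3 * hidden * inter              # gate_proj, up_proj, down_proj
    + 2 * hidden                      # the two RMSNorm weights
)
total = layers * per_layer + 2 * vocab * hidden + hidden  # + embed_tokens, lm_head, final norm

print(total)       # 3529706496 parameters (~3.5B, the "3B" model in the README)
print(total * 2)   # 7059412992 bytes in bf16 == total_size in model.safetensors.index.json
```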
models/Step-Audio-EditX/configuration_step1.py ADDED
@@ -0,0 +1,41 @@
1
+ from typing import Optional, List, Any, Dict
2
+ from transformers.configuration_utils import PretrainedConfig
3
+
4
+
5
+
6
+ class Step1Config(PretrainedConfig):
7
+ model_type = "step1"
8
+ keys_to_ignore_at_inference = ["past_key_values"]
9
+
10
+ def __init__(
11
+ self,
12
+ hidden_size: int = 5120,
13
+ intermediate_size: int = 13312,
14
+ num_attention_heads: int = 40,
15
+ num_attention_groups: int = 8,
16
+ num_hidden_layers: int = 48,
17
+ max_seq_len: int = 4096,
18
+ vocab_size: int = 65536,
19
+ rms_norm_eps: float = 1e-5,
20
+ bos_token_id: int = 1,
21
+ eos_token_id: int = 3,
22
+ pad_token_id: int = 0,
23
+ **kwargs,
24
+ ) -> None:
25
+ self.hidden_size = hidden_size
26
+ self.intermediate_size = intermediate_size
27
+ self.num_attention_heads = num_attention_heads
28
+ self.num_attention_groups = num_attention_groups
29
+ self.num_hidden_layers = num_hidden_layers
30
+ self.max_seq_len = max_seq_len
31
+ self.vocab_size = vocab_size
32
+ self.rms_norm_eps = rms_norm_eps
33
+ super().__init__(
34
+ bos_token_id=bos_token_id,
35
+ pad_token_id=pad_token_id,
36
+ eos_token_id=eos_token_id,
37
+ **kwargs
38
+ )
39
+
40
+
41
+ __all__ = ["Step1Config"]
models/Step-Audio-EditX/model-00001.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b5e7e066a5efbad022a77d688f5c35031448c4edfb9b101e84624a2c593ce75
3
+ size 7059446656
models/Step-Audio-EditX/model.safetensors.index.json ADDED
@@ -0,0 +1 @@
1
+ {"metadata": {"total_size": 7059412992}, "weight_map": {"model.embed_tokens.weight": "model-00001.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.0.input_layernorm.weight": "model-00001.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.1.input_layernorm.weight": "model-00001.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.2.input_layernorm.weight": "model-00001.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.3.input_layernorm.weight": "model-00001.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.3.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.4.input_layernorm.weight": "model-00001.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.5.input_layernorm.weight": "model-00001.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00001.safetensors", 
"model.layers.5.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.6.input_layernorm.weight": "model-00001.safetensors", "model.layers.6.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.6.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.6.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.7.input_layernorm.weight": "model-00001.safetensors", "model.layers.7.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.7.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.7.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.8.input_layernorm.weight": "model-00001.safetensors", "model.layers.8.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.8.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.8.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.9.input_layernorm.weight": "model-00001.safetensors", "model.layers.9.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.9.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.9.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.10.input_layernorm.weight": "model-00001.safetensors", "model.layers.10.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.10.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.10.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.11.input_layernorm.weight": "model-00001.safetensors", "model.layers.11.mlp.down_proj.weight": 
"model-00001.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.11.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.11.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.12.input_layernorm.weight": "model-00001.safetensors", "model.layers.12.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.12.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.12.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.13.input_layernorm.weight": "model-00001.safetensors", "model.layers.13.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.13.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.13.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.14.input_layernorm.weight": "model-00001.safetensors", "model.layers.14.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.14.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.14.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.15.input_layernorm.weight": "model-00001.safetensors", "model.layers.15.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.15.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.15.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.16.input_layernorm.weight": "model-00001.safetensors", "model.layers.16.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.16.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.16.mlp.up_proj.weight": 
"model-00001.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.17.input_layernorm.weight": "model-00001.safetensors", "model.layers.17.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.17.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.17.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.18.input_layernorm.weight": "model-00001.safetensors", "model.layers.18.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.18.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.18.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.19.input_layernorm.weight": "model-00001.safetensors", "model.layers.19.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.19.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.19.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.20.input_layernorm.weight": "model-00001.safetensors", "model.layers.20.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.20.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.20.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.20.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.21.input_layernorm.weight": "model-00001.safetensors", "model.layers.21.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.21.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.21.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.22.input_layernorm.weight": "model-00001.safetensors", "model.layers.22.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.22.self_attn.k_proj.weight": 
"model-00001.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.22.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.22.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.23.input_layernorm.weight": "model-00001.safetensors", "model.layers.23.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.23.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.23.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.24.input_layernorm.weight": "model-00001.safetensors", "model.layers.24.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.24.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.24.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.25.input_layernorm.weight": "model-00001.safetensors", "model.layers.25.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.25.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.25.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.26.input_layernorm.weight": "model-00001.safetensors", "model.layers.26.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.26.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.26.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.27.input_layernorm.weight": "model-00001.safetensors", "model.layers.27.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.27.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.27.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.28.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.28.input_layernorm.weight": "model-00001.safetensors", "model.layers.28.mlp.down_proj.weight": 
"model-00001.safetensors", "model.layers.28.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.28.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.28.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.28.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.28.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.28.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.29.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.29.input_layernorm.weight": "model-00001.safetensors", "model.layers.29.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.29.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.29.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.29.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.29.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.29.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.29.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.30.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.30.input_layernorm.weight": "model-00001.safetensors", "model.layers.30.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.30.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.30.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.30.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.30.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.30.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.30.mlp.up_proj.weight": "model-00001.safetensors", "model.layers.31.self_attn.o_proj.weight": "model-00001.safetensors", "model.layers.31.input_layernorm.weight": "model-00001.safetensors", "model.layers.31.mlp.down_proj.weight": "model-00001.safetensors", "model.layers.31.post_attention_layernorm.weight": "model-00001.safetensors", "model.layers.31.self_attn.q_proj.weight": "model-00001.safetensors", "model.layers.31.self_attn.k_proj.weight": "model-00001.safetensors", "model.layers.31.self_attn.v_proj.weight": "model-00001.safetensors", "model.layers.31.mlp.gate_proj.weight": "model-00001.safetensors", "model.layers.31.mlp.up_proj.weight": "model-00001.safetensors", "model.norm.weight": "model-00001.safetensors", "lm_head.weight": "model-00001.safetensors"}}
models/Step-Audio-EditX/modeling_step1.py ADDED
@@ -0,0 +1,414 @@
1
+ import math
2
+ from typing import Optional, Tuple, Union, List
3
+
4
+ import torch
5
+ import torch.utils.checkpoint
6
+ from torch import nn
7
+ from transformers.generation import GenerationMixin
8
+
9
+ from transformers.modeling_utils import PreTrainedModel
10
+ from transformers.utils import logging
11
+ from .configuration_step1 import Step1Config
12
+ from transformers.cache_utils import Cache, DynamicCache
13
+ from einops import rearrange
14
+ from transformers.modeling_outputs import (
15
+ BaseModelOutputWithPast,
16
+ CausalLMOutputWithPast,
17
+ )
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+
22
+ def build_alibi_cache(block_size, n_heads, dtype, device):
23
+ # get slopes
24
+ n = 2 ** math.floor(math.log2(n_heads)) # nearest 2**n to n_heads
25
+ m0 = 2.0 ** (-8.0 / n)
26
+ # 2^(-8/n), 2^(-8*2/n), 2^(-8*3/n), ...
27
+ slopes = torch.pow(m0, torch.arange(1, n + 1))
28
+ if n < n_heads:
29
+ m1 = 2.0 ** (-4.0 / n)
30
+ # 2^(-8/(2n)), 2^(-8*3/(2n)), 2^(-8*5/(2n)), ...
31
+ mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2))
32
+ slopes = torch.cat([slopes, mm])
33
+ slopes = slopes.to(device)
34
+
35
+ tril = torch.tril(torch.ones(1, 1, block_size, block_size, device=device))
36
+
37
+ bias_rows = torch.arange(block_size, device=device).view(1, -1)
38
+ bias_cols = torch.arange(block_size, device=device).view(-1, 1)
39
+ bias = -torch.sqrt(bias_cols - bias_rows)
40
+ bias = bias.view(1, block_size, block_size) * slopes.view(-1, 1, 1)
41
+ bias = bias.masked_fill(tril == 0, float("-inf"))
42
+
43
+ return bias.type(dtype)
44
+
45
+
46
+ class StepRMSNorm(torch.nn.Module):
47
+ def __init__(self, hidden_size, eps=1e-5):
48
+ super().__init__()
49
+ self.weight = torch.nn.Parameter(torch.ones(hidden_size))
50
+ self.eps = eps
51
+
52
+ def forward(self, x: torch.Tensor):
53
+ var = x.float().pow(2).mean(-1, keepdim=True)
54
+ x = x * torch.rsqrt(var + self.eps).to(x.dtype)
55
+ x = x * self.weight
56
+ return x
57
+
58
+
59
+ class StepAttention(torch.nn.Module):
60
+ def __init__(self, hidden_size, num_heads, num_groups, layer_idx: int):
61
+ super().__init__()
62
+
63
+ self.num_heads = num_heads
64
+ self.num_groups = num_groups
65
+ self.hidden_size = hidden_size
66
+ self.head_dim = hidden_size // num_heads
67
+
68
+ self.q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False)
69
+ self.k_proj = torch.nn.Linear(
70
+ hidden_size, num_groups * self.head_dim, bias=False
71
+ )
72
+ self.v_proj = torch.nn.Linear(
73
+ hidden_size, num_groups * self.head_dim, bias=False
74
+ )
75
+ self.o_proj = torch.nn.Linear(hidden_size, hidden_size, bias=False)
76
+
77
+ self.layer_idx = layer_idx
78
+
79
+ def flash_attn_func(self, q, k, v, dropout_p=0.0, softmax_scale=None, causal=True,
80
+ return_attn_probs=False, tp_group_rank=0, tp_group_size=1):
81
+ softmax_scale = q.size(-1) ** (-0.5) if softmax_scale is None else softmax_scale
82
+ return torch.ops.Optimus.fwd(q, k, v, None, dropout_p, softmax_scale, causal, return_attn_probs, None, tp_group_rank, tp_group_size)[0]
83
+
84
+ def forward(
85
+ self,
86
+ x: torch.Tensor,
87
+ past_key_value: Optional[Cache] = None,
88
+ attention_mask: Optional[torch.Tensor] = None,
89
+ cache_position: Optional[torch.LongTensor] = None,
90
+ ):
91
+
92
+ q: torch.Tensor = self.q_proj(x)
93
+ k: torch.Tensor = self.k_proj(x)
94
+ v: torch.Tensor = self.v_proj(x)
95
+ if past_key_value is not None:
96
+ cache_kwargs = {"cache_position": cache_position}
97
+ k, v = past_key_value.update(k, v, self.layer_idx, cache_kwargs)
98
+
99
+ q = rearrange(q, "b s (h d) -> b s h d", h=self.num_heads)
100
+ k = rearrange(k, "b s (g d) -> b s g d", g=self.num_groups)
101
+ v = rearrange(v, "b s (g d) -> b s g d", g=self.num_groups)
102
+
103
+ try:
104
+ if self.head_dim not in (64, 128):
105
+ raise ValueError("head_dim must be 64 or 128")
106
+ attn_output = self.flash_attn_func(q, k, v)
107
+ attn_output = attn_output.flatten(-2, -1)
108
+ except:
109
+ k = k.repeat_interleave(self.num_heads // self.num_groups, dim=-2)
110
+ v = v.repeat_interleave(self.num_heads // self.num_groups, dim=-2)
111
+
112
+ attention_mask = build_alibi_cache(
113
+ k.size(1), self.num_heads, dtype=q.dtype, device=q.device
114
+ )[:, :, -q.size(1) :, :].contiguous()
115
+
116
+ q = q.transpose(1, 2)
117
+ k = k.transpose(1, 2)
118
+ v = v.transpose(1, 2)
119
+
120
+ attn_output: torch.Tensor = torch.nn.functional.scaled_dot_product_attention(
121
+ q, k, v, attn_mask=attention_mask
122
+ )
123
+
124
+ attn_output = attn_output.transpose(1, 2).flatten(-2, -1)
125
+
126
+ out = self.o_proj(attn_output)
127
+ return out, None # attn weights are not returned
128
+
129
+
130
+ class StepMLP(torch.nn.Module):
131
+ def __init__(self, hidden_size, intermediate_size):
132
+ super().__init__()
133
+ self.gate_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
134
+ self.up_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
135
+ self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False)
136
+
137
+ def forward(self, x):
138
+ gate = self.gate_proj(x)
139
+ up = self.up_proj(x)
140
+ x = torch.nn.functional.silu(gate) * up
141
+ x = self.down_proj(x)
142
+ return x
143
+
144
+
145
+ class StepLayer(torch.nn.Module):
146
+ def __init__(self, config: Step1Config, layer_idx: int):
147
+ super().__init__()
148
+ self.layer_idx = layer_idx
149
+ self.self_attn = StepAttention(
150
+ hidden_size=config.hidden_size,
151
+ num_heads=config.num_attention_heads,
152
+ num_groups=config.num_attention_groups,
153
+ layer_idx=layer_idx,
154
+ )
155
+ self.mlp = StepMLP(
156
+ hidden_size=config.hidden_size,
157
+ intermediate_size=config.intermediate_size,
158
+ )
159
+ self.input_layernorm = StepRMSNorm(
160
+ hidden_size=config.hidden_size, eps=config.rms_norm_eps
161
+ )
162
+ self.post_attention_layernorm = StepRMSNorm(
163
+ hidden_size=config.hidden_size, eps=config.rms_norm_eps
164
+ )
165
+
166
+ def forward(
167
+ self,
168
+ hidden_states: torch.Tensor,
169
+ attention_mask: Optional[torch.Tensor] = None,
170
+ past_key_value: Optional[Cache] = None,
171
+ output_attentions: Optional[bool] = False,
172
+ cache_position: Optional[torch.LongTensor] = None,
173
+ ):
174
+ residual = hidden_states
175
+ hidden_states = self.input_layernorm(hidden_states)
176
+ hidden_states, self_attn_weights = self.self_attn(hidden_states, past_key_value, attention_mask, cache_position)
177
+ hidden_states = residual + hidden_states
178
+
179
+ residual = hidden_states
180
+ hidden_states = self.post_attention_layernorm(hidden_states)
181
+ hidden_states = self.mlp(hidden_states)
182
+ hidden_states = residual + hidden_states
183
+
184
+ outputs = (hidden_states, )
185
+ if output_attentions:
186
+ outputs += (self_attn_weights,)
187
+ return outputs
188
+
189
+
190
+ class StepPreTrainedModel(PreTrainedModel):
191
+ config_class = Step1Config
192
+ base_model_prefix = "model"
193
+ supports_gradient_checkpointing = True
194
+ _no_split_modules = ["StepLayer"]
195
+ _skip_keys_device_placement = ["past_key_values"]
196
+ _supports_cache_class = True
197
+ _supports_static_cache = True
198
+
199
+ def _init_weights(self, module):
200
+ std = self.config.initializer_range
201
+ if isinstance(module, nn.Linear):
202
+ module.weight.data.normal_(mean=0.0, std=std)
203
+ if module.bias is not None:
204
+ module.bias.data.zero_()
205
+ elif isinstance(module, nn.Embedding):
206
+ module.weight.data.normal_(mean=0.0, std=std)
207
+ if module.padding_idx is not None:
208
+ module.weight.data[module.padding_idx].zero_()
209
+
210
+
211
+ class Step1Model(StepPreTrainedModel):
212
+ """
213
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
214
+
215
+ Args:
216
+ config: Step1Config
217
+ """
218
+
219
+ def __init__(self, config: Step1Config):
220
+ super().__init__(config)
221
+ self.config = config
222
+ self.embed_tokens = torch.nn.Embedding(config.vocab_size, config.hidden_size)
223
+
224
+ self.layers = torch.nn.Sequential(
225
+ *[
226
+ StepLayer(config, layer_idx)
227
+ for layer_idx in range(config.num_hidden_layers)
228
+ ]
229
+ )
230
+
231
+ self.norm = StepRMSNorm(
232
+ hidden_size=config.hidden_size, eps=config.rms_norm_eps
233
+ )
234
+
235
+ # Initialize weights and apply final processing
236
+ self.post_init()
237
+
238
+ def get_input_embeddings(self):
239
+ return self.embed_tokens
240
+
241
+ def set_input_embeddings(self, value):
242
+ self.embed_tokens = value
243
+
244
+ def forward(
245
+ self,
246
+ input_ids: torch.LongTensor = None,
247
+ attention_mask: Optional[torch.Tensor] = None,
248
+ past_key_values: Optional[Cache] = None,
249
+ inputs_embeds: Optional[torch.FloatTensor] = None,
250
+ use_cache: Optional[bool] = None,
251
+ output_attentions: Optional[bool] = None,
252
+ output_hidden_states: Optional[bool] = None,
253
+ return_dict: Optional[bool] = None,
254
+ cache_position: Optional[torch.LongTensor] = None,
255
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
256
+ output_attentions = (
257
+ output_attentions
258
+ if output_attentions is not None
259
+ else self.config.output_attentions
260
+ )
261
+ output_hidden_states = (
262
+ output_hidden_states
263
+ if output_hidden_states is not None
264
+ else self.config.output_hidden_states
265
+ )
266
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
267
+ return_dict = (
268
+ return_dict if return_dict is not None else self.config.use_return_dict
269
+ )
270
+
271
+ if (input_ids is None) ^ (inputs_embeds is not None):
272
+ raise ValueError(
273
+ "You must specify exactly one of input_ids or inputs_embeds"
274
+ )
275
+
276
+ if inputs_embeds is None:
277
+ inputs_embeds = self.embed_tokens(input_ids)
278
+
279
+ if use_cache and past_key_values is None:
280
+ past_key_values = DynamicCache()
281
+
282
+ if cache_position is None:
283
+ past_seen_tokens = (
284
+ past_key_values.get_seq_length() if past_key_values is not None else 0
285
+ )
286
+ cache_position = torch.arange(
287
+ past_seen_tokens,
288
+ past_seen_tokens + inputs_embeds.shape[1],
289
+ device=inputs_embeds.device,
290
+ )
291
+
292
+ causal_mask = attention_mask
293
+
294
+ hidden_states = inputs_embeds
295
+
296
+ # decoder layers
297
+ all_hidden_states = () if output_hidden_states else None
298
+ all_self_attns = () if output_attentions else None
299
+
300
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
301
+ if output_hidden_states:
302
+ all_hidden_states += (hidden_states,)
303
+
304
+ layer_outputs = decoder_layer(
305
+ hidden_states,
306
+ attention_mask=causal_mask,
307
+ past_key_value=past_key_values,
308
+ cache_position=cache_position,
309
+ output_attentions=output_attentions,
310
+ )
311
+
312
+ hidden_states = layer_outputs[0]
313
+
314
+ if output_attentions:
315
+ all_self_attns += (layer_outputs[1],)
316
+
317
+ hidden_states = self.norm(hidden_states)
318
+
319
+ # add hidden states from the last decoder layer
320
+ if output_hidden_states:
321
+ all_hidden_states += (hidden_states,)
322
+
323
+ output = BaseModelOutputWithPast(
324
+ last_hidden_state=hidden_states,
325
+ past_key_values=past_key_values if use_cache else None,
326
+ hidden_states=all_hidden_states,
327
+ attentions=None,
328
+ )
329
+ return output if return_dict else output.to_tuple()
330
+
331
+
332
+ class Step1ForCausalLM(StepPreTrainedModel, GenerationMixin):
333
+ _tied_weights_keys = ["lm_head.weight"]
334
+
335
+ def __init__(self, config):
336
+ super().__init__(config)
337
+ self.model = Step1Model(config)
338
+ self.vocab_size = config.vocab_size
339
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
340
+
341
+ # Initialize weights and apply final processing
342
+ self.post_init()
343
+
344
+ def get_input_embeddings(self):
345
+ return self.model.embed_tokens
346
+
347
+ def set_input_embeddings(self, value):
348
+ self.model.embed_tokens = value
349
+
350
+ def set_decoder(self, decoder):
351
+ self.model = decoder
352
+
353
+ def get_decoder(self):
354
+ return self.model
355
+
356
+ def forward(
357
+ self,
358
+ input_ids: torch.LongTensor = None,
359
+ attention_mask: Optional[torch.Tensor] = None,
360
+ position_ids: Optional[torch.LongTensor] = None,
361
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
362
+ inputs_embeds: Optional[torch.FloatTensor] = None,
363
+ labels: Optional[torch.LongTensor] = None,
364
+ use_cache: Optional[bool] = None,
365
+ output_attentions: Optional[bool] = None,
366
+ output_hidden_states: Optional[bool] = None,
367
+ return_dict: Optional[bool] = None,
368
+ cache_position: Optional[torch.LongTensor] = None,
369
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
370
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
371
+ output_hidden_states = (
372
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
373
+ )
374
+ return_dict = (
375
+ return_dict if return_dict is not None else self.config.use_return_dict
376
+ )
377
+
378
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
379
+ outputs = self.model(
380
+ input_ids=input_ids,
381
+ attention_mask=attention_mask,
382
+ past_key_values=past_key_values,
383
+ inputs_embeds=inputs_embeds,
384
+ use_cache=use_cache,
385
+ output_attentions=output_attentions,
386
+ output_hidden_states=output_hidden_states,
387
+ return_dict=return_dict,
388
+ cache_position=cache_position,
389
+ )
390
+
391
+ hidden_states = outputs[0]
392
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
393
+
394
+ logits = self.lm_head(hidden_states)
395
+
396
+ loss = None
397
+ if labels is not None:
398
+ loss = self.loss_function(
399
+ logits=logits,
400
+ labels=labels,
401
+ vocab_size=self.config.vocab_size,
402
+ )
403
+
404
+ if not return_dict:
405
+ output = (logits,) + outputs[1:]
406
+ return (loss,) + output if loss is not None else output
407
+
408
+ return CausalLMOutputWithPast(
409
+ loss=loss,
410
+ logits=logits,
411
+ past_key_values=outputs.past_key_values,
412
+ hidden_states=outputs.hidden_states,
413
+ attentions=outputs.attentions,
414
+ )
models/Step-Audio-EditX/source.txt ADDED
@@ -0,0 +1 @@
1
+ https://huggingface.co/stepfun-ai/Step-Audio-EditX
models/Step-Audio-EditX/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25e122d9205d035033a9994c4d46a6a1b467a938654e4178fc0e5f4f5d610674
3
+ size 1264044
models/Step-Audio-EditX/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": false,
4
+ "eos_token": "</s>",
5
+ "legacy": false,
6
+ "model_max_length": 65536,
7
+ "pad_token": "<unk>",
8
+ "padding_side": "left",
9
+ "sp_model_kwargs": {},
10
+ "tokenizer_class": "LlamaTokenizer",
11
+ "unk_token": "<unk>",
12
+ "use_default_system_prompt": false
13
+ }
14
+
models/Step-Audio-R1-NVFP4A16/.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
models/Step-Audio-R1-NVFP4A16/README.md ADDED
@@ -0,0 +1,216 @@
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: audio-text-to-text
4
+ library_name: transformers
5
+ tags:
6
+ - audio-reasoning
7
+ - chain-of-thought
8
+ - multi-modal
9
+ - step-audio-r1
10
+ ---
11
+ ## Step-Audio-R1-NVFP4A16 (Quantized)
12
+
13
+ This is a **quantized version** of Step-Audio-R1 using NVFP4A16 quantization via [LLM Compressor](https://github.com/vllm-project/llm-compressor).
14
+
15
+ ### Quantization Details
16
+
17
+ - **Scheme**: NVFP4A16 (FP4 weights with FP16 activations)
18
+ - **Target layers**: All Linear layers (except `encoder`, `adapter`, `lm_head`)
19
+ - **Group size**: 16
20
+ - **Method**: Post-Training Quantization (PTQ)
21
+
22
+ ### Quantization Code
23
+ ```python
24
+ from transformers import AutoModelForCausalLM, AutoTokenizer
25
+ from llmcompressor import oneshot
26
+ from llmcompressor.modifiers.quantization import QuantizationModifier
27
+ from llmcompressor.utils import dispatch_for_generation
28
+
29
+ MODEL_ID = "stepfun-ai/Step-Audio-R1"
30
+
31
+ # Load model
32
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
33
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
34
+
35
+ # Configure the quantization algorithm and scheme
36
+ # Quantize weights to FP4 with a group size of 16 via PTQ
37
+ recipe = QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head", "re:encoder.*", "re:adapter.*"])
38
+
39
+ # Apply quantization
40
+ oneshot(model=model, recipe=recipe)
41
+
42
+ # Save to disk in compressed-tensors format
43
+ SAVE_DIR = "Step-Audio-R1-NVFP4A16"
44
+ model.save_pretrained(SAVE_DIR, save_compressed=True)
45
+ tokenizer.save_pretrained(SAVE_DIR)
46
+ ```
47
+
48
+
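+ The saved checkpoint records the scheme in `config.json` under `quantization_config` (compressed-tensors format), which is what downstream loaders typically read to detect the quantization. A minimal sketch for double-checking what was written, assuming the `SAVE_DIR` from above:
+
+ ```python
+ import json
+ import os
+
+ SAVE_DIR = "Step-Audio-R1-NVFP4A16"
+
+ # Read back the quantization metadata written by save_pretrained(save_compressed=True)
+ with open(os.path.join(SAVE_DIR, "config.json")) as f:
+     qcfg = json.load(f)["quantization_config"]
+
+ print(qcfg["format"])                                             # nvfp4-pack-quantized
+ print(qcfg["config_groups"]["group_0"]["weights"]["group_size"])  # 16
+ ```
+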
49
+ ## Step-Audio-R1
50
+
51
+ ✨ [Demo Page](https://stepaudiollm.github.io/step-audio-r1/)&nbsp;
52
+ | 🎮 [Playground](https://huggingface.co/spaces/stepfun-ai/Step-Audio-R1)&nbsp;
53
+ | 🌟 [GitHub](https://github.com/stepfun-ai/Step-Audio-R1)&nbsp;
54
+ | 📑 [Paper](https://arxiv.org/abs/2511.15848)&nbsp;
55
+
56
+ Step-Audio-R1 is the **first audio language model to successfully unlock Chain-of-Thought (CoT) reasoning**.
57
+ It decisively solves the "inverted scaling" problem that plagues existing models, where performance degrades
58
+ with longer reasoning. Step-Audio-R1 is the first model to demonstrate that in audio, as in text and vision,
59
+ allocating more compute at test time predictably improves performance.
60
+
61
+ We found the root cause of this anomaly: models were engaging in **textual surrogate reasoning**
62
+ (analyzing transcripts, not audio) due to a modality mismatch. To solve this, we introduce
63
+ **Modality-Grounded Reasoning Distillation (MGRD)**, an iterative training framework that shifts the model's
64
+ reasoning from textual abstractions to acoustic properties.
65
+
66
+ This new approach allows us to create **Step-Audio-R1**, which:
67
+ - Is the **first audio reasoning model** that successfully benefits from test-time compute scaling.
68
+ - Surpasses **Gemini 2.5 Pro** and is comparable to **Gemini 3** across major audio reasoning tasks.
69
+ - Transforms extended deliberation from a liability into a **powerful asset** for audio intelligence.
70
+
71
+ ## Features
72
+ - **Chain-of-Thought (CoT) Reasoning**
73
+ - First audio language model to successfully unlock Chain-of-Thought reasoning capabilities.
74
+ - Generates audio-relevant reasoning chains that genuinely ground themselves in acoustic features.
75
+
76
+ - **Modality-Grounded Reasoning Distillation (MGRD)**
77
+ - Innovative iterative training framework that shifts reasoning from textual abstractions to acoustic properties.
78
+ - Solves the modality mismatch problem that caused textual surrogate reasoning in previous models.
79
+
80
+ - **Superior Performance**
81
+ - Surpasses **Gemini 2.5 Pro** across comprehensive audio understanding and reasoning benchmarks.
82
+ - Comparable to **Gemini 3** across major audio reasoning tasks.
83
+ - Surpasses **Qwen3** in textual reasoning.
84
+ - Covers speech, environmental sounds, and music domains.
85
+
86
+
87
+ For more examples, see the [demo page](https://stepaudiollm.github.io/step-audio-r1/).
88
+
89
+ ## Model Usage
90
+ ### 📜 Requirements
91
+ - **GPU**: NVIDIA GPUs with CUDA support (tested on 4×L40S/H100/H800/H20).
92
+ - **Operating System**: Linux.
93
+ - **Python**: >= 3.10.0.
94
+
95
+ ### ⬇️ Download Model
96
+ First, you need to download the Step-Audio-R1 model weights.
97
+
98
+ **Method A · Git LFS**
99
+ ```bash
100
+ git lfs install
101
+ git clone https://huggingface.co/stepfun-ai/Step-Audio-R1
102
+ ```
103
+
104
+ **Method B · Hugging Face CLI**
105
+ ```bash
106
+ hf download stepfun-ai/Step-Audio-R1 --local-dir ./Step-Audio-R1
107
+ ```
108
+
109
+ ### 🚀 Deployment and Execution
110
+ We provide two ways to serve the model: Docker (recommended) or compiling the customized vLLM backend.
111
+
112
+ #### 🐳 Method 1 · Run with Docker (Recommended)
113
+
114
+ A customized vLLM image is required.
115
+
116
+ 1. **Pull the image**:
117
+ ```bash
118
+ docker pull stepfun2025/vllm:step-audio-2-v20250909
119
+ ```
120
+ 2. **Start the service**:
121
+ Assuming the model is downloaded in the `Step-Audio-R1` folder in the current directory.
122
+
123
+ ```bash
124
+ docker run --rm -ti --gpus all \
125
+ -v $(pwd)/Step-Audio-R1:/Step-Audio-R1 \
126
+ -p 9999:9999 \
127
+ stepfun2025/vllm:step-audio-2-v20250909 \
128
+ -- vllm serve /Step-Audio-R1 \
129
+ --served-model-name Step-Audio-R1 \
130
+ --port 9999 \
131
+ --max-model-len 16384 \
132
+ --max-num-seqs 32 \
133
+ --tensor-parallel-size 4 \
134
+ --chat-template '{%- macro render_content(content) -%}{%- if content is string -%}{{- content.replace("<audio_patch>\n", "<audio_patch>") -}}{%- elif content is mapping -%}{{- content['"'"'value'"'"'] if '"'"'value'"'"' in content else content['"'"'text'"'"'] -}}{%- elif content is iterable -%}{%- for item in content -%}{%- if item.type == '"'"'text'"'"' -%}{{- item['"'"'value'"'"'] if '"'"'value'"'"' in item else item['"'"'text'"'"'] -}}{%- elif item.type == '"'"'audio'"'"' -%}<audio_patch>{%- endif -%}{%- endfor -%}{%- endif -%}{%- endmacro -%}{%- if tools -%}{{- '"'"'<|BOT|>system\n'"'"' -}}{%- if messages[0]['"'"'role'"'"'] == '"'"'system'"'"' -%}{{- render_content(messages[0]['"'"'content'"'"']) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{{- '"'"'<|BOT|>tool_json_schemas\n'"'"' + tools|tojson + '"'"'<|EOT|>'"'"' -}}{%- else -%}{%- if messages[0]['"'"'role'"'"'] == '"'"'system'"'"' -%}{{- '"'"'<|BOT|>system\n'"'"' + render_content(messages[0]['"'"'content'"'"']) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message["role"] == "user" -%}{{- '"'"'<|BOT|>human\n'"'"' + render_content(message["content"]) + '"'"'<|EOT|>'"'"' -}}{%- elif message["role"] == "assistant" -%}{{- '"'"'<|BOT|>assistant\n'"'"' + (render_content(message["content"]) if message["content"] else '"'"''"'"') -}}{%- set is_last_assistant = true -%}{%- for m in messages[loop.index:] -%}{%- if m["role"] == "assistant" -%}{%- set is_last_assistant = false -%}{%- endif -%}{%- endfor -%}{%- if not is_last_assistant -%}{{- '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- elif message["role"] == "function_output" -%}{%- else -%}{%- if not (loop.first and message["role"] == "system") -%}{{- '"'"'<|BOT|>'"'"' + message["role"] + '"'"'\n'"'"' + render_content(message["content"]) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- '"'"'<|BOT|>assistant\n<think>\n'"'"' -}}{%- endif -%}' \
135
+ --enable-log-requests \
136
+ --interleave-mm-strings \
137
+ --trust-remote-code
138
+ ```
139
+ After the service starts, it will listen on `localhost:9999`.
140
+
141
+ #### 🐳 Method 2 · Run from Source (Compile vLLM)
142
+ Step-Audio-R1 requires a customized vLLM backend.
143
+
144
+ 1. **Download Source Code**:
145
+ ```bash
146
+ git clone https://github.com/stepfun-ai/vllm.git
147
+ cd vllm
148
+ ```
149
+
150
+ 2. **Prepare Environment**:
151
+ ```bash
152
+ python3 -m venv .venv
153
+ source .venv/bin/activate
154
+ ```
155
+
156
+ 3. **Install and Compile**:
157
+ vLLM contains both C++ and Python code. Our changes are mainly in the Python code, so the pre-compiled C++ extensions can be used to speed up installation.
158
+
159
+ ```bash
160
+ # Use pre-compiled C++ extensions (Recommended)
161
+ VLLM_USE_PRECOMPILED=1 pip install -e .
162
+ ```
163
+
164
+ 4. **Switch Branch**:
165
+ After compilation, switch to the branch that supports Step-Audio.
166
+ ```bash
167
+ git checkout step-audio-2-mini
168
+ ```
169
+
170
+ 5. **Start the Service**:
171
+ ```bash
172
+ # Ensure you are in the vllm directory and the virtual environment is activated
173
+ source .venv/bin/activate
174
+
175
+ python3 -m vllm.entrypoints.openai.api_server \
176
+ --model ../Step-Audio-R1 \
177
+ --served-model-name Step-Audio-R1 \
178
+ --port 9999 \
179
+ --host 0.0.0.0 \
180
+ --max-model-len 65536 \
181
+ --max-num-seqs 128 \
182
+ --tensor-parallel-size 4 \
183
+ --gpu-memory-utilization 0.85 \
184
+ --trust-remote-code \
185
+ --enable-log-requests \
186
+ --interleave-mm-strings \
187
+ --chat-template '{%- macro render_content(content) -%}{%- if content is string -%}{{- content.replace("<audio_patch>\n", "<audio_patch>") -}}{%- elif content is mapping -%}{{- content['"'"'value'"'"'] if '"'"'value'"'"' in content else content['"'"'text'"'"'] -}}{%- elif content is iterable -%}{%- for item in content -%}{%- if item.type == '"'"'text'"'"' -%}{{- item['"'"'value'"'"'] if '"'"'value'"'"' in item else item['"'"'text'"'"'] -}}{%- elif item.type == '"'"'audio'"'"' -%}<audio_patch>{%- endif -%}{%- endfor -%}{%- endif -%}{%- endmacro -%}{%- if tools -%}{{- '"'"'<|BOT|>system\n'"'"' -}}{%- if messages[0]['"'"'role'"'"'] == '"'"'system'"'"' -%}{{- render_content(messages[0]['"'"'content'"'"']) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{{- '"'"'<|BOT|>tool_json_schemas\n'"'"' + tools|tojson + '"'"'<|EOT|>'"'"' -}}{%- else -%}{%- if messages[0]['"'"'role'"'"'] == '"'"'system'"'"' -%}{{- '"'"'<|BOT|>system\n'"'"' + render_content(messages[0]['"'"'content'"'"']) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message["role"] == "user" -%}{{- '"'"'<|BOT|>human\n'"'"' + render_content(message["content"]) + '"'"'<|EOT|>'"'"' -}}{%- elif message["role"] == "assistant" -%}{{- '"'"'<|BOT|>assistant\n'"'"' + (render_content(message["content"]) if message["content"] else '"'"''"'"') -}}{%- set is_last_assistant = true -%}{%- for m in messages[loop.index:] -%}{%- if m["role"] == "assistant" -%}{%- set is_last_assistant = false -%}{%- endif -%}{%- endfor -%}{%- if not is_last_assistant -%}{{- '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- elif message["role"] == "function_output" -%}{%- else -%}{%- if not (loop.first and message["role"] == "system") -%}{{- '"'"'<|BOT|>'"'"' + message["role"] + '"'"'\n'"'"' + render_content(message["content"]) + '"'"'<|EOT|>'"'"' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- '"'"'<|BOT|>assistant\n<think>\n'"'"' -}}{%- endif -%}'
188
+ ```
189
+
190
+ After the service starts, it will listen on `localhost:9999`.
191
+
192
+
193
+ ### 🧪 Client Examples
194
+
195
+ Get the example code and run it:
196
+ ```bash
197
+ # Clone the repository containing example scripts
198
+ git clone https://github.com/stepfun-ai/Step-Audio-R1.git r1-scripts
199
+
200
+ # Run the example
201
+ cd r1-scripts
202
+ python examples-vllm_r1.py
203
+ ```
204
+
205
+
206
+ ## Citation
207
+
208
+ ```
209
+ @article{tian2025step,
210
+ title={Step-Audio-R1 Technical Report},
211
+ author={Tian, Fei and Zhang, Xiangyu Tony and Zhang, Yuxin and Zhang, Haoyang and Li, Yuxin and Liu, Daijiao and Deng, Yayue and Wu, Donghang and Chen, Jun and Zhao, Liang and others},
212
+ journal={arXiv preprint arXiv:2511.15848},
213
+ year={2025}
214
+ }
215
+
216
+ ```
models/Step-Audio-R1-NVFP4A16/added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
models/Step-Audio-R1-NVFP4A16/chat_template.jinja ADDED
@@ -0,0 +1,74 @@
1
+ {%- if tools %}
2
+ {{- '<|BOT|>system
3
+ ' }}
4
+ {%- if messages[0]['role'] == 'system' %}
5
+ {{- messages[0]['content'] + '<|EOT|>' }}
6
+ {%- else %}
7
+ {{- 'You are a helpful assistant. Please think step by step and provide your reasoning process within <think> </think> tags, followed by your final answer. Format: <think>your reasoning here</think>your final answer<|EOT|>' }}
8
+ {%- endif %}
9
+ {{- '<|BOT|>' }}
10
+ {{- "tool_json_schemas
11
+ " }}
12
+ {{- tools | tojson }}
13
+ {{- '<|EOT|>' }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|BOT|>system
17
+ ' + messages[0]['content'] + '<|EOT|>' }}
18
+ {%- else %}
19
+ {{- '<|BOT|>system
20
+ You are a helpful assistant. Please think step by step and provide your reasoning process within <think> </think> tags, followed by your final answer. Format: <think>your reasoning here</think>your final answer<|EOT|>' }}
21
+ {%- endif %}
22
+ {%- endif %}
23
+ {%- for message in messages %}
24
+ {%- if message["role"] == "user" %}
25
+ {{- '<|BOT|>human
26
+ ' + message["content"] + '<|EOT|>' }}
27
+ {%- elif (message["role"] == "system" and not loop.first) or (message["role"] == "assistant" and not message["tool_calls"]) %}
28
+ {{- '<|BOT|>' + message["role"] + '
29
+ ' + message["content"] + '<|EOT|>' }}
30
+ {%- elif message["role"] == "assistant" %}
31
+ {{- '<|BOT|>' + message["role"] + '
32
+ ' }}
33
+ {%- if message["content"] %}
34
+ {{- message["content"] }}
35
+ {%- endif %}
36
+ {%- for tool_call in message.tool_calls %}
37
+ {%- if tool_call["function"] is defined %}
38
+ {%- set tool_call = tool_call["function"] %}
39
+ {%- endif %}
40
+ {{- '<|CALL_START|>' + 'function
41
+ ' + tool_call["name"] + '
42
+ ' }}
43
+ {{- tool_call["arguments"] | tojson }}
44
+ {{- '<|CALL_END|>' }}
45
+ {%- endfor %}
46
+ {{- '<|EOT|>' }}
47
+ {%- elif message["role"] == "tool" %}
48
+ {{- '<|BOT|>' }}
49
+ {%- set ns = namespace(function_name="tool") %}
50
+ {%- if message["tool_call_id"] %}
51
+ {%- for prev_msg in messages %}
52
+ {%- if prev_msg["role"] == "assistant" and prev_msg["tool_calls"] %}
53
+ {%- for tool_call in prev_msg["tool_calls"] %}
54
+ {%- if tool_call["id"] == message["tool_call_id"] %}
55
+ {%- if tool_call["function"] is defined %}
56
+ {%- set ns.function_name = tool_call["function"]["name"] %}
57
+ {%- endif %}
58
+ {%- endif %}
59
+ {%- endfor %}
60
+ {%- endif %}
61
+ {%- endfor %}
62
+ {%- endif %}
63
+ {{- 'function_output
64
+ ' + ns.function_name + '
65
+ ' }}
66
+ {{- message["content"] }}
67
+ {{- '<|EOT|>' }}
68
+ {%- endif %}
69
+ {%- endfor %}
70
+ {%- if add_generation_prompt %}
71
+ {{- '<|BOT|>assistant
72
+ <think>
73
+ ' }}
74
+ {%- endif %}
models/Step-Audio-R1-NVFP4A16/config.json ADDED
@@ -0,0 +1,348 @@
1
+ {
2
+ "architectures": [
3
+ "StepAudio2ForCausalLM"
4
+ ],
5
+ "audio_encoder_config": {
6
+ "adapter_stride": 2,
7
+ "kernel_size": 3,
8
+ "llm_dim": 5120,
9
+ "model_type": "step_audio_2_encoder",
10
+ "n_audio_ctx": 1500,
11
+ "n_audio_head": 20,
12
+ "n_audio_layer": 32,
13
+ "n_audio_state": 1280,
14
+ "n_codebook_size": 4096,
15
+ "n_mels": 128
16
+ },
17
+ "auto_map": {
18
+ "AutoConfig": "configuration_step_audio_2.StepAudio2Config",
19
+ "AutoModelForCausalLM": "modeling_step_audio_2.StepAudio2ForCausalLM"
20
+ },
21
+ "dtype": "bfloat16",
22
+ "max_window_layers": null,
23
+ "model_type": "step_audio_2",
24
+ "quantization_config": {
25
+ "config_groups": {
26
+ "group_0": {
27
+ "format": "nvfp4-pack-quantized",
28
+ "input_activations": null,
29
+ "output_activations": null,
30
+ "targets": [
31
+ "Linear"
32
+ ],
33
+ "weights": {
34
+ "actorder": null,
35
+ "block_structure": null,
36
+ "dynamic": false,
37
+ "group_size": 16,
38
+ "num_bits": 4,
39
+ "observer": "minmax",
40
+ "observer_kwargs": {},
41
+ "strategy": "tensor_group",
42
+ "symmetric": true,
43
+ "type": "float"
44
+ }
45
+ }
46
+ },
47
+ "format": "nvfp4-pack-quantized",
48
+ "global_compression_ratio": null,
49
+ "ignore": [
50
+ "encoder.blocks.0.attn.query",
51
+ "encoder.blocks.0.attn.key",
52
+ "encoder.blocks.0.attn.value",
53
+ "encoder.blocks.0.attn.out",
54
+ "encoder.blocks.0.mlp.0",
55
+ "encoder.blocks.0.mlp.2",
56
+ "encoder.blocks.1.attn.query",
57
+ "encoder.blocks.1.attn.key",
58
+ "encoder.blocks.1.attn.value",
59
+ "encoder.blocks.1.attn.out",
60
+ "encoder.blocks.1.mlp.0",
61
+ "encoder.blocks.1.mlp.2",
62
+ "encoder.blocks.2.attn.query",
63
+ "encoder.blocks.2.attn.key",
64
+ "encoder.blocks.2.attn.value",
65
+ "encoder.blocks.2.attn.out",
66
+ "encoder.blocks.2.mlp.0",
67
+ "encoder.blocks.2.mlp.2",
68
+ "encoder.blocks.3.attn.query",
69
+ "encoder.blocks.3.attn.key",
70
+ "encoder.blocks.3.attn.value",
71
+ "encoder.blocks.3.attn.out",
72
+ "encoder.blocks.3.mlp.0",
73
+ "encoder.blocks.3.mlp.2",
74
+ "encoder.blocks.4.attn.query",
75
+ "encoder.blocks.4.attn.key",
76
+ "encoder.blocks.4.attn.value",
77
+ "encoder.blocks.4.attn.out",
78
+ "encoder.blocks.4.mlp.0",
79
+ "encoder.blocks.4.mlp.2",
80
+ "encoder.blocks.5.attn.query",
81
+ "encoder.blocks.5.attn.key",
82
+ "encoder.blocks.5.attn.value",
83
+ "encoder.blocks.5.attn.out",
84
+ "encoder.blocks.5.mlp.0",
85
+ "encoder.blocks.5.mlp.2",
86
+ "encoder.blocks.6.attn.query",
87
+ "encoder.blocks.6.attn.key",
88
+ "encoder.blocks.6.attn.value",
89
+ "encoder.blocks.6.attn.out",
90
+ "encoder.blocks.6.mlp.0",
91
+ "encoder.blocks.6.mlp.2",
92
+ "encoder.blocks.7.attn.query",
93
+ "encoder.blocks.7.attn.key",
94
+ "encoder.blocks.7.attn.value",
95
+ "encoder.blocks.7.attn.out",
96
+ "encoder.blocks.7.mlp.0",
97
+ "encoder.blocks.7.mlp.2",
98
+ "encoder.blocks.8.attn.query",
99
+ "encoder.blocks.8.attn.key",
100
+ "encoder.blocks.8.attn.value",
101
+ "encoder.blocks.8.attn.out",
102
+ "encoder.blocks.8.mlp.0",
103
+ "encoder.blocks.8.mlp.2",
104
+ "encoder.blocks.9.attn.query",
105
+ "encoder.blocks.9.attn.key",
106
+ "encoder.blocks.9.attn.value",
107
+ "encoder.blocks.9.attn.out",
108
+ "encoder.blocks.9.mlp.0",
109
+ "encoder.blocks.9.mlp.2",
110
+ "encoder.blocks.10.attn.query",
111
+ "encoder.blocks.10.attn.key",
112
+ "encoder.blocks.10.attn.value",
113
+ "encoder.blocks.10.attn.out",
114
+ "encoder.blocks.10.mlp.0",
115
+ "encoder.blocks.10.mlp.2",
116
+ "encoder.blocks.11.attn.query",
117
+ "encoder.blocks.11.attn.key",
118
+ "encoder.blocks.11.attn.value",
119
+ "encoder.blocks.11.attn.out",
120
+ "encoder.blocks.11.mlp.0",
121
+ "encoder.blocks.11.mlp.2",
122
+ "encoder.blocks.12.attn.query",
123
+ "encoder.blocks.12.attn.key",
124
+ "encoder.blocks.12.attn.value",
125
+ "encoder.blocks.12.attn.out",
126
+ "encoder.blocks.12.mlp.0",
127
+ "encoder.blocks.12.mlp.2",
128
+ "encoder.blocks.13.attn.query",
129
+ "encoder.blocks.13.attn.key",
130
+ "encoder.blocks.13.attn.value",
131
+ "encoder.blocks.13.attn.out",
132
+ "encoder.blocks.13.mlp.0",
133
+ "encoder.blocks.13.mlp.2",
134
+ "encoder.blocks.14.attn.query",
135
+ "encoder.blocks.14.attn.key",
136
+ "encoder.blocks.14.attn.value",
137
+ "encoder.blocks.14.attn.out",
138
+ "encoder.blocks.14.mlp.0",
139
+ "encoder.blocks.14.mlp.2",
140
+ "encoder.blocks.15.attn.query",
141
+ "encoder.blocks.15.attn.key",
142
+ "encoder.blocks.15.attn.value",
143
+ "encoder.blocks.15.attn.out",
144
+ "encoder.blocks.15.mlp.0",
145
+ "encoder.blocks.15.mlp.2",
146
+ "encoder.blocks.16.attn.query",
147
+ "encoder.blocks.16.attn.key",
148
+ "encoder.blocks.16.attn.value",
149
+ "encoder.blocks.16.attn.out",
150
+ "encoder.blocks.16.mlp.0",
151
+ "encoder.blocks.16.mlp.2",
152
+ "encoder.blocks.17.attn.query",
153
+ "encoder.blocks.17.attn.key",
154
+ "encoder.blocks.17.attn.value",
155
+ "encoder.blocks.17.attn.out",
156
+ "encoder.blocks.17.mlp.0",
157
+ "encoder.blocks.17.mlp.2",
158
+ "encoder.blocks.18.attn.query",
159
+ "encoder.blocks.18.attn.key",
160
+ "encoder.blocks.18.attn.value",
161
+ "encoder.blocks.18.attn.out",
162
+ "encoder.blocks.18.mlp.0",
163
+ "encoder.blocks.18.mlp.2",
164
+ "encoder.blocks.19.attn.query",
165
+ "encoder.blocks.19.attn.key",
166
+ "encoder.blocks.19.attn.value",
167
+ "encoder.blocks.19.attn.out",
168
+ "encoder.blocks.19.mlp.0",
169
+ "encoder.blocks.19.mlp.2",
170
+ "encoder.blocks.20.attn.query",
171
+ "encoder.blocks.20.attn.key",
172
+ "encoder.blocks.20.attn.value",
173
+ "encoder.blocks.20.attn.out",
174
+ "encoder.blocks.20.mlp.0",
175
+ "encoder.blocks.20.mlp.2",
176
+ "encoder.blocks.21.attn.query",
177
+ "encoder.blocks.21.attn.key",
178
+ "encoder.blocks.21.attn.value",
179
+ "encoder.blocks.21.attn.out",
180
+ "encoder.blocks.21.mlp.0",
181
+ "encoder.blocks.21.mlp.2",
182
+ "encoder.blocks.22.attn.query",
183
+ "encoder.blocks.22.attn.key",
184
+ "encoder.blocks.22.attn.value",
185
+ "encoder.blocks.22.attn.out",
186
+ "encoder.blocks.22.mlp.0",
187
+ "encoder.blocks.22.mlp.2",
188
+ "encoder.blocks.23.attn.query",
189
+ "encoder.blocks.23.attn.key",
190
+ "encoder.blocks.23.attn.value",
191
+ "encoder.blocks.23.attn.out",
192
+ "encoder.blocks.23.mlp.0",
193
+ "encoder.blocks.23.mlp.2",
194
+ "encoder.blocks.24.attn.query",
195
+ "encoder.blocks.24.attn.key",
196
+ "encoder.blocks.24.attn.value",
197
+ "encoder.blocks.24.attn.out",
198
+ "encoder.blocks.24.mlp.0",
199
+ "encoder.blocks.24.mlp.2",
200
+ "encoder.blocks.25.attn.query",
201
+ "encoder.blocks.25.attn.key",
202
+ "encoder.blocks.25.attn.value",
203
+ "encoder.blocks.25.attn.out",
204
+ "encoder.blocks.25.mlp.0",
205
+ "encoder.blocks.25.mlp.2",
206
+ "encoder.blocks.26.attn.query",
207
+ "encoder.blocks.26.attn.key",
208
+ "encoder.blocks.26.attn.value",
209
+ "encoder.blocks.26.attn.out",
210
+ "encoder.blocks.26.mlp.0",
211
+ "encoder.blocks.26.mlp.2",
212
+ "encoder.blocks.27.attn.query",
213
+ "encoder.blocks.27.attn.key",
214
+ "encoder.blocks.27.attn.value",
215
+ "encoder.blocks.27.attn.out",
216
+ "encoder.blocks.27.mlp.0",
217
+ "encoder.blocks.27.mlp.2",
218
+ "encoder.blocks.28.attn.query",
219
+ "encoder.blocks.28.attn.key",
220
+ "encoder.blocks.28.attn.value",
221
+ "encoder.blocks.28.attn.out",
222
+ "encoder.blocks.28.mlp.0",
223
+ "encoder.blocks.28.mlp.2",
224
+ "encoder.blocks.29.attn.query",
225
+ "encoder.blocks.29.attn.key",
226
+ "encoder.blocks.29.attn.value",
227
+ "encoder.blocks.29.attn.out",
228
+ "encoder.blocks.29.mlp.0",
229
+ "encoder.blocks.29.mlp.2",
230
+ "encoder.blocks.30.attn.query",
231
+ "encoder.blocks.30.attn.key",
232
+ "encoder.blocks.30.attn.value",
233
+ "encoder.blocks.30.attn.out",
234
+ "encoder.blocks.30.mlp.0",
235
+ "encoder.blocks.30.mlp.2",
236
+ "encoder.blocks.31.attn.query",
237
+ "encoder.blocks.31.attn.key",
238
+ "encoder.blocks.31.attn.value",
239
+ "encoder.blocks.31.attn.out",
240
+ "encoder.blocks.31.mlp.0",
241
+ "encoder.blocks.31.mlp.2",
242
+ "adapter.linear1",
243
+ "adapter.linear2",
244
+ "lm_head"
245
+ ],
246
+ "kv_cache_scheme": null,
247
+ "quant_method": "compressed-tensors",
248
+ "quantization_status": "compressed",
249
+ "sparsity_config": {},
250
+ "transform_config": {},
251
+ "version": "0.12.2"
252
+ },
253
+ "sliding_window": 2048,
254
+ "text_config": {
255
+ "architectures": [
256
+ "Qwen2ForCausalLM"
257
+ ],
258
+ "attention_dropout": 0.0,
259
+ "dtype": "bfloat16",
260
+ "hidden_act": "silu",
261
+ "hidden_size": 5120,
262
+ "initializer_range": 0.02,
263
+ "intermediate_size": 27648,
264
+ "layer_types": [
265
+ "full_attention",
266
+ "full_attention",
267
+ "full_attention",
268
+ "full_attention",
269
+ "full_attention",
270
+ "full_attention",
271
+ "full_attention",
272
+ "full_attention",
273
+ "full_attention",
274
+ "full_attention",
275
+ "full_attention",
276
+ "full_attention",
277
+ "full_attention",
278
+ "full_attention",
279
+ "full_attention",
280
+ "full_attention",
281
+ "full_attention",
282
+ "full_attention",
283
+ "full_attention",
284
+ "full_attention",
285
+ "full_attention",
286
+ "full_attention",
287
+ "full_attention",
288
+ "full_attention",
289
+ "full_attention",
290
+ "full_attention",
291
+ "full_attention",
292
+ "full_attention",
293
+ "full_attention",
294
+ "full_attention",
295
+ "full_attention",
296
+ "full_attention",
297
+ "full_attention",
298
+ "full_attention",
299
+ "full_attention",
300
+ "full_attention",
301
+ "full_attention",
302
+ "full_attention",
303
+ "full_attention",
304
+ "full_attention",
305
+ "full_attention",
306
+ "full_attention",
307
+ "full_attention",
308
+ "full_attention",
309
+ "full_attention",
310
+ "full_attention",
311
+ "full_attention",
312
+ "full_attention",
313
+ "full_attention",
314
+ "full_attention",
315
+ "full_attention",
316
+ "full_attention",
317
+ "full_attention",
318
+ "full_attention",
319
+ "full_attention",
320
+ "full_attention",
321
+ "full_attention",
322
+ "full_attention",
323
+ "full_attention",
324
+ "full_attention",
325
+ "full_attention",
326
+ "full_attention",
327
+ "full_attention",
328
+ "full_attention"
329
+ ],
330
+ "max_position_embeddings": 65536,
331
+ "max_window_layers": 28,
332
+ "model_type": "qwen2",
333
+ "num_attention_heads": 40,
334
+ "num_hidden_layers": 64,
335
+ "num_attention_groups": 8,
336
+ "num_key_value_heads": 8,
337
+ "rms_norm_eps": 1e-05,
338
+ "rope_scaling": null,
339
+ "rope_theta": 1000000.0,
340
+ "sliding_window": null,
341
+ "use_cache": true,
342
+ "use_sliding_window": false,
343
+ "vocab_size": 158720
344
+ },
345
+ "tie_word_embeddings": false,
346
+ "transformers_version": "4.56.2",
347
+ "use_sliding_window": false
348
+ }
models/Step-Audio-R1-NVFP4A16/configuration_step_audio_2.py ADDED
@@ -0,0 +1,128 @@
1
+ from typing import Optional, Union
2
+
3
+ from transformers import Qwen2Config
4
+ from transformers.configuration_utils import PretrainedConfig
5
+
6
+
7
+ class StepAudio2EncoderConfig(PretrainedConfig):
8
+ model_type = "step_audio_2_encoder"
9
+
10
+ def __init__(
11
+ self,
12
+ n_mels=128,
13
+ n_audio_ctx=1500,
14
+ n_audio_state=512,
15
+ n_audio_head=8,
16
+ n_audio_layer=6,
17
+ llm_dim=4096,
18
+ kernel_size=3,
19
+ adapter_stride=2,
20
+ **kwargs,
21
+ ):
22
+ self.n_mels = n_mels
23
+ self.n_audio_ctx = n_audio_ctx
24
+ self.n_audio_state = n_audio_state
25
+ self.n_audio_head = n_audio_head
26
+ self.n_audio_layer = n_audio_layer
27
+ self.llm_dim = llm_dim
28
+ self.kernel_size = kernel_size
29
+ self.adapter_stride = adapter_stride
30
+ super().__init__(**kwargs)
31
+
32
+ class StepAudio2TextConfig(PretrainedConfig):
33
+ model_type = "step_audio_2_text"
34
+
35
+ def __init__(
36
+ self,
37
+ vocab_size=64012,
38
+ hidden_size=4096,
39
+ intermediate_size=11008,
40
+ num_hidden_layers=48,
41
+ num_attention_heads=32,
42
+ num_attention_groups=4,
43
+ num_key_value_heads=4,
44
+ hidden_act="silu",
45
+ max_position_embeddings=8192,
46
+ initializer_range=0.02,
47
+ rms_norm_eps=1e-6,
48
+ rope_theta=1000000.0,
49
+ rope_scaling=None,
50
+ eos_token_id=None,
51
+ **kwargs
52
+ ):
53
+
54
+ if eos_token_id is not None:
55
+ if isinstance(eos_token_id, list):
56
+ eos_token_id = list(set([151643, 151645, 151665] + eos_token_id))
57
+ else:
58
+ eos_token_id = [151643, 151645, 151665, eos_token_id]
59
+ else:
60
+ eos_token_id = [151643, 151645, 151665]
61
+
62
+ super().__init__(
63
+ eos_token_id=eos_token_id,
64
+ **kwargs)
65
+
66
+ self.vocab_size = vocab_size
67
+ self.hidden_size = hidden_size
68
+ self.intermediate_size = intermediate_size
69
+ self.num_hidden_layers = num_hidden_layers
70
+ self.num_attention_heads = num_attention_heads
71
+ self.num_attention_groups = num_attention_groups
72
+ self.num_key_value_heads = num_key_value_heads
73
+ assert self.num_attention_groups == self.num_key_value_heads, "num_attention_groups must be equal to num_key_value_heads"
74
+ self.hidden_act = hidden_act
75
+ self.max_position_embeddings = max_position_embeddings
76
+ self.initializer_range = initializer_range
77
+ self.rms_norm_eps = rms_norm_eps
78
+ self.rope_theta = rope_theta
79
+ self.rope_scaling = rope_scaling
80
+
81
+ self.text_config = Qwen2Config(
82
+ vocab_size=vocab_size,
83
+ hidden_size=hidden_size,
84
+ intermediate_size=intermediate_size,
85
+ num_hidden_layers=num_hidden_layers,
86
+ num_attention_heads=num_attention_heads,
87
+ num_key_value_heads=num_key_value_heads,
88
+ hidden_act=hidden_act,
89
+ max_position_embeddings=max_position_embeddings,
90
+ initializer_range=initializer_range,
91
+ rms_norm_eps=rms_norm_eps,
92
+ rope_theta=rope_theta,
93
+ rope_scaling=rope_scaling,
94
+ architectures=["Qwen2ForCausalLM"],
95
+ torch_dtype=getattr(self, "torch_dtype", "bfloat16"),
96
+ )
97
+
98
+ class StepAudio2Config(PretrainedConfig):
99
+ model_type = "step_audio_2"
100
+ architectures = ["StepAudio2ForCausalLM"]
101
+
102
+ def __init__(
103
+ self,
104
+ audio_encoder_config: Optional[Union[dict, StepAudio2EncoderConfig]] = None,
105
+ text_config: Optional[Union[dict, StepAudio2TextConfig]] = None,
106
+ use_sliding_window: bool = False,
107
+ sliding_window: Optional[int] = 2048,
108
+ max_window_layers: Optional[int] = None,
109
+ **kwargs
110
+ ):
111
+ kwargs.setdefault("use_sliding_window", use_sliding_window)
112
+ kwargs.setdefault("sliding_window", sliding_window)
113
+ if max_window_layers is None:
114
+ max_window_layers = kwargs.get("num_hidden_layers", None)
115
+ kwargs.setdefault("max_window_layers", max_window_layers)
116
+ super().__init__(**kwargs)
117
+
118
+ if text_config is None:
119
+ text_config = StepAudio2TextConfig().text_config
120
+ elif isinstance(text_config, dict):
121
+ text_config = StepAudio2TextConfig(**text_config).text_config
122
+
123
+ self.text_config = text_config
124
+
125
+ if audio_encoder_config is None:
126
+ self.audio_encoder_config = StepAudio2EncoderConfig()
127
+ elif isinstance(audio_encoder_config, dict):
128
+ self.audio_encoder_config = StepAudio2EncoderConfig(**audio_encoder_config)
models/Step-Audio-R1-NVFP4A16/generation_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": [
5
+ 151643,
6
+ 151665
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "transformers_version": "4.56.2"
10
+ }
models/Step-Audio-R1-NVFP4A16/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/Step-Audio-R1-NVFP4A16/model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6a7081330c6f8ab4c50e0f494f5bfbf97600b2e4cb6da7980ec451d2d25b6fc
3
+ size 4952370688
models/Step-Audio-R1-NVFP4A16/model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8163e602803002f0fec5eec86605dcffc1eb2a8d9c78ffff87efed340f3439f0
3
+ size 4937507688
models/Step-Audio-R1-NVFP4A16/model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:102f0220802e3c746c0c2fc4a2709f8efef538f7687e75bba2a6c6a6d86d1a24
3
+ size 4937507688
models/Step-Audio-R1-NVFP4A16/model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5242ab3703db9f7cac5239a6bbe79c771e1085706ae7886cf694f4a376341db6
3
+ size 4997822352
models/Step-Audio-R1-NVFP4A16/model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d812abb83c44af00a386341fc563d9c15cda01c08e1851ff1c904eb09d793a8
3
+ size 2291022848
models/Step-Audio-R1-NVFP4A16/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff