MrMattV committed (verified)
Commit 7b73f21 · 1 Parent(s): 9933d98

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. models/ace-step/Qwen3-Embedding-0.6B/added_tokens.json +28 -0
  2. models/ace-step/Qwen3-Embedding-0.6B/chat_template.jinja +85 -0
  3. models/ace-step/Qwen3-Embedding-0.6B/config.json +60 -0
  4. models/ace-step/Qwen3-Embedding-0.6B/merges.txt +0 -0
  5. models/ace-step/Qwen3-Embedding-0.6B/special_tokens_map.json +31 -0
  6. models/ace-step/Qwen3-Embedding-0.6B/tokenizer_config.json +239 -0
  7. models/ace-step/Qwen3-Embedding-0.6B/vocab.json +0 -0
  8. models/ace-step/acestep-5Hz-lm-4B/special_tokens_map.json +0 -0
  9. models/ace-step/acestep-5Hz-lm-4B/vocab.json +0 -0
  10. models/ace-step/acestep-v15-base/apg_guidance.py +220 -0
  11. models/ace-step/acestep-v15-base/config.json +81 -0
  12. models/ace-step/acestep-v15-base/configuration_acestep_v15.py +263 -0
  13. models/ace-step/acestep-v15-base/modeling_acestep_v15_base.py +0 -0
  14. models/ace-step/acestep-v15-sft/apg_guidance.py +220 -0
  15. models/ace-step/acestep-v15-sft/config.json +81 -0
  16. models/ace-step/acestep-v15-sft/configuration_acestep_v15.py +263 -0
  17. models/ace-step/acestep-v15-sft/modeling_acestep_v15_base.py +0 -0
  18. models/ace-step/acestep-v15-turbo/config.json +82 -0
  19. models/ace-step/acestep-v15-turbo/configuration_acestep_v15.py +263 -0
  20. models/ace-step/acestep-v15-turbo/modeling_acestep_v15_turbo.py +0 -0
  21. models/ace-step/vae/config.json +24 -0
  22. models/dettaglio-restyle/thumbnails/abstract_expressionism.webp +0 -0
  23. models/dettaglio-restyle/thumbnails/academia.webp +0 -0
  24. models/dettaglio-restyle/thumbnails/action_figure.webp +0 -0
  25. models/dettaglio-restyle/thumbnails/adorable_3d_character.webp +0 -0
  26. models/dettaglio-restyle/thumbnails/adorable_kawaii.webp +0 -0
  27. models/dettaglio-restyle/thumbnails/ads-advertising.webp +0 -0
  28. models/dettaglio-restyle/thumbnails/ads-automotive.webp +0 -0
  29. models/dettaglio-restyle/thumbnails/ads-corporate.webp +0 -0
  30. models/dettaglio-restyle/thumbnails/ads-fashion_editorial.webp +0 -0
  31. models/dettaglio-restyle/thumbnails/ads-food_photography.webp +0 -0
  32. models/dettaglio-restyle/thumbnails/ads-gourmet_food_photography.webp +0 -0
  33. models/dettaglio-restyle/thumbnails/ads-luxury.webp +0 -0
  34. models/dettaglio-restyle/thumbnails/ads-luxury.webp.webp +0 -0
  35. models/dettaglio-restyle/thumbnails/ads-retail.webp +0 -0
  36. models/dettaglio-restyle/thumbnails/art_deco.webp +0 -0
  37. models/dettaglio-restyle/thumbnails/art_nouveau.webp +0 -0
  38. models/dettaglio-restyle/thumbnails/artstyle-abstract.webp +0 -0
  39. models/dettaglio-restyle/thumbnails/artstyle-abstract_expressionism.webp +0 -0
  40. models/dettaglio-restyle/thumbnails/artstyle-art_deco.webp +0 -0
  41. models/dettaglio-restyle/thumbnails/artstyle-art_nouveau.webp +0 -0
  42. models/dettaglio-restyle/thumbnails/artstyle-constructivist.webp +0 -0
  43. models/dettaglio-restyle/thumbnails/artstyle-cubist.webp +0 -0
  44. models/dettaglio-restyle/thumbnails/artstyle-expressionist.webp +0 -0
  45. models/dettaglio-restyle/thumbnails/artstyle-graffiti.webp +0 -0
  46. models/dettaglio-restyle/thumbnails/artstyle-hyperrealism.webp +0 -0
  47. models/dettaglio-restyle/thumbnails/artstyle-impressionist.webp +0 -0
  48. models/dettaglio-restyle/thumbnails/artstyle-pointillism.webp +0 -0
  49. models/dettaglio-restyle/thumbnails/artstyle-pop_art.webp +0 -0
  50. models/dettaglio-restyle/thumbnails/artstyle-psychedelic.webp +0 -0
models/ace-step/Qwen3-Embedding-0.6B/added_tokens.json ADDED
@@ -0,0 +1,28 @@
+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}
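
A minimal sketch (not part of the commit) that sanity-checks the token-to-ID map added above; the file path assumes this repository's layout.

import json

# Load the added-tokens map committed above and verify a few entries.
with open("models/ace-step/Qwen3-Embedding-0.6B/added_tokens.json") as f:
    added_tokens = json.load(f)

assert added_tokens["<|im_end|>"] == 151645
assert max(added_tokens.values()) == 151668  # "</think>" is the highest added ID
print(len(added_tokens), "added tokens")     # 28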
models/ace-step/Qwen3-Embedding-0.6B/chat_template.jinja ADDED
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
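
For reference, a hedged usage sketch (not part of the commit): transformers renders this template through `apply_chat_template`, and extra keyword arguments such as `enable_thinking` are forwarded to the template. The local path is an assumption.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("models/ace-step/Qwen3-Embedding-0.6B")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # template appends '<|im_start|>assistant\n'
    enable_thinking=False,       # template then emits an empty <think> block
)
print(prompt)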
models/ace-step/Qwen3-Embedding-0.6B/config.json ADDED
@@ -0,0 +1,60 @@
+{
+  "architectures": [
+    "Qwen3Model"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.0.dev0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151669
+}
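
A small sketch (not part of the commit) showing how this config resolves through AutoConfig; the local path is an assumption.

from transformers import AutoConfig

config = AutoConfig.from_pretrained("models/ace-step/Qwen3-Embedding-0.6B")
assert config.model_type == "qwen3"
assert config.hidden_size == 1024 and config.num_hidden_layers == 28
assert len(config.layer_types) == config.num_hidden_layers  # all "full_attention"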
models/ace-step/Qwen3-Embedding-0.6B/merges.txt ADDED
The diff for this file is too large to render.
models/ace-step/Qwen3-Embedding-0.6B/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
models/ace-step/Qwen3-Embedding-0.6B/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
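
A minimal sketch (not part of the commit): AutoTokenizer consumes this file together with vocab.json and merges.txt above, so the eos/pad declarations and added-token IDs should round-trip as shown; the path is an assumption.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("models/ace-step/Qwen3-Embedding-0.6B")
assert tok.eos_token == "<|im_end|>" and tok.pad_token == "<|endoftext|>"
assert tok.convert_tokens_to_ids("<think>") == 151667
assert tok.model_max_length == 131072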
models/ace-step/Qwen3-Embedding-0.6B/vocab.json ADDED
The diff for this file is too large to render.
models/ace-step/acestep-5Hz-lm-4B/special_tokens_map.json ADDED
The diff for this file is too large to render.
models/ace-step/acestep-5Hz-lm-4B/vocab.json ADDED
The diff for this file is too large to render.
models/ace-step/acestep-v15-base/apg_guidance.py ADDED
@@ -0,0 +1,220 @@
+import torch
+import torch.nn.functional as F
+
+
+class MomentumBuffer:
+
+    def __init__(self, momentum: float = -0.75):
+        self.momentum = momentum
+        self.running_average = 0
+
+    def update(self, update_value: torch.Tensor):
+        new_average = self.momentum * self.running_average
+        self.running_average = update_value + new_average
+
+
+def project(
+    v0: torch.Tensor,  # [B, C, T]
+    v1: torch.Tensor,  # [B, C, T]
+    dims=[-1],
+):
+    dtype = v0.dtype
+    device_type = v0.device.type
+    if device_type == "mps":
+        v0, v1 = v0.cpu(), v1.cpu()
+
+    v0, v1 = v0.double(), v1.double()
+    v1 = torch.nn.functional.normalize(v1, dim=dims)
+    v0_parallel = (v0 * v1).sum(dim=dims, keepdim=True) * v1
+    v0_orthogonal = v0 - v0_parallel
+    return v0_parallel.to(dtype).to(device_type), v0_orthogonal.to(dtype).to(device_type)
+
+
+def apg_forward(
+    pred_cond: torch.Tensor,  # [B, C, T]
+    pred_uncond: torch.Tensor,  # [B, C, T]
+    guidance_scale: float,
+    momentum_buffer: MomentumBuffer = None,
+    eta: float = 0.0,
+    norm_threshold: float = 2.5,
+    dims=[-1],
+):
+    diff = pred_cond - pred_uncond
+    if momentum_buffer is not None:
+        momentum_buffer.update(diff)
+        diff = momentum_buffer.running_average
+
+    if norm_threshold > 0:
+        ones = torch.ones_like(diff)
+        diff_norm = diff.norm(p=2, dim=dims, keepdim=True)
+        scale_factor = torch.minimum(ones, norm_threshold / diff_norm)
+        diff = diff * scale_factor
+
+    diff_parallel, diff_orthogonal = project(diff, pred_cond, dims)
+    normalized_update = diff_orthogonal + eta * diff_parallel
+    pred_guided = pred_cond + (guidance_scale - 1) * normalized_update
+    return pred_guided
+
+
+def cfg_forward(cond_output, uncond_output, cfg_strength):
+    return uncond_output + cfg_strength * (cond_output - uncond_output)
+
+
+def call_cos_tensor(tensor1, tensor2):
+    """
+    Calculate cosine similarity between two normalized tensors.
+
+    Args:
+        tensor1: First tensor [B, ...]
+        tensor2: Second tensor [B, ...]
+
+    Returns:
+        Cosine similarity value [B, 1]
+    """
+    tensor1 = tensor1 / torch.linalg.norm(tensor1, dim=1, keepdim=True)
+    tensor2 = tensor2 / torch.linalg.norm(tensor2, dim=1, keepdim=True)
+    cosvalue = torch.sum(tensor1 * tensor2, dim=1, keepdim=True)
+    return cosvalue
+
+
+def compute_perpendicular_component(latent_diff, latent_hat_uncond):
+    """
+    Decompose latent_diff into parallel and perpendicular components relative to latent_hat_uncond.
+
+    Args:
+        latent_diff: Difference tensor [B, C, ...]
+        latent_hat_uncond: Unconditional prediction tensor [B, C, ...]
+
+    Returns:
+        projection: Parallel component
+        perpendicular_component: Perpendicular component
+    """
+    n, t, c = latent_diff.shape
+    latent_diff = latent_diff.view(n * t, c).float()
+    latent_hat_uncond = latent_hat_uncond.view(n * t, c).float()
+
+    if latent_diff.size() != latent_hat_uncond.size():
+        raise ValueError("latent_diff and latent_hat_uncond must have the same shape [n, d].")
+
+    dot_product = torch.sum(latent_diff * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
+    norm_square = torch.sum(latent_hat_uncond * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
+    projection = (dot_product / (norm_square + 1e-8)) * latent_hat_uncond
+    perpendicular_component = latent_diff - projection
+
+    return projection.view(n, t, c), perpendicular_component.reshape(n, t, c)
+
+
+def adg_forward(
+    latents: torch.Tensor,
+    noise_pred_cond: torch.Tensor,
+    noise_pred_uncond: torch.Tensor,
+    sigma: torch.Tensor,
+    guidance_scale: float,
+    angle_clip: float = 3.14 / 6,  # pi/6 by default
+    apply_norm: bool = False,
+    apply_clip: bool = True,
+):
+    """
+    ADG (Angle-based Dynamic Guidance) forward pass for Flow Matching.
+
+    In flow matching (including SD3), sigma represents the current timestep t_curr.
+    The predictions are velocity fields v(x_t, t).
+
+    Args:
+        latents: Current state x_t [N, T, d] where d=64
+        noise_pred_cond: Conditional velocity prediction v_cond [N, T, d]
+        noise_pred_uncond: Unconditional velocity prediction v_uncond [N, T, d]
+        sigma: Current timestep t_curr (not t_prev!)
+        guidance_scale: Guidance strength
+        angle_clip: Maximum angle for clipping (default: pi/6)
+        apply_norm: Whether to normalize the result (ADG_w_norm variant)
+        apply_clip: Whether to clip the angle (ADG_wo_clip when False)
+
+    Returns:
+        Guided velocity prediction [N, T, d]
+    """
+    # Get batch size
+    n = noise_pred_cond.shape[0]
+    noise_pred_text = noise_pred_cond
+    n, t, c = noise_pred_text.shape
+
+    # Ensure sigma/t has the right shape for broadcasting [N, 1, 1]
+    if isinstance(sigma, (int, float)):
+        sigma = torch.tensor(sigma, device=latents.device, dtype=latents.dtype)
+        sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
+    elif torch.is_tensor(sigma):
+        if sigma.numel() == 1:
+            sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
+        elif sigma.numel() == n:
+            sigma = sigma.view(n, 1, 1)
+        else:
+            raise ValueError(f"sigma has incompatible shape. Expected scalar or size {n}, got {sigma.shape}")
+    else:
+        raise TypeError(f"sigma must be a number or tensor, got {type(sigma)}")
+
+    # Adjust guidance weight
+    weight = guidance_scale - 1
+    weight = weight * (weight > 0) + 1e-3
+
+    latent_hat_text = latents - sigma * noise_pred_text
+    latent_hat_uncond = latents - sigma * noise_pred_uncond
+    latent_diff = latent_hat_text - latent_hat_uncond
+
+    # Calculate angle between conditional and unconditional predicted data
+    latent_theta = torch.acos(
+        call_cos_tensor(latent_hat_text.view(-1, c).to(float),
+                        latent_hat_uncond.reshape(-1, c).contiguous().to(float)))
+    latent_theta_new = torch.clip(weight * latent_theta, -angle_clip, angle_clip) if apply_clip else weight * latent_theta
+    proj, perp = compute_perpendicular_component(latent_diff, latent_hat_uncond)
+    latent_v_new = torch.cos(latent_theta_new) * latent_hat_text
+
+    latent_p_new = perp * torch.sin(latent_theta_new) / torch.sin(latent_theta) * (
+        torch.sin(latent_theta) > 1e-3) + perp * weight * (torch.sin(latent_theta) <= 1e-3)
+    latent_new = latent_v_new + latent_p_new
+    if apply_norm:
+        latent_new = latent_new * torch.linalg.norm(latent_hat_text, dim=1, keepdim=True) / torch.linalg.norm(
+            latent_new, dim=1, keepdim=True)
+
+    noise_pred = (latents - latent_new) / sigma
+    noise_pred = noise_pred.reshape(n, t, c).to(latents.dtype)
+    return noise_pred
+
+
+def adg_w_norm_forward(
+    latents: torch.Tensor,
+    noise_pred_cond: torch.Tensor,
+    noise_pred_uncond: torch.Tensor,
+    sigma: float,
+    guidance_scale: float,
+    angle_clip: float = 3.14 / 3,
+):
+    """
+    ADG with normalization - preserves the magnitude of latent predictions.
+
+    This variant normalizes the final latent to maintain the same norm as the
+    conditional prediction, which can help preserve image quality.
+    """
+    return adg_forward(latents,
+                       noise_pred_cond,
+                       noise_pred_uncond,
+                       sigma,
+                       guidance_scale,
+                       angle_clip=angle_clip,
+                       apply_norm=True,
+                       apply_clip=True)
+
+
+def adg_wo_clip_forward(
+    latents: torch.Tensor,
+    noise_pred_cond: torch.Tensor,
+    noise_pred_uncond: torch.Tensor,
+    sigma: float,
+    guidance_scale: float,
+):
+    """
+    ADG without angle clipping - allows unbounded angle adjustments.
+
+    This variant doesn't clip the angle, which may result in more aggressive
+    guidance but could be less stable.
+    """
+    return adg_forward(latents, noise_pred_cond, noise_pred_uncond, sigma, guidance_scale, apply_norm=False, apply_clip=False)
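
A hedged usage sketch (not part of the commit) for `apg_forward`: the random tensors stand in for a model's conditional/unconditional velocity predictions, and the `MomentumBuffer` is reused across sampling steps so the cond-uncond difference is smoothed over time.

import torch

momentum_buffer = MomentumBuffer(momentum=-0.75)  # shared across all steps
for step in range(8):
    pred_cond = torch.randn(2, 64, 100)    # stand-in for model(x_t, cond)
    pred_uncond = torch.randn(2, 64, 100)  # stand-in for model(x_t, null cond)
    pred = apg_forward(
        pred_cond,
        pred_uncond,
        guidance_scale=7.0,
        momentum_buffer=momentum_buffer,  # accumulates the cond-uncond gap
        norm_threshold=2.5,               # rescales the gap when its norm exceeds this
    )
    # ...feed `pred` into the sampler's update rule...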
models/ace-step/acestep-v15-base/config.json ADDED
@@ -0,0 +1,81 @@
+{
+  "architectures": [
+    "AceStepConditionGenerationModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_acestep_v15.AceStepConfig",
+    "AutoModel": "modeling_acestep_v15_base.AceStepConditionGenerationModel"
+  },
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "audio_acoustic_hidden_dim": 64,
+  "data_proportion": 0.5,
+  "dtype": "bfloat16",
+  "fsq_dim": 2048,
+  "fsq_input_levels": [
+    8,
+    8,
+    8,
+    5,
+    5,
+    5
+  ],
+  "fsq_input_num_quantizers": 1,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "in_channels": 192,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "layer_types": [
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "model_type": "acestep",
+  "num_attention_heads": 16,
+  "num_attention_pooler_hidden_layers": 2,
+  "num_audio_decoder_hidden_layers": 24,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 8,
+  "num_lyric_encoder_hidden_layers": 8,
+  "num_timbre_encoder_hidden_layers": 4,
+  "patch_size": 2,
+  "pool_window_size": 5,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": 128,
+  "text_hidden_dim": 1024,
+  "timbre_fix_frame": 750,
+  "timbre_hidden_dim": 64,
+  "timestep_mu": -0.4,
+  "timestep_sigma": 1.0,
+  "transformers_version": "4.57.0.dev0",
+  "use_cache": true,
+  "use_sliding_window": true,
+  "vocab_size": 64003,
+  "is_turbo": false
+}
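
A small sketch (not part of the commit): the `auto_map` entries above are what let AutoConfig/AutoModel resolve the custom classes, which requires `trust_remote_code=True`. The local path is an assumption, and loading the model also assumes the weight files exist beyond this 50-file view.

from transformers import AutoConfig, AutoModel

path = "models/ace-step/acestep-v15-base"
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
assert config.model_type == "acestep" and config.sliding_window == 128
model = AutoModel.from_pretrained(path, trust_remote_code=True)  # AceStepConditionGenerationModel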
models/ace-step/acestep-v15-base/configuration_acestep_v15.py ADDED
@@ -0,0 +1,263 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""AceStep model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class AceStepConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`AceStepModel`]. It is used to instantiate an
+    AceStep model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 64003):
+            Vocabulary size of the AceStep model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling the model.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 6144):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to the value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        use_sliding_window (`bool`, *optional*, defaults to `True`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*, defaults to 128):
+            Sliding window attention (SWA) window size. If not specified, will default to `128`.
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from acestep.models import AceStepConfig
+
+    >>> # Initializing an AceStep configuration
+    >>> configuration = AceStepConfig()
+
+    >>> # Initializing a model from the configuration
+    >>> model = AceStepConditionGenerationModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "acestep"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    # Default tensor parallel plan for the base model
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=64003,
+        fsq_dim=2048,
+        fsq_input_levels=[8, 8, 8, 5, 5, 5],
+        fsq_input_num_quantizers=1,
+        hidden_size=2048,
+        intermediate_size=6144,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_key_value_heads=8,
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=True,
+        rope_theta=1000000,
+        rope_scaling=None,
+        attention_bias=False,
+        use_sliding_window=True,
+        sliding_window=128,
+        layer_types=None,
+        attention_dropout=0.0,
+        num_lyric_encoder_hidden_layers=8,
+        audio_acoustic_hidden_dim=64,
+        pool_window_size=5,
+        text_hidden_dim=1024,
+        in_channels=192,
+        data_proportion=0.5,
+        timestep_mu=-0.4,
+        timestep_sigma=1.0,
+        timbre_hidden_dim=64,
+        num_timbre_encoder_hidden_layers=4,
+        timbre_fix_frame=750,
+        patch_size=2,
+        num_attention_pooler_hidden_layers=2,
+        num_audio_decoder_hidden_layers=24,
+        model_version="turbo",
+        **kwargs,
+    ):
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window if self.use_sliding_window else None
+
+        # Text encoder configuration
+        self.text_hidden_dim = text_hidden_dim
+
+        # Lyric encoder configuration
+        self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
+        self.patch_size = patch_size
+
+        # Audio semantic token generation configuration
+        self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
+        self.pool_window_size = pool_window_size
+        self.in_channels = in_channels
+        self.data_proportion = data_proportion
+        self.timestep_mu = timestep_mu
+        self.timestep_sigma = timestep_sigma
+
+        # FSQ (Finite Scalar Quantization) configuration
+        self.fsq_dim = fsq_dim
+        self.fsq_input_levels = fsq_input_levels
+        self.fsq_input_num_quantizers = fsq_input_num_quantizers
+
+        # Timbre encoder configuration
+        self.timbre_hidden_dim = timbre_hidden_dim
+        self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
+        self.timbre_fix_frame = timbre_fix_frame
+        self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
+        self.num_audio_decoder_hidden_layers = num_audio_decoder_hidden_layers
+        self.vocab_size = vocab_size
+
+        # Backward compatibility: ensure num_key_value_heads is set
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.model_version = model_version
+
+        # Validate rotary position embeddings parameters
+        # Backward compatibility: if there is a 'type' field, move it to 'rope_type'
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        self.layer_types = layer_types
+
+        # Set default layer types if not specified
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["AceStepConfig"]
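
A minimal sketch (not part of the commit): instantiating `AceStepConfig` with no arguments exercises the defaults above, including the alternating layer pattern built when `layer_types` is None.

config = AceStepConfig()
assert config.layer_types[:4] == [
    "sliding_attention", "full_attention", "sliding_attention", "full_attention",
]
assert config.sliding_window == 128  # kept only because use_sliding_window=True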
models/ace-step/acestep-v15-base/modeling_acestep_v15_base.py ADDED
The diff for this file is too large to render.
models/ace-step/acestep-v15-sft/apg_guidance.py ADDED
@@ -0,0 +1,220 @@
+import torch
+import torch.nn.functional as F
+
+
+class MomentumBuffer:
+
+    def __init__(self, momentum: float = -0.75):
+        self.momentum = momentum
+        self.running_average = 0
+
+    def update(self, update_value: torch.Tensor):
+        new_average = self.momentum * self.running_average
+        self.running_average = update_value + new_average
+
+
+def project(
+    v0: torch.Tensor,  # [B, C, T]
+    v1: torch.Tensor,  # [B, C, T]
+    dims=[-1],
+):
+    dtype = v0.dtype
+    device_type = v0.device.type
+    if device_type == "mps":
+        v0, v1 = v0.cpu(), v1.cpu()
+
+    v0, v1 = v0.double(), v1.double()
+    v1 = torch.nn.functional.normalize(v1, dim=dims)
+    v0_parallel = (v0 * v1).sum(dim=dims, keepdim=True) * v1
+    v0_orthogonal = v0 - v0_parallel
+    return v0_parallel.to(dtype).to(device_type), v0_orthogonal.to(dtype).to(device_type)
+
+
+def apg_forward(
+    pred_cond: torch.Tensor,  # [B, C, T]
+    pred_uncond: torch.Tensor,  # [B, C, T]
+    guidance_scale: float,
+    momentum_buffer: MomentumBuffer = None,
+    eta: float = 0.0,
+    norm_threshold: float = 2.5,
+    dims=[-1],
+):
+    diff = pred_cond - pred_uncond
+    if momentum_buffer is not None:
+        momentum_buffer.update(diff)
+        diff = momentum_buffer.running_average
+
+    if norm_threshold > 0:
+        ones = torch.ones_like(diff)
+        diff_norm = diff.norm(p=2, dim=dims, keepdim=True)
+        scale_factor = torch.minimum(ones, norm_threshold / diff_norm)
+        diff = diff * scale_factor
+
+    diff_parallel, diff_orthogonal = project(diff, pred_cond, dims)
+    normalized_update = diff_orthogonal + eta * diff_parallel
+    pred_guided = pred_cond + (guidance_scale - 1) * normalized_update
+    return pred_guided
+
+
+def cfg_forward(cond_output, uncond_output, cfg_strength):
+    return uncond_output + cfg_strength * (cond_output - uncond_output)
+
+
+def call_cos_tensor(tensor1, tensor2):
+    """
+    Calculate cosine similarity between two normalized tensors.
+
+    Args:
+        tensor1: First tensor [B, ...]
+        tensor2: Second tensor [B, ...]
+
+    Returns:
+        Cosine similarity value [B, 1]
+    """
+    tensor1 = tensor1 / torch.linalg.norm(tensor1, dim=1, keepdim=True)
+    tensor2 = tensor2 / torch.linalg.norm(tensor2, dim=1, keepdim=True)
+    cosvalue = torch.sum(tensor1 * tensor2, dim=1, keepdim=True)
+    return cosvalue
+
+
+def compute_perpendicular_component(latent_diff, latent_hat_uncond):
+    """
+    Decompose latent_diff into parallel and perpendicular components relative to latent_hat_uncond.
+
+    Args:
+        latent_diff: Difference tensor [B, C, ...]
+        latent_hat_uncond: Unconditional prediction tensor [B, C, ...]
+
+    Returns:
+        projection: Parallel component
+        perpendicular_component: Perpendicular component
+    """
+    n, t, c = latent_diff.shape
+    latent_diff = latent_diff.view(n * t, c).float()
+    latent_hat_uncond = latent_hat_uncond.view(n * t, c).float()
+
+    if latent_diff.size() != latent_hat_uncond.size():
+        raise ValueError("latent_diff and latent_hat_uncond must have the same shape [n, d].")
+
+    dot_product = torch.sum(latent_diff * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
+    norm_square = torch.sum(latent_hat_uncond * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
+    projection = (dot_product / (norm_square + 1e-8)) * latent_hat_uncond
+    perpendicular_component = latent_diff - projection
+
+    return projection.view(n, t, c), perpendicular_component.reshape(n, t, c)
+
+
+def adg_forward(
+    latents: torch.Tensor,
+    noise_pred_cond: torch.Tensor,
+    noise_pred_uncond: torch.Tensor,
+    sigma: torch.Tensor,
+    guidance_scale: float,
+    angle_clip: float = 3.14 / 6,  # pi/6 by default
+    apply_norm: bool = False,
+    apply_clip: bool = True,
+):
+    """
+    ADG (Angle-based Dynamic Guidance) forward pass for Flow Matching.
+
+    In flow matching (including SD3), sigma represents the current timestep t_curr.
+    The predictions are velocity fields v(x_t, t).
+
+    Args:
+        latents: Current state x_t [N, T, d] where d=64
+        noise_pred_cond: Conditional velocity prediction v_cond [N, T, d]
+        noise_pred_uncond: Unconditional velocity prediction v_uncond [N, T, d]
+        sigma: Current timestep t_curr (not t_prev!)
+        guidance_scale: Guidance strength
+        angle_clip: Maximum angle for clipping (default: pi/6)
+        apply_norm: Whether to normalize the result (ADG_w_norm variant)
+        apply_clip: Whether to clip the angle (ADG_wo_clip when False)
+
+    Returns:
+        Guided velocity prediction [N, T, d]
+    """
+    # Get batch size
+    n = noise_pred_cond.shape[0]
+    noise_pred_text = noise_pred_cond
+    n, t, c = noise_pred_text.shape
+
+    # Ensure sigma/t has the right shape for broadcasting [N, 1, 1]
+    if isinstance(sigma, (int, float)):
+        sigma = torch.tensor(sigma, device=latents.device, dtype=latents.dtype)
+        sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
+    elif torch.is_tensor(sigma):
+        if sigma.numel() == 1:
+            sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
+        elif sigma.numel() == n:
+            sigma = sigma.view(n, 1, 1)
+        else:
+            raise ValueError(f"sigma has incompatible shape. Expected scalar or size {n}, got {sigma.shape}")
+    else:
+        raise TypeError(f"sigma must be a number or tensor, got {type(sigma)}")
+
+    # Adjust guidance weight
+    weight = guidance_scale - 1
+    weight = weight * (weight > 0) + 1e-3
+
+    latent_hat_text = latents - sigma * noise_pred_text
+    latent_hat_uncond = latents - sigma * noise_pred_uncond
+    latent_diff = latent_hat_text - latent_hat_uncond
+
+    # Calculate angle between conditional and unconditional predicted data
+    latent_theta = torch.acos(
+        call_cos_tensor(latent_hat_text.view(-1, c).to(float),
+                        latent_hat_uncond.reshape(-1, c).contiguous().to(float)))
+    latent_theta_new = torch.clip(weight * latent_theta, -angle_clip, angle_clip) if apply_clip else weight * latent_theta
+    proj, perp = compute_perpendicular_component(latent_diff, latent_hat_uncond)
+    latent_v_new = torch.cos(latent_theta_new) * latent_hat_text
+
+    latent_p_new = perp * torch.sin(latent_theta_new) / torch.sin(latent_theta) * (
+        torch.sin(latent_theta) > 1e-3) + perp * weight * (torch.sin(latent_theta) <= 1e-3)
+    latent_new = latent_v_new + latent_p_new
+    if apply_norm:
+        latent_new = latent_new * torch.linalg.norm(latent_hat_text, dim=1, keepdim=True) / torch.linalg.norm(
+            latent_new, dim=1, keepdim=True)
+
+    noise_pred = (latents - latent_new) / sigma
+    noise_pred = noise_pred.reshape(n, t, c).to(latents.dtype)
+    return noise_pred
+
+
+def adg_w_norm_forward(
+    latents: torch.Tensor,
+    noise_pred_cond: torch.Tensor,
+    noise_pred_uncond: torch.Tensor,
+    sigma: float,
+    guidance_scale: float,
+    angle_clip: float = 3.14 / 3,
+):
+    """
+    ADG with normalization - preserves the magnitude of latent predictions.
+
+    This variant normalizes the final latent to maintain the same norm as the
+    conditional prediction, which can help preserve image quality.
+    """
+    return adg_forward(latents,
+                       noise_pred_cond,
+                       noise_pred_uncond,
+                       sigma,
+                       guidance_scale,
+                       angle_clip=angle_clip,
+                       apply_norm=True,
+                       apply_clip=True)
+
+
+def adg_wo_clip_forward(
+    latents: torch.Tensor,
+    noise_pred_cond: torch.Tensor,
+    noise_pred_uncond: torch.Tensor,
+    sigma: float,
+    guidance_scale: float,
+):
+    """
+    ADG without angle clipping - allows unbounded angle adjustments.
+
+    This variant doesn't clip the angle, which may result in more aggressive
+    guidance but could be less stable.
+    """
+    return adg_forward(latents, noise_pred_cond, noise_pred_uncond, sigma, guidance_scale, apply_norm=False, apply_clip=False)
models/ace-step/acestep-v15-sft/config.json ADDED
@@ -0,0 +1,81 @@
+{
+  "architectures": [
+    "AceStepConditionGenerationModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_acestep_v15.AceStepConfig",
+    "AutoModel": "modeling_acestep_v15_base.AceStepConditionGenerationModel"
+  },
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "audio_acoustic_hidden_dim": 64,
+  "data_proportion": 0.5,
+  "dtype": "bfloat16",
+  "fsq_dim": 2048,
+  "fsq_input_levels": [
+    8,
+    8,
+    8,
+    5,
+    5,
+    5
+  ],
+  "fsq_input_num_quantizers": 1,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "in_channels": 192,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "layer_types": [
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "model_type": "acestep",
+  "num_attention_heads": 16,
+  "num_attention_pooler_hidden_layers": 2,
+  "num_audio_decoder_hidden_layers": 24,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 8,
+  "num_lyric_encoder_hidden_layers": 8,
+  "num_timbre_encoder_hidden_layers": 4,
+  "patch_size": 2,
+  "pool_window_size": 5,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": 128,
+  "text_hidden_dim": 1024,
+  "timbre_fix_frame": 750,
+  "timbre_hidden_dim": 64,
+  "timestep_mu": -0.4,
+  "timestep_sigma": 1.0,
+  "transformers_version": "4.57.0.dev0",
+  "use_cache": true,
+  "use_sliding_window": true,
+  "vocab_size": 64003,
+  "is_turbo": false
+}
models/ace-step/acestep-v15-sft/configuration_acestep_v15.py ADDED
@@ -0,0 +1,263 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""AceStep model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class AceStepConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`AceStepModel`]. It is used to instantiate an
+    AceStep model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 64003):
+            Vocabulary size of the AceStep model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling the model.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 6144):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to the value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        use_sliding_window (`bool`, *optional*, defaults to `True`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*, defaults to 128):
+            Sliding window attention (SWA) window size. If not specified, will default to `128`.
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
116
+
117
+ ```python
118
+ >>> from acestep.models import AceStepConfig
119
+
120
+ >>> # Initializing an AceStep configuration
121
+ >>> configuration = AceStepConfig()
122
+
123
+ >>> # Initializing a model from the configuration
124
+ >>> model = AceStepConditionGenerationModel(configuration)
125
+
126
+ >>> # Accessing the model configuration
127
+ >>> configuration = model.config
128
+ ```"""
129
+
130
+ model_type = "acestep"
131
+ keys_to_ignore_at_inference = ["past_key_values"]
132
+
133
+ # Default tensor parallel plan for the base model
134
+ base_model_tp_plan = {
135
+ "layers.*.self_attn.q_proj": "colwise",
136
+ "layers.*.self_attn.k_proj": "colwise",
137
+ "layers.*.self_attn.v_proj": "colwise",
138
+ "layers.*.self_attn.o_proj": "rowwise",
139
+ "layers.*.mlp.gate_proj": "colwise",
140
+ "layers.*.mlp.up_proj": "colwise",
141
+ "layers.*.mlp.down_proj": "rowwise",
142
+ }
143
+ base_model_pp_plan = {
144
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
145
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
146
+ "norm": (["hidden_states"], ["hidden_states"]),
147
+ }
148
+ def __init__(
149
+ self,
150
+ vocab_size=64003,
151
+ fsq_dim=2048,
152
+ fsq_input_levels=[8, 8, 8, 5, 5, 5],
153
+ fsq_input_num_quantizers=1,
154
+ hidden_size=2048,
155
+ intermediate_size=6144,
156
+ num_hidden_layers=24,
157
+ num_attention_heads=16,
158
+ num_key_value_heads=8,
159
+ head_dim=128,
160
+ hidden_act="silu",
161
+ max_position_embeddings=32768,
162
+ initializer_range=0.02,
163
+ rms_norm_eps=1e-6,
164
+ use_cache=True,
165
+ tie_word_embeddings=True,
166
+ rope_theta=1000000,
167
+ rope_scaling=None,
168
+ attention_bias=False,
169
+ use_sliding_window=True,
170
+ sliding_window=128,
171
+ layer_types=None,
172
+ attention_dropout=0.0,
173
+ num_lyric_encoder_hidden_layers=8,
174
+ audio_acoustic_hidden_dim=64,
175
+ pool_window_size=5,
176
+ text_hidden_dim=1024,
177
+ in_channels=192,
178
+ data_proportion=0.5,
179
+ timestep_mu=-0.4,
180
+ timestep_sigma=1.0,
181
+ timbre_hidden_dim=64,
182
+ num_timbre_encoder_hidden_layers=4,
183
+ timbre_fix_frame=750,
184
+ patch_size=2,
185
+ num_attention_pooler_hidden_layers=2,
186
+ num_audio_decoder_hidden_layers=24,
187
+ model_version="turbo",
188
+ **kwargs,
189
+ ):
190
+ self.max_position_embeddings = max_position_embeddings
191
+ self.hidden_size = hidden_size
192
+ self.intermediate_size = intermediate_size
193
+ self.num_hidden_layers = num_hidden_layers
194
+ self.num_attention_heads = num_attention_heads
195
+ self.use_sliding_window = use_sliding_window
196
+ self.sliding_window = sliding_window if self.use_sliding_window else None
197
+
198
+ # Text encoder configuration
199
+ self.text_hidden_dim = text_hidden_dim
200
+
201
+ # Lyric encoder configuration
202
+ self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
203
+ self.patch_size = patch_size
204
+
205
+ # Audio semantic token generation configuration
206
+ self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
207
+ self.pool_window_size = pool_window_size
208
+ self.in_channels = in_channels
209
+ self.data_proportion = data_proportion
210
+ self.timestep_mu = timestep_mu
211
+ self.timestep_sigma = timestep_sigma
212
+
213
+ # FSQ (Finite Scalar Quantization) configuration
214
+ self.fsq_dim = fsq_dim
215
+ self.fsq_input_levels = fsq_input_levels
216
+ self.fsq_input_num_quantizers = fsq_input_num_quantizers
217
+
218
+ # Timbre encoder configuration
219
+ self.timbre_hidden_dim = timbre_hidden_dim
220
+ self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
221
+ self.timbre_fix_frame = timbre_fix_frame
222
+ self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
223
+ self.num_audio_decoder_hidden_layers = num_audio_decoder_hidden_layers
224
+ self.vocab_size = vocab_size
225
+
226
+ # Backward compatibility: ensure num_key_value_heads is set
227
+ if num_key_value_heads is None:
228
+ num_key_value_heads = num_attention_heads
229
+
230
+ self.num_key_value_heads = num_key_value_heads
231
+ self.head_dim = head_dim
232
+ self.hidden_act = hidden_act
233
+ self.initializer_range = initializer_range
234
+ self.rms_norm_eps = rms_norm_eps
235
+ self.use_cache = use_cache
236
+ self.rope_theta = rope_theta
237
+ self.rope_scaling = rope_scaling
238
+ self.attention_bias = attention_bias
239
+ self.attention_dropout = attention_dropout
240
+ self.model_version = model_version
241
+
242
+ # Validate rotary position embeddings parameters
243
+ # Backward compatibility: if there is a 'type' field, move it to 'rope_type'
244
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
245
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
246
+ rope_config_validation(self)
247
+
248
+ self.layer_types = layer_types
249
+
250
+ # Set default layer types if not specified
251
+ if self.layer_types is None:
252
+ self.layer_types = [
253
+ "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers)
254
+ ]
255
+ layer_type_validation(self.layer_types)
256
+
257
+ super().__init__(
258
+ tie_word_embeddings=tie_word_embeddings,
259
+ **kwargs,
260
+ )
261
+
262
+
263
+ __all__ = ["AceStepConfig"]
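
For reference, a minimal sketch of instantiating this configuration and inspecting its derived defaults. The direct-module import is an assumption about how the file sits on `sys.path`; the packaged import path from the docstring (`acestep.models`) would work the same way.

```python
# Sketch: exercising AceStepConfig's derived defaults (assumes
# configuration_acestep_v15.py is importable from the working directory).
from configuration_acestep_v15 import AceStepConfig

config = AceStepConfig()

# With layer_types=None, __init__ builds an alternating pattern starting
# with sliding-window attention.
print(config.layer_types[:4])
# ['sliding_attention', 'full_attention', 'sliding_attention', 'full_attention']

# sliding_window stays at 128 only because use_sliding_window=True;
# passing use_sliding_window=False resets it to None.
print(config.sliding_window)  # 128
```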
models/ace-step/acestep-v15-sft/modeling_acestep_v15_base.py ADDED
The diff for this file is too large to render. See raw diff
 
models/ace-step/acestep-v15-turbo/config.json ADDED
@@ -0,0 +1,82 @@
+ {
+   "architectures": [
+     "AceStepConditionGenerationModel"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "audio_acoustic_hidden_dim": 64,
+   "auto_map": {
+     "AutoConfig": "configuration_acestep_v15.AceStepConfig",
+     "AutoModel": "modeling_acestep_v15_turbo.AceStepConditionGenerationModel"
+   },
+   "data_proportion": 0.5,
+   "dtype": "bfloat16",
+   "fsq_dim": 2048,
+   "fsq_input_levels": [
+     8,
+     8,
+     8,
+     5,
+     5,
+     5
+   ],
+   "fsq_input_num_quantizers": 1,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "in_channels": 192,
+   "initializer_range": 0.02,
+   "intermediate_size": 6144,
+   "is_turbo": true,
+   "layer_types": [
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 32768,
+   "model_type": "acestep",
+   "model_version": "turbo",
+   "num_attention_heads": 16,
+   "num_attention_pooler_hidden_layers": 2,
+   "num_audio_decoder_hidden_layers": 24,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 8,
+   "num_lyric_encoder_hidden_layers": 8,
+   "num_timbre_encoder_hidden_layers": 4,
+   "patch_size": 2,
+   "pool_window_size": 5,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000,
+   "sliding_window": 128,
+   "text_hidden_dim": 1024,
+   "timbre_fix_frame": 750,
+   "timbre_hidden_dim": 64,
+   "timestep_mu": -0.4,
+   "timestep_sigma": 1.0,
+   "transformers_version": "4.57.0.dev0",
+   "use_cache": true,
+   "use_sliding_window": true,
+   "vocab_size": 64003
+ }
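
Because `auto_map` above points `AutoConfig` and `AutoModel` at the bundled `configuration_acestep_v15.py` and `modeling_acestep_v15_turbo.py`, the checkpoint should load through the Transformers Auto classes with `trust_remote_code=True`. A minimal sketch; the repo-relative path is illustrative:

```python
# Sketch: loading the turbo checkpoint via the auto_map declared above.
# Substitute the actual local or Hub path for the illustrative one here.
from transformers import AutoConfig, AutoModel

path = "models/ace-step/acestep-v15-turbo"
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(path, trust_remote_code=True)

assert config.model_type == "acestep"
assert len(config.layer_types) == config.num_hidden_layers  # 24
```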
models/ace-step/acestep-v15-turbo/configuration_acestep_v15.py ADDED
@@ -0,0 +1,263 @@
+ # coding=utf-8
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """AceStep model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+ from transformers.modeling_rope_utils import rope_config_validation
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class AceStepConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of an [`AceStepModel`]. It is used to instantiate an
+     AceStep model according to the specified arguments, defining the model architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 64003):
+             Vocabulary size of the AceStep model. Defines the number of different tokens that can be represented by
+             the `input_ids` passed when calling the model.
+         hidden_size (`int`, *optional*, defaults to 2048):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 6144):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 24):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_key_value_heads (`int`, *optional*, defaults to 8):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+             `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
+             constructed by mean-pooling all the original heads within that group. For more details, check out [this
+             paper](https://huggingface.co/papers/2305.13245). If it is not specified, it will default to
+             `num_attention_heads`.
+         head_dim (`int`, *optional*, defaults to 128):
+             The attention head dimension.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 32768):
+             The maximum sequence length that this model might ever be used with.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+             Whether the model's input and output word embeddings should be tied.
+         rope_theta (`float`, *optional*, defaults to 1000000):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope
+             type and you expect the model to work on a longer `max_position_embeddings`, we recommend you update
+             this value accordingly.
+             Expected contents:
+                 `rope_type` (`str`):
+                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn',
+                     'longrope', 'llama3'], with 'default' being the original RoPE implementation.
+                 `factor` (`float`, *optional*):
+                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
+                     In most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                     original maximum pre-trained length.
+                 `original_max_position_embeddings` (`int`, *optional*):
+                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                     pretraining.
+                 `attention_factor` (`float`, *optional*):
+                     Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention computation.
+                     If unspecified, it defaults to the value recommended by the implementation, using the `factor`
+                     field to infer the suggested value.
+                 `beta_fast` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 32.
+                 `beta_slow` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 1.
+                 `short_factor` (`list[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the
+                     hidden size divided by the number of attention heads divided by 2.
+                 `long_factor` (`list[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the
+                     hidden size divided by the number of attention heads divided by 2.
+                 `low_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
+                 `high_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in the query, key, value and output projection layers during self-attention.
+         use_sliding_window (`bool`, *optional*, defaults to `True`):
+             Whether to use sliding window attention.
+         sliding_window (`int`, *optional*, defaults to 128):
+             Sliding window attention (SWA) window size. Set to `None` when `use_sliding_window=False`.
+         layer_types (`list`, *optional*):
+             Attention pattern for each layer. If not specified, layers alternate between sliding-window and full
+             attention, starting with sliding-window.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+
+     ```python
+     >>> from acestep.models import AceStepConfig
+
+     >>> # Initializing an AceStep configuration
+     >>> configuration = AceStepConfig()
+
+     >>> # Initializing a model from the configuration
+     >>> model = AceStepConditionGenerationModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "acestep"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     # Default tensor parallel plan for the base model
+     base_model_tp_plan = {
+         "layers.*.self_attn.q_proj": "colwise",
+         "layers.*.self_attn.k_proj": "colwise",
+         "layers.*.self_attn.v_proj": "colwise",
+         "layers.*.self_attn.o_proj": "rowwise",
+         "layers.*.mlp.gate_proj": "colwise",
+         "layers.*.mlp.up_proj": "colwise",
+         "layers.*.mlp.down_proj": "rowwise",
+     }
+     base_model_pp_plan = {
+         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+         "norm": (["hidden_states"], ["hidden_states"]),
+     }
+
+     def __init__(
+         self,
+         vocab_size=64003,
+         fsq_dim=2048,
+         fsq_input_levels=[8, 8, 8, 5, 5, 5],
+         fsq_input_num_quantizers=1,
+         hidden_size=2048,
+         intermediate_size=6144,
+         num_hidden_layers=24,
+         num_attention_heads=16,
+         num_key_value_heads=8,
+         head_dim=128,
+         hidden_act="silu",
+         max_position_embeddings=32768,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         tie_word_embeddings=True,
+         rope_theta=1000000,
+         rope_scaling=None,
+         attention_bias=False,
+         use_sliding_window=True,
+         sliding_window=128,
+         layer_types=None,
+         attention_dropout=0.0,
+         num_lyric_encoder_hidden_layers=8,
+         audio_acoustic_hidden_dim=64,
+         pool_window_size=5,
+         text_hidden_dim=1024,
+         in_channels=192,
+         data_proportion=0.5,
+         timestep_mu=-0.4,
+         timestep_sigma=1.0,
+         timbre_hidden_dim=64,
+         num_timbre_encoder_hidden_layers=4,
+         timbre_fix_frame=750,
+         patch_size=2,
+         num_attention_pooler_hidden_layers=2,
+         num_audio_decoder_hidden_layers=24,
+         model_version="turbo",
+         **kwargs,
+     ):
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.use_sliding_window = use_sliding_window
+         self.sliding_window = sliding_window if self.use_sliding_window else None
+
+         # Text encoder configuration
+         self.text_hidden_dim = text_hidden_dim
+
+         # Lyric encoder configuration
+         self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
+         self.patch_size = patch_size
+
+         # Audio semantic token generation configuration
+         self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
+         self.pool_window_size = pool_window_size
+         self.in_channels = in_channels
+         self.data_proportion = data_proportion
+         self.timestep_mu = timestep_mu
+         self.timestep_sigma = timestep_sigma
+
+         # FSQ (Finite Scalar Quantization) configuration
+         self.fsq_dim = fsq_dim
+         self.fsq_input_levels = fsq_input_levels
+         self.fsq_input_num_quantizers = fsq_input_num_quantizers
+
+         # Timbre encoder configuration
+         self.timbre_hidden_dim = timbre_hidden_dim
+         self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
+         self.timbre_fix_frame = timbre_fix_frame
+         self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
+         self.num_audio_decoder_hidden_layers = num_audio_decoder_hidden_layers
+         self.vocab_size = vocab_size
+
+         # Backward compatibility: ensure num_key_value_heads is set
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.head_dim = head_dim
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+         self.model_version = model_version
+
+         # Validate rotary position embedding parameters
+         # Backward compatibility: if there is a 'type' field, move it to 'rope_type'
+         if self.rope_scaling is not None and "type" in self.rope_scaling:
+             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+         rope_config_validation(self)
+
+         self.layer_types = layer_types
+
+         # Set default layer types if not specified: alternate sliding-window and
+         # full attention, starting with sliding-window.
+         if self.layer_types is None:
+             self.layer_types = [
+                 "sliding_attention" if (i + 1) % 2 else "full_attention" for i in range(self.num_hidden_layers)
+             ]
+         layer_type_validation(self.layer_types)
+
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+
+ __all__ = ["AceStepConfig"]
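
One arithmetic detail worth noting about both copies of this configuration: the product of `fsq_input_levels` lands three ids short of `vocab_size`, which suggests (though neither file states it) that the token vocabulary is the FSQ codebook plus three extra, presumably special, token ids:

```python
# Sketch: relating the FSQ levels to vocab_size. This is an inference from
# the numbers in the config, not something the source files document.
import math

fsq_input_levels = [8, 8, 8, 5, 5, 5]
codebook_size = math.prod(fsq_input_levels)

print(codebook_size)          # 64000
print(64003 - codebook_size)  # 3 remaining ids
```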
models/ace-step/acestep-v15-turbo/modeling_acestep_v15_turbo.py ADDED
The diff for this file is too large to render. See raw diff
 
models/ace-step/vae/config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "_class_name": "AutoencoderOobleck",
+   "_diffusers_version": "0.34.0",
+   "_name_or_path": "/root/data/repo/gongjunmin/ACE-Step-1.5/checkpoints/vae/",
+   "audio_channels": 2,
+   "channel_multiples": [
+     1,
+     2,
+     4,
+     8,
+     16
+   ],
+   "decoder_channels": 128,
+   "decoder_input_channels": 64,
+   "downsampling_ratios": [
+     2,
+     4,
+     4,
+     6,
+     10
+   ],
+   "encoder_hidden_size": 128,
+   "sampling_rate": 48000
+ }
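
The VAE's overall temporal compression follows from `downsampling_ratios`: 2 * 4 * 4 * 6 * 10 = 1920 samples per latent frame, i.e. 25 latent frames per second at the 48 kHz `sampling_rate`. A minimal sketch, assuming a local copy of the checkpoint at the repo-relative path:

```python
# Sketch: deriving the downsampling factor and latent frame rate from the
# config above. AutoencoderOobleck is the diffusers class named in
# _class_name; the path is illustrative. Config-only load, no weights needed.
import math
from diffusers import AutoencoderOobleck

cfg = AutoencoderOobleck.load_config("models/ace-step/vae")

hop = math.prod(cfg["downsampling_ratios"])  # 2 * 4 * 4 * 6 * 10
print(hop)                                   # 1920
print(cfg["sampling_rate"] / hop)            # 25.0 latent frames per second
```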
models/dettaglio-restyle/thumbnails/abstract_expressionism.webp ADDED
models/dettaglio-restyle/thumbnails/academia.webp ADDED
models/dettaglio-restyle/thumbnails/action_figure.webp ADDED
models/dettaglio-restyle/thumbnails/adorable_3d_character.webp ADDED
models/dettaglio-restyle/thumbnails/adorable_kawaii.webp ADDED
models/dettaglio-restyle/thumbnails/ads-advertising.webp ADDED
models/dettaglio-restyle/thumbnails/ads-automotive.webp ADDED
models/dettaglio-restyle/thumbnails/ads-corporate.webp ADDED
models/dettaglio-restyle/thumbnails/ads-fashion_editorial.webp ADDED
models/dettaglio-restyle/thumbnails/ads-food_photography.webp ADDED
models/dettaglio-restyle/thumbnails/ads-gourmet_food_photography.webp ADDED
models/dettaglio-restyle/thumbnails/ads-luxury.webp ADDED
models/dettaglio-restyle/thumbnails/ads-luxury.webp.webp ADDED
models/dettaglio-restyle/thumbnails/ads-retail.webp ADDED
models/dettaglio-restyle/thumbnails/art_deco.webp ADDED
models/dettaglio-restyle/thumbnails/art_nouveau.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-abstract.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-abstract_expressionism.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-art_deco.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-art_nouveau.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-constructivist.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-cubist.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-expressionist.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-graffiti.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-hyperrealism.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-impressionist.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-pointillism.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-pop_art.webp ADDED
models/dettaglio-restyle/thumbnails/artstyle-psychedelic.webp ADDED