MrMattV committed
Commit c90ac06 · verified · 1 parent: 0c60560

Clear root for models subfolder reorganization

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +0 -74
  2. ace-step/.gitattributes +0 -38
  3. ace-step/Qwen3-Embedding-0.6B/added_tokens.json +0 -28
  4. ace-step/Qwen3-Embedding-0.6B/chat_template.jinja +0 -85
  5. ace-step/Qwen3-Embedding-0.6B/config.json +0 -60
  6. ace-step/Qwen3-Embedding-0.6B/merges.txt +0 -0
  7. ace-step/Qwen3-Embedding-0.6B/model.safetensors +0 -3
  8. ace-step/Qwen3-Embedding-0.6B/special_tokens_map.json +0 -31
  9. ace-step/Qwen3-Embedding-0.6B/tokenizer.json +0 -3
  10. ace-step/Qwen3-Embedding-0.6B/tokenizer_config.json +0 -239
  11. ace-step/Qwen3-Embedding-0.6B/vocab.json +0 -0
  12. ace-step/README.md +0 -99
  13. ace-step/acestep-5Hz-lm-1.7B/added_tokens.json +0 -0
  14. ace-step/acestep-5Hz-lm-1.7B/chat_template.jinja +0 -89
  15. ace-step/acestep-5Hz-lm-1.7B/config.json +0 -61
  16. ace-step/acestep-5Hz-lm-1.7B/merges.txt +0 -0
  17. ace-step/acestep-5Hz-lm-1.7B/model.safetensors +0 -3
  18. ace-step/acestep-5Hz-lm-1.7B/special_tokens_map.json +0 -0
  19. ace-step/acestep-5Hz-lm-1.7B/tokenizer.json +0 -3
  20. ace-step/acestep-5Hz-lm-1.7B/tokenizer_config.json +0 -3
  21. ace-step/acestep-5Hz-lm-1.7B/vocab.json +0 -0
  22. ace-step/acestep-5Hz-lm-4B/Unconfirmed 786712.crdownload +0 -3
  23. ace-step/acestep-5Hz-lm-4B/added_tokens.json +0 -0
  24. ace-step/acestep-5Hz-lm-4B/config.json +0 -69
  25. ace-step/acestep-5Hz-lm-4B/merges.txt +0 -0
  26. ace-step/acestep-5Hz-lm-4B/model.safetensors.index.json +0 -405
  27. ace-step/acestep-5Hz-lm-4B/special_tokens_map.json +0 -0
  28. ace-step/acestep-5Hz-lm-4B/tokenizer.json +0 -3
  29. ace-step/acestep-5Hz-lm-4B/tokenizer_config.json +0 -3
  30. ace-step/acestep-5Hz-lm-4B/vocab.json +0 -0
  31. ace-step/acestep-v15-base/apg_guidance.py +0 -220
  32. ace-step/acestep-v15-base/config.json +0 -81
  33. ace-step/acestep-v15-base/configuration_acestep_v15.py +0 -263
  34. ace-step/acestep-v15-base/modeling_acestep_v15_base.py +0 -0
  35. ace-step/acestep-v15-base/silence_latent.pt +0 -3
  36. ace-step/acestep-v15-sft/apg_guidance.py +0 -220
  37. ace-step/acestep-v15-sft/config.json +0 -81
  38. ace-step/acestep-v15-sft/configuration_acestep_v15.py +0 -263
  39. ace-step/acestep-v15-sft/modeling_acestep_v15_base.py +0 -0
  40. ace-step/acestep-v15-sft/silence_latent.pt +0 -3
  41. ace-step/acestep-v15-turbo/config.json +0 -82
  42. ace-step/acestep-v15-turbo/configuration_acestep_v15.py +0 -263
  43. ace-step/acestep-v15-turbo/modeling_acestep_v15_turbo.py +0 -0
  44. ace-step/acestep-v15-turbo/silence_latent.pt +0 -3
  45. ace-step/config.json +0 -82
  46. ace-step/vae/config.json +0 -24
  47. ace-step/vae/diffusion_pytorch_model.safetensors +0 -3
  48. depth/dpt-large/.no_exist/bc15f29aa3a80d532f2ed650b5e16ac48d8958f9/processor_config.json +0 -0
  49. depth/dpt-large/refs/main +0 -1
  50. depth/dpt-large/snapshots/bc15f29aa3a80d532f2ed650b5e16ac48d8958f9/config.json +0 -47
.gitattributes DELETED
@@ -1,74 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- llm/mmproj-F16.gguf filter=lfs diff=lfs merge=lfs -text
- voice-presets/anna.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/bertrand.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/cate.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/coralie.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/corrado.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/daniela.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/denzel.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/estelle.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/fabio.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/gerald.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/marion.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/mel.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/rita.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/roberto.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/ruggero.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/stefania.wav filter=lfs diff=lfs merge=lfs -text
- ace-step/acestep-5Hz-lm-1.7B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- ace-step/acestep-5Hz-lm-1.7B/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
- ace-step/acestep-5Hz-lm-4B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- ace-step/acestep-5Hz-lm-4B/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
- ace-step/Qwen3-Embedding-0.6B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- llm/Qwen3.5-4B-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
- flux2-klein/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- llm/Llama-3.2-3B-Instruct-uncensored-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
- ace-step/acestep-5Hz-lm-4B/Unconfirmed[[:space:]]786712.crdownload filter=lfs diff=lfs merge=lfs -text
- stylemaster/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/anna.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/bertrand.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/coralie.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/corrado.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/daniela.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/estelle.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/fabio.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/marion.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/rita.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/roberto.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/ruggero.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/stefania.mp3 filter=lfs diff=lfs merge=lfs -text
 
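Note: each rule above tells Git to route matching paths through the LFS filter instead of storing the blobs directly. A rough Python sketch of the matching logic (fnmatch only approximates git's wildmatch semantics, and the helper name is ours):

```python
from fnmatch import fnmatch
from pathlib import PurePosixPath

# A few rules copied from the deleted .gitattributes above.
LFS_PATTERNS = ["*.safetensors", "*.pt", "voice-presets/anna.wav"]

def routed_through_lfs(path: str) -> bool:
    """Approximate check: does any LFS rule capture this path?

    In gitattributes, a pattern without "/" is matched against the
    basename only; fnmatch is a simplification of git's wildmatch.
    """
    name = PurePosixPath(path).name
    return any(
        fnmatch(name if "/" not in pat else path, pat)
        for pat in LFS_PATTERNS
    )

print(routed_through_lfs("ace-step/vae/diffusion_pytorch_model.safetensors"))  # True
print(routed_through_lfs("ace-step/config.json"))                              # False
```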
ace-step/.gitattributes DELETED
@@ -1,38 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- acestep-5Hz-lm-1.7B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- acestep-5Hz-lm-1.7B/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
- Qwen3-Embedding-0.6B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
ace-step/Qwen3-Embedding-0.6B/added_tokens.json DELETED
@@ -1,28 +0,0 @@
- {
- "</think>": 151668,
- "</tool_call>": 151658,
- "</tool_response>": 151666,
- "<think>": 151667,
- "<tool_call>": 151657,
- "<tool_response>": 151665,
- "<|box_end|>": 151649,
- "<|box_start|>": 151648,
- "<|endoftext|>": 151643,
- "<|file_sep|>": 151664,
- "<|fim_middle|>": 151660,
- "<|fim_pad|>": 151662,
- "<|fim_prefix|>": 151659,
- "<|fim_suffix|>": 151661,
- "<|im_end|>": 151645,
- "<|im_start|>": 151644,
- "<|image_pad|>": 151655,
- "<|object_ref_end|>": 151647,
- "<|object_ref_start|>": 151646,
- "<|quad_end|>": 151651,
- "<|quad_start|>": 151650,
- "<|repo_name|>": 151663,
- "<|video_pad|>": 151656,
- "<|vision_end|>": 151653,
- "<|vision_pad|>": 151654,
- "<|vision_start|>": 151652
- }
 
ace-step/Qwen3-Embedding-0.6B/chat_template.jinja DELETED
@@ -1,85 +0,0 @@
- {%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0].role == 'system' %}
- {{- messages[0].content + '\n\n' }}
- {%- endif %}
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
- {%- else %}
- {%- if messages[0].role == 'system' %}
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
- {%- for message in messages[::-1] %}
- {%- set index = (messages|length - 1) - loop.index0 %}
- {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
- {%- set ns.multi_step_tool = false %}
- {%- set ns.last_query_index = index %}
- {%- endif %}
- {%- endfor %}
- {%- for message in messages %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {%- set content = message.content %}
- {%- set reasoning_content = '' %}
- {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
- {%- set reasoning_content = message.reasoning_content %}
- {%- else %}
- {%- if '</think>' in message.content %}
- {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
- {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
- {%- endif %}
- {%- endif %}
- {%- if loop.index0 > ns.last_query_index %}
- {%- if loop.last or (not loop.last and reasoning_content) %}
- {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
- {%- else %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- endif %}
- {%- else %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- endif %}
- {%- if message.tool_calls %}
- {%- for tool_call in message.tool_calls %}
- {%- if (loop.first and content) or (not loop.first) %}
- {{- '\n' }}
- {%- endif %}
- {%- if tool_call.function %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '<tool_call>\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {%- if tool_call.arguments is string %}
- {{- tool_call.arguments }}
- {%- else %}
- {{- tool_call.arguments | tojson }}
- {%- endif %}
- {{- '}\n</tool_call>' }}
- {%- endfor %}
- {%- endif %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n<tool_response>\n' }}
- {{- message.content }}
- {{- '\n</tool_response>' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- endfor %}
- {%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
- {%- if enable_thinking is defined and enable_thinking is false %}
- {{- '<think>\n\n</think>\n\n' }}
- {%- endif %}
- {%- endif %}
 
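Note: the deleted template above is the standard ChatML-style Qwen3 chat template that transformers renders via `tokenizer.apply_chat_template`. A minimal sketch, assuming a local checkout at the pre-reorganization path:

```python
from transformers import AutoTokenizer

# Illustrative path: the folder as it existed before this commit
# cleared the repository root.
tok = AutoTokenizer.from_pretrained("ace-step/Qwen3-Embedding-0.6B")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize this track in one line."},
]

# Renders the messages through the Jinja template above, wrapping each
# turn in <|im_start|>...<|im_end|> markers.
prompt = tok.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)  # ends with the "<|im_start|>assistant" generation header
```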
ace-step/Qwen3-Embedding-0.6B/config.json DELETED
@@ -1,60 +0,0 @@
- {
- "architectures": [
- "Qwen3Model"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": 151643,
- "dtype": "bfloat16",
- "eos_token_id": 151643,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 1024,
- "initializer_range": 0.02,
- "intermediate_size": 3072,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 32768,
- "max_window_layers": 28,
- "model_type": "qwen3",
- "num_attention_heads": 16,
- "num_hidden_layers": 28,
- "num_key_value_heads": 8,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 1000000,
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "4.57.0.dev0",
- "use_cache": true,
- "use_sliding_window": false,
- "vocab_size": 151669
- }
 
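Note: the config above fully pins down the geometry of the embedding model. A small sketch of how the fields compose (values read straight from the JSON; the path is illustrative):

```python
import json

with open("ace-step/Qwen3-Embedding-0.6B/config.json") as f:
    cfg = json.load(f)

# Grouped-query attention: 16 query heads share 8 key/value heads,
# each of width head_dim = 128.
q_width = cfg["num_attention_heads"] * cfg["head_dim"]   # 16 * 128 = 2048
kv_width = cfg["num_key_value_heads"] * cfg["head_dim"]  # 8 * 128 = 1024

# All 28 layers use full (non-sliding-window) attention.
assert set(cfg["layer_types"]) == {"full_attention"}
assert len(cfg["layer_types"]) == cfg["num_hidden_layers"] == 28
print(cfg["model_type"], q_width, kv_width)  # qwen3 2048 1024
```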
ace-step/Qwen3-Embedding-0.6B/merges.txt DELETED
The diff for this file is too large to render.
 
ace-step/Qwen3-Embedding-0.6B/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0437e45c94563b09e13cb7a64478fc406947a93cb34a7e05870fc8dcd48e23fd
- size 1191586416
 
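Note: the three-line stanza above is a Git LFS pointer, not the tensor data itself; it records only the object's SHA-256 and byte size. A minimal parser sketch for that key-value format (the function name is ours):

```python
def parse_lfs_pointer(text: str) -> dict[str, str]:
    """Split a Git LFS pointer stanza into its key/value fields."""
    return dict(line.split(" ", 1) for line in text.strip().splitlines())

pointer = """\
version https://git-lfs.github.com/spec/v1
oid sha256:0437e45c94563b09e13cb7a64478fc406947a93cb34a7e05870fc8dcd48e23fd
size 1191586416
"""
fields = parse_lfs_pointer(pointer)
print(fields["oid"])                    # sha256:0437e4...
print(int(fields["size"]) / 1e9, "GB")  # ~1.19 GB of weights behind this pointer
```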
ace-step/Qwen3-Embedding-0.6B/special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
- {
- "additional_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "eos_token": {
- "content": "<|im_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
- }
 
ace-step/Qwen3-Embedding-0.6B/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:def76fb086971c7867b829c23a26261e38d9d74e02139253b38aeb9df8b4b50a
- size 11423705
 
ace-step/Qwen3-Embedding-0.6B/tokenizer_config.json DELETED
@@ -1,239 +0,0 @@
- {
- "add_bos_token": false,
- "add_prefix_space": false,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|im_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151645": {
- "content": "<|im_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151646": {
- "content": "<|object_ref_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|object_ref_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151648": {
- "content": "<|box_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151649": {
- "content": "<|box_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "<tool_call>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "</tool_call>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151665": {
- "content": "<tool_response>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151666": {
- "content": "</tool_response>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151667": {
- "content": "<think>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151668": {
- "content": "</think>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "additional_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": {},
- "model_max_length": 131072,
- "pad_token": "<|endoftext|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
- }
 
ace-step/Qwen3-Embedding-0.6B/vocab.json DELETED
The diff for this file is too large to render.
 
ace-step/README.md DELETED
@@ -1,99 +0,0 @@
- ---
- library_name: transformers
- license: mit
- pipeline_tag: text-to-audio
- tags:
- - audio
- - music
- - text2music
- ---
-
- <h1 align="center">ACE-Step 1.5</h1>
- <h1 align="center">Pushing the Boundaries of Open-Source Music Generation</h1>
- <p align="center">
- <a href="https://ace-step.github.io/ace-step-v1.5.github.io/">Project</a> |
- <a href="https://huggingface.co/collections/ACE-Step/ace-step-15">Hugging Face</a> |
- <a href="https://modelscope.cn/models/ACE-Step/Ace-Step1.5">ModelScope</a> |
- <a href="https://huggingface.co/spaces/ACE-Step/Ace-Step-v1.5">Space Demo</a> |
- <a href="https://discord.gg/PeWDxrkdj7">Discord</a>
- <a href="https://arxiv.org/abs/2602.00744">Tech Report</a>
- </p>
-
-
- ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/b84r7t0viIw7rKSr_ja9_.png)
-
- ## Model Details
-
- 🚀 **ACE-Step v1.5** is a highly efficient open-source music foundation model designed to bring commercial-grade music generation to consumer hardware.
-
- ### Key Features
-
- * **💰 Commercial-Ready:** Unlike many models trained on ambiguous datasets, ACE-Step v1.5 is designed for creators. You can strictly use the generated music for **commercial purposes**.
- * **📚 Safe & Robust Training Data:** The model is trained on a massive, legally compliant dataset consisting of:
- * **Licensed Data:** Professionally licensed music tracks.
- * **Royalty-Free / No-Copyright Data:** A vast collection of public domain and royalty-free music.
- * **Synthetic Data:** High-quality audio generated via advanced MIDI-to-Audio conversion.
- * **⚡ Extreme Speed:** Generates a full song in under 2 seconds on an A100 and under 10 seconds on an RTX 3090.
- * **🖥️ Consumer Hardware Friendly:** Runs locally with less than 4GB of VRAM.
-
- ### Technical Capabilities
-
- 🌉 At its core lies a novel hybrid architecture where the Language Model (LM) functions as an omni-capable planner: it transforms simple user queries into comprehensive song blueprints—scaling from short loops to 10-minute compositions—while synthesizing metadata, lyrics, and captions via Chain-of-Thought to guide the Diffusion Transformer (DiT). ⚡ Uniquely, this alignment is achieved through intrinsic reinforcement learning relying solely on the model's internal mechanisms, thereby eliminating the biases inherent in external reward models or human preferences. 🎚️
-
- 🔮 Beyond standard synthesis, ACE-Step v1.5 unifies precise stylistic control with versatile editing capabilities—such as cover generation, repainting, and vocal-to-BGM conversion—while maintaining strict adherence to prompts across 50+ languages. This paves the way for powerful tools that seamlessly integrate into the creative workflows of music artists, producers, and content creators. 🎸
-
- - **Developed by:** [ACE-STEP]
- - **Model type:** [Text2Music]
- - **Language(s):** [50+ languages]
- - **License:** [MIT]
-
- ## Evaluation
-
- ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/n9aKi_NhSmlMOgmGzahZi.png)
-
- ## 🏗️ Architecture
-
-
- ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/V_d1rTdqkQyoSM8td7OWl.png)
-
-
- ## 🦁 Model Zoo
-
-
- ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/B49V0OTKse_FRefTmTPsQ.png)
-
- ### DiT Models
-
- | DiT Model | Pre-Training | SFT | RL | CFG | Step | Refer audio | Text2Music | Cover | Repaint | Extract | Lego | Complete | Quality | Diversity | Fine-Tunability | Hugging Face |
- |-----------|:------------:|:---:|:--:|:---:|:----:|:-----------:|:----------:|:-----:|:-------:|:-------:|:----:|:--------:|:-------:|:---------:|:---------------:|--------------|
- | `acestep-v15-base` | ✅ | ❌ | ❌ | ✅ | 50 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | High | Easy | [Link](https://huggingface.co/ACE-Step/acestep-v15-base) |
- | `acestep-v15-sft` | ✅ | ✅ | ❌ | ✅ | 50 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | High | Medium | Easy | [Link](https://huggingface.co/ACE-Step/acestep-v15-sft) |
- | `acestep-v15-turbo` | ✅ | ✅ | ❌ | ❌ | 8 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | Very High | Medium | Medium | [Link](https://huggingface.co/ACE-Step/Ace-Step1.5) |
- | `acestep-v15-turbo-rl` | ✅ | ✅ | ✅ | ❌ | 8 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | Very High | Medium | Medium | To be released |
-
- ### LM Models
-
- | LM Model | Pretrain from | Pre-Training | SFT | RL | CoT metas | Query rewrite | Audio Understanding | Composition Capability | Copy Melody | Hugging Face |
- |----------|---------------|:------------:|:---:|:--:|:---------:|:-------------:|:-------------------:|:----------------------:|:-----------:|--------------|
- | `acestep-5Hz-lm-0.6B` | Qwen3-0.6B | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | Medium | Weak | ✅ |
- | `acestep-5Hz-lm-1.7B` | Qwen3-1.7B | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | Medium | Medium | ✅ |
- | `acestep-5Hz-lm-4B` | Qwen3-4B | ✅ | ✅ | ✅ | ✅ | ✅ | Strong | Strong | Strong | ✅ |
-
-
- ## 🙏 Acknowledgements
-
- This project is co-led by ACE Studio and StepFun.
-
-
- ## 📖 Citation
-
- If you find this project useful for your research, please consider citing:
-
- ```BibTeX
- @misc{gong2026acestep,
- title={ACE-Step 1.5: Pushing the Boundaries of Open-Source Music Generation},
- author={Junmin Gong, Yulin Song, Wenxiao Zhao, Sen Wang, Shengyuan Xu, Jing Guo},
- howpublished={\url{https://github.com/ace-step/ACE-Step-1.5}},
- year={2026},
- note={GitHub repository}
- }
 
ace-step/acestep-5Hz-lm-1.7B/added_tokens.json DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-1.7B/chat_template.jinja DELETED
@@ -1,89 +0,0 @@
- {%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0].role == 'system' %}
- {{- messages[0].content + '\n\n' }}
- {%- endif %}
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
- {%- else %}
- {%- if messages[0].role == 'system' %}
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
- {%- for message in messages[::-1] %}
- {%- set index = (messages|length - 1) - loop.index0 %}
- {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
- {%- set ns.multi_step_tool = false %}
- {%- set ns.last_query_index = index %}
- {%- endif %}
- {%- endfor %}
- {%- for message in messages %}
- {%- if message.content is string %}
- {%- set content = message.content %}
- {%- else %}
- {%- set content = '' %}
- {%- endif %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
- {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {%- set reasoning_content = '' %}
- {%- if message.reasoning_content is string %}
- {%- set reasoning_content = message.reasoning_content %}
- {%- else %}
- {%- if '</think>' in content %}
- {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
- {%- set content = content.split('</think>')[-1].lstrip('\n') %}
- {%- endif %}
- {%- endif %}
- {%- if loop.index0 > ns.last_query_index %}
- {%- if loop.last or (not loop.last and reasoning_content) %}
- {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
- {%- else %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- endif %}
- {%- else %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- endif %}
- {%- if message.tool_calls %}
- {%- for tool_call in message.tool_calls %}
- {%- if (loop.first and content) or (not loop.first) %}
- {{- '\n' }}
- {%- endif %}
- {%- if tool_call.function %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '<tool_call>\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {%- if tool_call.arguments is string %}
- {{- tool_call.arguments }}
- {%- else %}
- {{- tool_call.arguments | tojson }}
- {%- endif %}
- {{- '}\n</tool_call>' }}
- {%- endfor %}
- {%- endif %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n<tool_response>\n' }}
- {{- content }}
- {{- '\n</tool_response>' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- endfor %}
- {%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
- {%- if enable_thinking is defined and enable_thinking is false %}
- {{- '<think>\n\n</think>\n\n' }}
- {%- endif %}
- {%- endif %}
 
ace-step/acestep-5Hz-lm-1.7B/config.json DELETED
@@ -1,61 +0,0 @@
- {
- "architectures": [
- "Qwen3Model"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": 151643,
- "dtype": "bfloat16",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2048,
- "initializer_range": 0.02,
- "intermediate_size": 6144,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 40960,
- "max_window_layers": 28,
- "model_type": "qwen3",
- "num_attention_heads": 16,
- "num_hidden_layers": 28,
- "num_key_value_heads": 8,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 1000000,
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "4.57.0.dev0",
- "use_cache": true,
- "use_sliding_window": false,
- "vocab_size": 217204
- }
 
ace-step/acestep-5Hz-lm-1.7B/merges.txt DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-1.7B/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f161689da73e5ecefa28ff780d51c2d92a00f056d021d7933c779ed5c6cd7db8
- size 3708521528
 
ace-step/acestep-5Hz-lm-1.7B/special_tokens_map.json DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-1.7B/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:35af56c3f5cb3ea2cc578aa28a8937770981d504f183ac5c8c38baf4bbd4af4d
- size 24321939
 
ace-step/acestep-5Hz-lm-1.7B/tokenizer_config.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6cd70cdd89425971794f5235562edcc608b0629a6c4686ae51a8b8c8b8ba5e95
- size 14072925
 
ace-step/acestep-5Hz-lm-1.7B/vocab.json DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-4B/Unconfirmed 786712.crdownload DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:75f193be8e6ec67e0cd154b6b8891af451f248458058ae6589c64cbdd78d8601
- size 3161911734
 
ace-step/acestep-5Hz-lm-4B/added_tokens.json DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-4B/config.json DELETED
@@ -1,69 +0,0 @@
- {
- "architectures": [
- "Qwen3ForCausalLM"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": 151643,
- "dtype": "bfloat16",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2560,
- "initializer_range": 0.02,
- "intermediate_size": 9728,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 40960,
- "max_window_layers": 36,
- "model_type": "qwen3",
- "num_attention_heads": 32,
- "num_hidden_layers": 36,
- "num_key_value_heads": 8,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 1000000,
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "4.57.1",
- "use_cache": true,
- "use_sliding_window": false,
- "vocab_size": 217204
- }
 
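Note: as a sanity check, the parameter count implied by this config matches both the model's name and the `total_size` recorded in the shard index below (8,379,108,352 bytes ≈ 4.19 B bfloat16 parameters). A back-of-envelope sketch that ignores the small norm weights:

```python
# Fields copied from the 4B config above (embeddings tied with the
# output head, so counted once).
hidden, inter, layers = 2560, 9728, 36
heads, kv_heads, head_dim = 32, 8, 128
vocab = 217204

attn = (
    hidden * heads * head_dim           # q_proj
    + 2 * hidden * kv_heads * head_dim  # k_proj + v_proj
    + heads * head_dim * hidden         # o_proj
)
mlp = 3 * hidden * inter                # gate_proj + up_proj + down_proj
embed = vocab * hidden

total = layers * (attn + mlp) + embed
print(f"{total / 1e9:.2f}B params")         # ~4.19B
print(f"{2 * total / 1e9:.2f} GB in bf16")  # ~8.38 GB, matching total_size
```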
ace-step/acestep-5Hz-lm-4B/merges.txt DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-4B/model.safetensors.index.json DELETED
@@ -1,405 +0,0 @@
- {
- "metadata": {
- "total_size": 8379108352
- },
- "weight_map": {
- "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
257
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
258
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
259
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
260
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
261
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
262
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
263
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
264
- "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
265
- "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
266
- "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
267
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
268
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
269
- "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
270
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
271
- "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
272
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
273
- "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
274
- "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
275
- "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
276
- "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
277
- "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
278
- "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
279
- "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
280
- "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
281
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
282
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
283
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
284
- "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
285
- "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
286
- "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
287
- "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
288
- "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
289
- "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
290
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
291
- "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
292
- "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
293
- "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
294
- "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
295
- "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
296
- "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
297
- "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
298
- "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
299
- "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
300
- "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
301
- "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
302
- "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
303
- "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
304
- "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
305
- "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
306
- "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
307
- "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
308
- "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
309
- "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
310
- "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
311
- "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
312
- "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
313
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
314
- "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
315
- "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
316
- "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
317
- "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
318
- "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
319
- "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
320
- "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
321
- "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
322
- "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
323
- "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
324
- "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
325
- "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
326
- "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
327
- "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
328
- "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
329
- "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
330
- "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
331
- "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
332
- "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
333
- "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
334
- "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
335
- "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
336
- "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
337
- "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
338
- "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
339
- "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
340
- "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
341
- "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
342
- "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
343
- "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
344
- "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
345
- "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
346
- "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
347
- "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
348
- "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
349
- "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
350
- "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
351
- "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
352
- "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
353
- "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
354
- "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
355
- "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
356
- "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
357
- "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
358
- "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
359
- "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
360
- "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
361
- "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
362
- "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
363
- "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
364
- "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
365
- "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
366
- "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
367
- "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
368
- "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
369
- "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
370
- "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
371
- "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
372
- "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
373
- "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
374
- "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
375
- "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
376
- "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
377
- "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
378
- "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
379
- "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
380
- "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
381
- "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
382
- "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
383
- "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
384
- "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
385
- "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
386
- "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
387
- "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
388
- "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
389
- "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
390
- "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
391
- "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
392
- "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
393
- "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
394
- "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
395
- "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
396
- "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
397
- "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
398
- "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
399
- "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
400
- "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
401
- "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
402
- "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
403
- "model.norm.weight": "model-00002-of-00002.safetensors"
404
- }
405
- }
 
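Note: the `weight_map` tail above shows the checkpoint split mid-layer — layer 19's attention weights sit in `model-00001-of-00002.safetensors` while its MLP weights begin `model-00002-of-00002.safetensors`, and layers 20-35 plus the final norm follow in shard 2. A minimal sketch of how a loader resolves a tensor through such an index; the local paths are an illustrative assumption, and `safetensors` is the standard reader, not repo-specific API:

```python
import json
from safetensors import safe_open

# Load the shard index deleted above (path assumes a local copy of the folder).
with open("acestep-5Hz-lm-4B/model.safetensors.index.json") as f:
    index = json.load(f)

# Layer 19 straddles the shard boundary: its MLP weights map to shard 2.
name = "model.layers.19.mlp.gate_proj.weight"
shard = index["weight_map"][name]  # -> "model-00002-of-00002.safetensors"

with safe_open(f"acestep-5Hz-lm-4B/{shard}", framework="pt") as shard_file:
    tensor = shard_file.get_tensor(name)
print(name, "->", shard, tuple(tensor.shape))
```
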
ace-step/acestep-5Hz-lm-4B/special_tokens_map.json DELETED
The diff for this file is too large to render. See raw diff
 
ace-step/acestep-5Hz-lm-4B/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:35af56c3f5cb3ea2cc578aa28a8937770981d504f183ac5c8c38baf4bbd4af4d
- size 24321939
 
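Note: for LFS-tracked files such as `tokenizer.json`, the diff records only the three-line Git LFS pointer (spec version, sha256 oid, byte size), not the roughly 24 MB payload it stands for. A small self-contained sketch of how such a pointer decomposes; the parser below is illustrative, not part of this repo:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Split a Git LFS pointer into its spec version, oid algorithm/digest, and size."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "algo": algo,
            "digest": digest, "size_bytes": int(fields["size"])}

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:35af56c3f5cb3ea2cc578aa28a8937770981d504f183ac5c8c38baf4bbd4af4d\n"
    "size 24321939\n"
)
print(parse_lfs_pointer(pointer))  # size_bytes == 24321939 (~24 MB blob)
```
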
ace-step/acestep-5Hz-lm-4B/tokenizer_config.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6cd70cdd89425971794f5235562edcc608b0629a6c4686ae51a8b8c8b8ba5e95
- size 14072925
 
ace-step/acestep-5Hz-lm-4B/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
ace-step/acestep-v15-base/apg_guidance.py DELETED
@@ -1,220 +0,0 @@
- import torch
- import torch.nn.functional as F
-
-
- class MomentumBuffer:
-
-     def __init__(self, momentum: float = -0.75):
-         self.momentum = momentum
-         self.running_average = 0
-
-     def update(self, update_value: torch.Tensor):
-         new_average = self.momentum * self.running_average
-         self.running_average = update_value + new_average
-
-
- def project(
-     v0: torch.Tensor,  # [B, C, T]
-     v1: torch.Tensor,  # [B, C, T]
-     dims=[-1],
- ):
-     dtype = v0.dtype
-     device_type = v0.device.type
-     if device_type == "mps":
-         v0, v1 = v0.cpu(), v1.cpu()
-
-     v0, v1 = v0.double(), v1.double()
-     v1 = torch.nn.functional.normalize(v1, dim=dims)
-     v0_parallel = (v0 * v1).sum(dim=dims, keepdim=True) * v1
-     v0_orthogonal = v0 - v0_parallel
-     return v0_parallel.to(dtype).to(device_type), v0_orthogonal.to(dtype).to(device_type)
-
-
- def apg_forward(
-     pred_cond: torch.Tensor,  # [B, C, T]
-     pred_uncond: torch.Tensor,  # [B, C, T]
-     guidance_scale: float,
-     momentum_buffer: MomentumBuffer = None,
-     eta: float = 0.0,
-     norm_threshold: float = 2.5,
-     dims=[-1],
- ):
-     diff = pred_cond - pred_uncond
-     if momentum_buffer is not None:
-         momentum_buffer.update(diff)
-         diff = momentum_buffer.running_average
-
-     if norm_threshold > 0:
-         ones = torch.ones_like(diff)
-         diff_norm = diff.norm(p=2, dim=dims, keepdim=True)
-         scale_factor = torch.minimum(ones, norm_threshold / diff_norm)
-         diff = diff * scale_factor
-
-     diff_parallel, diff_orthogonal = project(diff, pred_cond, dims)
-     normalized_update = diff_orthogonal + eta * diff_parallel
-     pred_guided = pred_cond + (guidance_scale - 1) * normalized_update
-     return pred_guided
-
-
- def cfg_forward(cond_output, uncond_output, cfg_strength):
-     return uncond_output + cfg_strength * (cond_output - uncond_output)
-
-
- def call_cos_tensor(tensor1, tensor2):
-     """
-     Calculate cosine similarity between two normalized tensors.
-
-     Args:
-         tensor1: First tensor [B, ...]
-         tensor2: Second tensor [B, ...]
-
-     Returns:
-         Cosine similarity value [B, 1]
-     """
-     tensor1 = tensor1 / torch.linalg.norm(tensor1, dim=1, keepdim=True)
-     tensor2 = tensor2 / torch.linalg.norm(tensor2, dim=1, keepdim=True)
-     cosvalue = torch.sum(tensor1 * tensor2, dim=1, keepdim=True)
-     return cosvalue
-
-
- def compute_perpendicular_component(latent_diff, latent_hat_uncond):
-     """
-     Decompose latent_diff into parallel and perpendicular components relative to latent_hat_uncond.
-
-     Args:
-         latent_diff: Difference tensor [B, C, ...]
-         latent_hat_uncond: Unconditional prediction tensor [B, C, ...]
-
-     Returns:
-         projection: Parallel component
-         perpendicular_component: Perpendicular component
-     """
-     n, t, c = latent_diff.shape
-     latent_diff = latent_diff.view(n * t, c).float()
-     latent_hat_uncond = latent_hat_uncond.view(n * t, c).float()
-
-     if latent_diff.size() != latent_hat_uncond.size():
-         raise ValueError("latent_diff and latent_hat_uncond must have the same shape [n, d].")
-
-     dot_product = torch.sum(latent_diff * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
-     norm_square = torch.sum(latent_hat_uncond * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
-     projection = (dot_product / (norm_square + 1e-8)) * latent_hat_uncond
-     perpendicular_component = latent_diff - projection
-
-     return projection.view(n, t, c), perpendicular_component.reshape(n, t, c)
-
-
- def adg_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: torch.Tensor,
-     guidance_scale: float,
-     angle_clip: float = 3.14 / 6,  # pi/6 by default
-     apply_norm: bool = False,
-     apply_clip: bool = True,
- ):
-     """
-     ADG (Angle-based Dynamic Guidance) forward pass for Flow Matching.
-
-     In flow matching (including SD3), sigma represents the current timestep t_curr.
-     The predictions are velocity fields v(x_t, t).
-
-     Args:
-         latents: Current state x_t [N, T, d] where d=64
-         noise_pred_cond: Conditional velocity prediction v_cond [N, T, d]
-         noise_pred_uncond: Unconditional velocity prediction v_uncond [N, T, d]
-         sigma: Current timestep t_curr (not t_prev!)
-         guidance_scale: Guidance strength
-         angle_clip: Maximum angle for clipping (default: pi/6)
-         apply_norm: Whether to normalize the result (ADG_w_norm variant)
-         apply_clip: Whether to clip the angle (ADG_wo_clip when False)
-
-     Returns:
-         Guided velocity prediction [N, T, d]
-     """
-     # Get batch size
-     n = noise_pred_cond.shape[0]
-     noise_pred_text = noise_pred_cond
-     n, t, c = noise_pred_text.shape
-
-     # Ensure sigma/t has the right shape for broadcasting [N, 1, 1]
-     if isinstance(sigma, (int, float)):
-         sigma = torch.tensor(sigma, device=latents.device, dtype=latents.dtype)
-         sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
-     elif torch.is_tensor(sigma):
-         if sigma.numel() == 1:
-             sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
-         elif sigma.numel() == n:
-             sigma = sigma.view(n, 1, 1)
-         else:
-             raise ValueError(f"sigma has incompatible shape. Expected scalar or size {n}, got {sigma.shape}")
-     else:
-         raise TypeError(f"sigma must be a number or tensor, got {type(sigma)}")
-
-     # Adjust guidance weight
-     weight = guidance_scale - 1
-     weight = weight * (weight > 0) + 1e-3
-
-     latent_hat_text = latents - sigma * noise_pred_text
-     latent_hat_uncond = latents - sigma * noise_pred_uncond
-     latent_diff = latent_hat_text - latent_hat_uncond
-
-     # Calculate angle between conditional and unconditional predicted data
-     latent_theta = torch.acos(
-         call_cos_tensor(latent_hat_text.view(-1, c).to(float),
-                         latent_hat_uncond.reshape(-1, c).contiguous().to(float)))
-     latent_theta_new = torch.clip(weight * latent_theta, -angle_clip, angle_clip) if apply_clip else weight * latent_theta
-     proj, perp = compute_perpendicular_component(latent_diff, latent_hat_uncond)
-     latent_v_new = torch.cos(latent_theta_new) * latent_hat_text
-
-     latent_p_new = perp * torch.sin(latent_theta_new) / torch.sin(latent_theta) * (
-         torch.sin(latent_theta) > 1e-3) + perp * weight * (torch.sin(latent_theta) <= 1e-3)
-     latent_new = latent_v_new + latent_p_new
-     if apply_norm:
-         latent_new = latent_new * torch.linalg.norm(latent_hat_text, dim=1, keepdim=True) / torch.linalg.norm(
-             latent_new, dim=1, keepdim=True)
-
-     noise_pred = (latents - latent_new) / sigma
-     noise_pred = noise_pred.reshape(n, t, c).to(latents.dtype)
-     return noise_pred
-
-
- def adg_w_norm_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: float,
-     guidance_scale: float,
-     angle_clip: float = 3.14 / 3,
- ):
-     """
-     ADG with normalization - preserves the magnitude of latent predictions.
-
-     This variant normalizes the final latent to maintain the same norm as the
-     conditional prediction, which can help preserve image quality.
-     """
-     return adg_forward(latents,
-                        noise_pred_cond,
-                        noise_pred_uncond,
-                        sigma,
-                        guidance_scale,
-                        angle_clip=angle_clip,
-                        apply_norm=True,
-                        apply_clip=True)
-
-
- def adg_wo_clip_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: float,
-     guidance_scale: float,
- ):
-     """
-     ADG without angle clipping - allows unbounded angle adjustments.
-
-     This variant doesn't clip the angle, which may result in more aggressive
-     guidance but could be less stable.
-     """
-     return adg_forward(latents, noise_pred_cond, noise_pred_uncond, sigma, guidance_scale, apply_norm=False, apply_clip=False)
 
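Note: the deleted `apg_guidance.py` bundles three guidance rules — plain CFG (`cfg_forward`), Adaptive Projected Guidance (`apg_forward`, which rescales the cond/uncond difference and keeps mostly its component orthogonal to the conditional prediction), and angle-based ADG (`adg_forward`). A minimal usage sketch for the APG path, assuming the module is importable from a local copy; the toy denoiser, step count, and Euler update below are stand-ins, not the repo's sampler:

```python
import torch
from apg_guidance import MomentumBuffer, apg_forward  # the module shown above

def toy_denoiser(x: torch.Tensor, conditioned: bool) -> torch.Tensor:
    # Stand-in for the real velocity model; returns a random prediction.
    return torch.randn_like(x)

momentum = MomentumBuffer(momentum=-0.75)  # reused across all sampling steps
latents = torch.randn(1, 64, 750)          # [B, C, T]; shape is an assumption

for _ in range(8):                         # stand-in for the sampler's timestep loop
    pred_cond = toy_denoiser(latents, conditioned=True)
    pred_uncond = toy_denoiser(latents, conditioned=False)
    guided = apg_forward(pred_cond, pred_uncond, guidance_scale=7.0,
                         momentum_buffer=momentum, eta=0.0, norm_threshold=2.5)
    latents = latents - 0.1 * guided       # toy Euler step, not the repo's scheduler
```
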
ace-step/acestep-v15-base/config.json DELETED
@@ -1,81 +0,0 @@
- {
-   "architectures": [
-     "AceStepConditionGenerationModel"
-   ],
-   "auto_map": {
-     "AutoConfig": "configuration_acestep_v15.AceStepConfig",
-     "AutoModel": "modeling_acestep_v15_base.AceStepConditionGenerationModel"
-   },
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "audio_acoustic_hidden_dim": 64,
-   "data_proportion": 0.5,
-   "dtype": "bfloat16",
-   "fsq_dim": 2048,
-   "fsq_input_levels": [
-     8,
-     8,
-     8,
-     5,
-     5,
-     5
-   ],
-   "fsq_input_num_quantizers": 1,
-   "head_dim": 128,
-   "hidden_act": "silu",
-   "hidden_size": 2048,
-   "in_channels": 192,
-   "initializer_range": 0.02,
-   "intermediate_size": 6144,
-   "layer_types": [
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention"
-   ],
-   "max_position_embeddings": 32768,
-   "model_type": "acestep",
-   "num_attention_heads": 16,
-   "num_attention_pooler_hidden_layers": 2,
-   "num_audio_decoder_hidden_layers": 24,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 8,
-   "num_lyric_encoder_hidden_layers": 8,
-   "num_timbre_encoder_hidden_layers": 4,
-   "patch_size": 2,
-   "pool_window_size": 5,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 1000000,
-   "sliding_window": 128,
-   "text_hidden_dim": 1024,
-   "timbre_fix_frame": 750,
-   "timbre_hidden_dim": 64,
-   "timestep_mu": -0.4,
-   "timestep_sigma": 1.0,
-   "transformers_version": "4.57.0.dev0",
-   "use_cache": true,
-   "use_sliding_window": true,
-   "vocab_size": 64003,
-   "is_turbo": false
- }
 
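Note: the `auto_map` block above is what let `AutoConfig`/`AutoModel` resolve the custom `AceStepConfig` and `AceStepConditionGenerationModel` classes shipped alongside the config. A minimal loading sketch, assuming a local copy of the (now removed) `acestep-v15-base` folder; `trust_remote_code=True` is required because the classes live in the repo rather than in Transformers:

```python
from transformers import AutoConfig, AutoModel

# Path assumes a local copy of the deleted folder.
config = AutoConfig.from_pretrained("./acestep-v15-base", trust_remote_code=True)
assert config.model_type == "acestep"
assert config.num_hidden_layers == 24 and config.hidden_size == 2048

model = AutoModel.from_pretrained("./acestep-v15-base", trust_remote_code=True)
```
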
ace-step/acestep-v15-base/configuration_acestep_v15.py DELETED
@@ -1,263 +0,0 @@
- # coding=utf-8
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """AceStep model configuration"""
-
- from transformers.configuration_utils import PretrainedConfig, layer_type_validation
- from transformers.modeling_rope_utils import rope_config_validation
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
-
- class AceStepConfig(PretrainedConfig):
-     r"""
-     This is the configuration class to store the configuration of a [`AceStepModel`]. It is used to instantiate an
-     AceStep model according to the specified arguments, defining the model architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-     Args:
-         vocab_size (`int`, *optional*, defaults to 64003):
-             Vocabulary size of the AceStep model. Defines the number of different tokens that can be represented by the
-             `inputs_ids` passed when calling the model.
-         hidden_size (`int`, *optional*, defaults to 4096):
-             Dimension of the hidden representations.
-         intermediate_size (`int`, *optional*, defaults to 22016):
-             Dimension of the MLP representations.
-         num_hidden_layers (`int`, *optional*, defaults to 32):
-             Number of hidden layers in the Transformer encoder.
-         num_attention_heads (`int`, *optional*, defaults to 32):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         num_key_value_heads (`int`, *optional*, defaults to 32):
-             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-             by meanpooling all the original heads within that group. For more details, check out [this
-             paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
-         head_dim (`int`, *optional*, defaults to 128):
-             The attention head dimension.
-         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-             The non-linear activation function (function or string) in the decoder.
-         max_position_embeddings (`int`, *optional*, defaults to 32768):
-             The maximum sequence length that this model might ever be used with.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-             The epsilon used by the rms normalization layers.
-         use_cache (`bool`, *optional*, defaults to `True`):
-             Whether or not the model should return the last key/values attentions (not used by all models). Only
-             relevant if `config.is_decoder=True`.
-         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-             Whether the model's input and output word embeddings should be tied.
-         rope_theta (`float`, *optional*, defaults to 10000.0):
-             The base period of the RoPE embeddings.
-         rope_scaling (`Dict`, *optional*):
-             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
-             and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
-             accordingly.
-             Expected contents:
-                 `rope_type` (`str`):
-                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-                     'llama3'], with 'default' being the original RoPE implementation.
-                 `factor` (`float`, *optional*):
-                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
-                     most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                     original maximum pre-trained length.
-                 `original_max_position_embeddings` (`int`, *optional*):
-                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                     pretraining.
-                 `attention_factor` (`float`, *optional*):
-                     Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
-                     computation. If unspecified, it defaults to value recommended by the implementation, using the
-                     `factor` field to infer the suggested value.
-                 `beta_fast` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 32.
-                 `beta_slow` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 1.
-                 `short_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                     size divided by the number of attention heads divided by 2
-                 `long_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to long contexts (<
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                     size divided by the number of attention heads divided by 2
-                 `low_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
-                 `high_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
-         attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
-             Whether to use a bias in the query, key, value and output projection layers during self-attention.
-         use_sliding_window (`bool`, *optional*, defaults to `False`):
-             Whether to use sliding window attention.
-         sliding_window (`int`, *optional*, defaults to 4096):
-             Sliding window attention (SWA) window size. If not specified, will default to `4096`.
-         layer_types (`list`, *optional*):
-             Attention pattern for each layer.
-         attention_dropout (`float`, *optional*, defaults to 0.0):
-             The dropout ratio for the attention probabilities.
-
-     ```python
-     >>> from acestep.models import AceStepConfig
-
-     >>> # Initializing an AceStep configuration
-     >>> configuration = AceStepConfig()
-
-     >>> # Initializing a model from the configuration
-     >>> model = AceStepConditionGenerationModel(configuration)
-
-     >>> # Accessing the model configuration
-     >>> configuration = model.config
-     ```"""
-
-     model_type = "acestep"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     # Default tensor parallel plan for the base model
-     base_model_tp_plan = {
-         "layers.*.self_attn.q_proj": "colwise",
-         "layers.*.self_attn.k_proj": "colwise",
-         "layers.*.self_attn.v_proj": "colwise",
-         "layers.*.self_attn.o_proj": "rowwise",
-         "layers.*.mlp.gate_proj": "colwise",
-         "layers.*.mlp.up_proj": "colwise",
-         "layers.*.mlp.down_proj": "rowwise",
-     }
-     base_model_pp_plan = {
-         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-         "norm": (["hidden_states"], ["hidden_states"]),
-     }
-     def __init__(
-         self,
-         vocab_size=64003,
-         fsq_dim=2048,
-         fsq_input_levels=[8, 8, 8, 5, 5, 5],
-         fsq_input_num_quantizers=1,
-         hidden_size=2048,
-         intermediate_size=6144,
-         num_hidden_layers=24,
-         num_attention_heads=16,
-         num_key_value_heads=8,
-         head_dim=128,
-         hidden_act="silu",
-         max_position_embeddings=32768,
-         initializer_range=0.02,
-         rms_norm_eps=1e-6,
-         use_cache=True,
-         tie_word_embeddings=True,
-         rope_theta=1000000,
-         rope_scaling=None,
-         attention_bias=False,
-         use_sliding_window=True,
-         sliding_window=128,
-         layer_types=None,
-         attention_dropout=0.0,
-         num_lyric_encoder_hidden_layers=8,
-         audio_acoustic_hidden_dim=64,
-         pool_window_size=5,
-         text_hidden_dim=1024,
-         in_channels=192,
-         data_proportion=0.5,
-         timestep_mu=-0.4,
-         timestep_sigma=1.0,
-         timbre_hidden_dim=64,
-         num_timbre_encoder_hidden_layers=4,
-         timbre_fix_frame=750,
-         patch_size=2,
-         num_attention_pooler_hidden_layers=2,
-         num_audio_decoder_hidden_layers=24,
-         model_version="turbo",
-         **kwargs,
-     ):
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.use_sliding_window = use_sliding_window
-         self.sliding_window = sliding_window if self.use_sliding_window else None
-
-         # Text encoder configuration
-         self.text_hidden_dim = text_hidden_dim
-
-         # Lyric encoder configuration
-         self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
-         self.patch_size = patch_size
-
-         # Audio semantic token generation configuration
-         self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
-         self.pool_window_size = pool_window_size
-         self.in_channels = in_channels
-         self.data_proportion = data_proportion
-         self.timestep_mu = timestep_mu
-         self.timestep_sigma = timestep_sigma
-
-         # FSQ (Finite Scalar Quantization) configuration
-         self.fsq_dim = fsq_dim
-         self.fsq_input_levels = fsq_input_levels
-         self.fsq_input_num_quantizers = fsq_input_num_quantizers
-
-         # Timbre encoder configuration
-         self.timbre_hidden_dim = timbre_hidden_dim
-         self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
-         self.timbre_fix_frame = timbre_fix_frame
-         self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
-         self.num_audio_decoder_hidden_layers = num_audio_decoder_hidden_layers
-         self.vocab_size = vocab_size
-
-         # Backward compatibility: ensure num_key_value_heads is set
-         if num_key_value_heads is None:
-             num_key_value_heads = num_attention_heads
-
-         self.num_key_value_heads = num_key_value_heads
-         self.head_dim = head_dim
-         self.hidden_act = hidden_act
-         self.initializer_range = initializer_range
-         self.rms_norm_eps = rms_norm_eps
-         self.use_cache = use_cache
-         self.rope_theta = rope_theta
-         self.rope_scaling = rope_scaling
-         self.attention_bias = attention_bias
-         self.attention_dropout = attention_dropout
-         self.model_version = model_version
-
-         # Validate rotary position embeddings parameters
-         # Backward compatibility: if there is a 'type' field, move it to 'rope_type'
-         if self.rope_scaling is not None and "type" in self.rope_scaling:
-             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
-         rope_config_validation(self)
-
-         self.layer_types = layer_types
-
-         # Set default layer types if not specified
-         if self.layer_types is None:
-             self.layer_types = [
-                 "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers)
-             ]
-         layer_type_validation(self.layer_types)
-
-         super().__init__(
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
-
-
- __all__ = ["AceStepConfig"]
 
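Note: the `layer_types` fallback in `__init__` above alternates sliding-window and full attention, which is exactly the 24-entry pattern pinned explicitly in `config.json`. A quick, self-contained check of that equivalence:

```python
# Reproduce the default computed in AceStepConfig.__init__ for 24 layers.
layer_types = [
    "sliding_attention" if (i + 1) % 2 else "full_attention"
    for i in range(24)
]
# Odd-numbered layers (1st, 3rd, ...) use the 128-token sliding window;
# even-numbered layers attend over the full sequence.
assert layer_types[:2] == ["sliding_attention", "full_attention"]
assert layer_types == ["sliding_attention", "full_attention"] * 12
```
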
ace-step/acestep-v15-base/modeling_acestep_v15_base.py DELETED
The diff for this file is too large to render. See raw diff
 
ace-step/acestep-v15-base/silence_latent.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a778e9dd942f5e8b2c09c55370782d318834432b03dabbcdf70e6ed49ad6358b
- size 3841215
 
ace-step/acestep-v15-sft/apg_guidance.py DELETED
@@ -1,220 +0,0 @@
- import torch
- import torch.nn.functional as F
-
-
- class MomentumBuffer:
-
-     def __init__(self, momentum: float = -0.75):
-         self.momentum = momentum
-         self.running_average = 0
-
-     def update(self, update_value: torch.Tensor):
-         new_average = self.momentum * self.running_average
-         self.running_average = update_value + new_average
-
-
- def project(
-     v0: torch.Tensor,  # [B, C, T]
-     v1: torch.Tensor,  # [B, C, T]
-     dims=[-1],
- ):
-     dtype = v0.dtype
-     device_type = v0.device.type
-     if device_type == "mps":
-         v0, v1 = v0.cpu(), v1.cpu()
-
-     v0, v1 = v0.double(), v1.double()
-     v1 = torch.nn.functional.normalize(v1, dim=dims)
-     v0_parallel = (v0 * v1).sum(dim=dims, keepdim=True) * v1
-     v0_orthogonal = v0 - v0_parallel
-     return v0_parallel.to(dtype).to(device_type), v0_orthogonal.to(dtype).to(device_type)
-
-
- def apg_forward(
-     pred_cond: torch.Tensor,  # [B, C, T]
-     pred_uncond: torch.Tensor,  # [B, C, T]
-     guidance_scale: float,
-     momentum_buffer: MomentumBuffer = None,
-     eta: float = 0.0,
-     norm_threshold: float = 2.5,
-     dims=[-1],
- ):
-     diff = pred_cond - pred_uncond
-     if momentum_buffer is not None:
-         momentum_buffer.update(diff)
-         diff = momentum_buffer.running_average
-
-     if norm_threshold > 0:
-         ones = torch.ones_like(diff)
-         diff_norm = diff.norm(p=2, dim=dims, keepdim=True)
-         scale_factor = torch.minimum(ones, norm_threshold / diff_norm)
-         diff = diff * scale_factor
-
-     diff_parallel, diff_orthogonal = project(diff, pred_cond, dims)
-     normalized_update = diff_orthogonal + eta * diff_parallel
-     pred_guided = pred_cond + (guidance_scale - 1) * normalized_update
-     return pred_guided
-
-
- def cfg_forward(cond_output, uncond_output, cfg_strength):
-     return uncond_output + cfg_strength * (cond_output - uncond_output)
-
-
- def call_cos_tensor(tensor1, tensor2):
-     """
-     Calculate cosine similarity between two normalized tensors.
-
-     Args:
-         tensor1: First tensor [B, ...]
-         tensor2: Second tensor [B, ...]
-
-     Returns:
-         Cosine similarity value [B, 1]
-     """
-     tensor1 = tensor1 / torch.linalg.norm(tensor1, dim=1, keepdim=True)
-     tensor2 = tensor2 / torch.linalg.norm(tensor2, dim=1, keepdim=True)
-     cosvalue = torch.sum(tensor1 * tensor2, dim=1, keepdim=True)
-     return cosvalue
-
-
- def compute_perpendicular_component(latent_diff, latent_hat_uncond):
-     """
-     Decompose latent_diff into parallel and perpendicular components relative to latent_hat_uncond.
-
-     Args:
-         latent_diff: Difference tensor [B, C, ...]
-         latent_hat_uncond: Unconditional prediction tensor [B, C, ...]
-
-     Returns:
-         projection: Parallel component
-         perpendicular_component: Perpendicular component
-     """
-     n, t, c = latent_diff.shape
-     latent_diff = latent_diff.view(n * t, c).float()
-     latent_hat_uncond = latent_hat_uncond.view(n * t, c).float()
-
-     if latent_diff.size() != latent_hat_uncond.size():
-         raise ValueError("latent_diff and latent_hat_uncond must have the same shape [n, d].")
-
-     dot_product = torch.sum(latent_diff * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
-     norm_square = torch.sum(latent_hat_uncond * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
-     projection = (dot_product / (norm_square + 1e-8)) * latent_hat_uncond
-     perpendicular_component = latent_diff - projection
-
-     return projection.view(n, t, c), perpendicular_component.reshape(n, t, c)
-
-
- def adg_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: torch.Tensor,
-     guidance_scale: float,
-     angle_clip: float = 3.14 / 6,  # pi/6 by default
-     apply_norm: bool = False,
-     apply_clip: bool = True,
- ):
-     """
-     ADG (Angle-based Dynamic Guidance) forward pass for Flow Matching.
-
-     In flow matching (including SD3), sigma represents the current timestep t_curr.
-     The predictions are velocity fields v(x_t, t).
-
-     Args:
-         latents: Current state x_t [N, T, d] where d=64
-         noise_pred_cond: Conditional velocity prediction v_cond [N, T, d]
-         noise_pred_uncond: Unconditional velocity prediction v_uncond [N, T, d]
-         sigma: Current timestep t_curr (not t_prev!)
-         guidance_scale: Guidance strength
-         angle_clip: Maximum angle for clipping (default: pi/6)
-         apply_norm: Whether to normalize the result (ADG_w_norm variant)
-         apply_clip: Whether to clip the angle (ADG_wo_clip when False)
-
-     Returns:
-         Guided velocity prediction [N, T, d]
-     """
-     # Get batch size
-     n = noise_pred_cond.shape[0]
-     noise_pred_text = noise_pred_cond
-     n, t, c = noise_pred_text.shape
-
-     # Ensure sigma/t has the right shape for broadcasting [N, 1, 1]
-     if isinstance(sigma, (int, float)):
-         sigma = torch.tensor(sigma, device=latents.device, dtype=latents.dtype)
-         sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
-     elif torch.is_tensor(sigma):
-         if sigma.numel() == 1:
-             sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
-         elif sigma.numel() == n:
-             sigma = sigma.view(n, 1, 1)
-         else:
-             raise ValueError(f"sigma has incompatible shape. Expected scalar or size {n}, got {sigma.shape}")
-     else:
-         raise TypeError(f"sigma must be a number or tensor, got {type(sigma)}")
-
-     # Adjust guidance weight
-     weight = guidance_scale - 1
-     weight = weight * (weight > 0) + 1e-3
-
-     latent_hat_text = latents - sigma * noise_pred_text
-     latent_hat_uncond = latents - sigma * noise_pred_uncond
-     latent_diff = latent_hat_text - latent_hat_uncond
-
-     # Calculate angle between conditional and unconditional predicted data
-     latent_theta = torch.acos(
-         call_cos_tensor(latent_hat_text.view(-1, c).to(float),
-                         latent_hat_uncond.reshape(-1, c).contiguous().to(float)))
-     latent_theta_new = torch.clip(weight * latent_theta, -angle_clip, angle_clip) if apply_clip else weight * latent_theta
-     proj, perp = compute_perpendicular_component(latent_diff, latent_hat_uncond)
-     latent_v_new = torch.cos(latent_theta_new) * latent_hat_text
-
-     latent_p_new = perp * torch.sin(latent_theta_new) / torch.sin(latent_theta) * (
-         torch.sin(latent_theta) > 1e-3) + perp * weight * (torch.sin(latent_theta) <= 1e-3)
-     latent_new = latent_v_new + latent_p_new
-     if apply_norm:
-         latent_new = latent_new * torch.linalg.norm(latent_hat_text, dim=1, keepdim=True) / torch.linalg.norm(
-             latent_new, dim=1, keepdim=True)
-
-     noise_pred = (latents - latent_new) / sigma
-     noise_pred = noise_pred.reshape(n, t, c).to(latents.dtype)
-     return noise_pred
-
-
- def adg_w_norm_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: float,
-     guidance_scale: float,
-     angle_clip: float = 3.14 / 3,
- ):
-     """
-     ADG with normalization - preserves the magnitude of latent predictions.
-
-     This variant normalizes the final latent to maintain the same norm as the
-     conditional prediction, which can help preserve image quality.
-     """
-     return adg_forward(latents,
-                        noise_pred_cond,
-                        noise_pred_uncond,
-                        sigma,
-                        guidance_scale,
-                        angle_clip=angle_clip,
-                        apply_norm=True,
-                        apply_clip=True)
-
-
- def adg_wo_clip_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: float,
-     guidance_scale: float,
- ):
-     """
-     ADG without angle clipping - allows unbounded angle adjustments.
-
-     This variant doesn't clip the angle, which may result in more aggressive
-     guidance but could be less stable.
-     """
-     return adg_forward(latents, noise_pred_cond, noise_pred_uncond, sigma, guidance_scale, apply_norm=False, apply_clip=False)
 
ace-step/acestep-v15-sft/config.json DELETED
@@ -1,81 +0,0 @@
- {
-   "architectures": [
-     "AceStepConditionGenerationModel"
-   ],
-   "auto_map": {
-     "AutoConfig": "configuration_acestep_v15.AceStepConfig",
-     "AutoModel": "modeling_acestep_v15_base.AceStepConditionGenerationModel"
-   },
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "audio_acoustic_hidden_dim": 64,
-   "data_proportion": 0.5,
-   "dtype": "bfloat16",
-   "fsq_dim": 2048,
-   "fsq_input_levels": [
-     8,
-     8,
-     8,
-     5,
-     5,
-     5
-   ],
-   "fsq_input_num_quantizers": 1,
-   "head_dim": 128,
-   "hidden_act": "silu",
-   "hidden_size": 2048,
-   "in_channels": 192,
-   "initializer_range": 0.02,
-   "intermediate_size": 6144,
-   "layer_types": [
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention"
-   ],
-   "max_position_embeddings": 32768,
-   "model_type": "acestep",
-   "num_attention_heads": 16,
-   "num_attention_pooler_hidden_layers": 2,
-   "num_audio_decoder_hidden_layers": 24,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 8,
-   "num_lyric_encoder_hidden_layers": 8,
-   "num_timbre_encoder_hidden_layers": 4,
-   "patch_size": 2,
-   "pool_window_size": 5,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 1000000,
-   "sliding_window": 128,
-   "text_hidden_dim": 1024,
-   "timbre_fix_frame": 750,
-   "timbre_hidden_dim": 64,
-   "timestep_mu": -0.4,
-   "timestep_sigma": 1.0,
-   "transformers_version": "4.57.0.dev0",
-   "use_cache": true,
-   "use_sliding_window": true,
-   "vocab_size": 64003,
-   "is_turbo": false
- }
 
ace-step/acestep-v15-sft/configuration_acestep_v15.py DELETED
@@ -1,263 +0,0 @@
- # coding=utf-8
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """AceStep model configuration"""
-
- from transformers.configuration_utils import PretrainedConfig, layer_type_validation
- from transformers.modeling_rope_utils import rope_config_validation
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
-
- class AceStepConfig(PretrainedConfig):
-     r"""
-     This is the configuration class to store the configuration of a [`AceStepModel`]. It is used to instantiate an
-     AceStep model according to the specified arguments, defining the model architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-     Args:
-         vocab_size (`int`, *optional*, defaults to 64003):
-             Vocabulary size of the AceStep model. Defines the number of different tokens that can be represented by the
-             `inputs_ids` passed when calling the model.
-         hidden_size (`int`, *optional*, defaults to 4096):
-             Dimension of the hidden representations.
-         intermediate_size (`int`, *optional*, defaults to 22016):
-             Dimension of the MLP representations.
-         num_hidden_layers (`int`, *optional*, defaults to 32):
-             Number of hidden layers in the Transformer encoder.
-         num_attention_heads (`int`, *optional*, defaults to 32):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         num_key_value_heads (`int`, *optional*, defaults to 32):
-             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-             by meanpooling all the original heads within that group. For more details, check out [this
-             paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
-         head_dim (`int`, *optional*, defaults to 128):
-             The attention head dimension.
-         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-             The non-linear activation function (function or string) in the decoder.
-         max_position_embeddings (`int`, *optional*, defaults to 32768):
-             The maximum sequence length that this model might ever be used with.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-             The epsilon used by the rms normalization layers.
-         use_cache (`bool`, *optional*, defaults to `True`):
-             Whether or not the model should return the last key/values attentions (not used by all models). Only
-             relevant if `config.is_decoder=True`.
-         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-             Whether the model's input and output word embeddings should be tied.
-         rope_theta (`float`, *optional*, defaults to 10000.0):
-             The base period of the RoPE embeddings.
-         rope_scaling (`Dict`, *optional*):
-             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
-             and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
-             accordingly.
-             Expected contents:
-                 `rope_type` (`str`):
-                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-                     'llama3'], with 'default' being the original RoPE implementation.
-                 `factor` (`float`, *optional*):
-                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
-                     most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                     original maximum pre-trained length.
-                 `original_max_position_embeddings` (`int`, *optional*):
-                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                     pretraining.
-                 `attention_factor` (`float`, *optional*):
-                     Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
-                     computation. If unspecified, it defaults to value recommended by the implementation, using the
-                     `factor` field to infer the suggested value.
-                 `beta_fast` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 32.
-                 `beta_slow` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 1.
-                 `short_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                     size divided by the number of attention heads divided by 2
-                 `long_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to long contexts (<
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                     size divided by the number of attention heads divided by 2
-                 `low_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
-                 `high_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
-         attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
-             Whether to use a bias in the query, key, value and output projection layers during self-attention.
-         use_sliding_window (`bool`, *optional*, defaults to `False`):
-             Whether to use sliding window attention.
-         sliding_window (`int`, *optional*, defaults to 4096):
-             Sliding window attention (SWA) window size. If not specified, will default to `4096`.
-         layer_types (`list`, *optional*):
-             Attention pattern for each layer.
-         attention_dropout (`float`, *optional*, defaults to 0.0):
-             The dropout ratio for the attention probabilities.
-
-     ```python
-     >>> from acestep.models import AceStepConfig
-
-     >>> # Initializing an AceStep configuration
-     >>> configuration = AceStepConfig()
-
-     >>> # Initializing a model from the configuration
-     >>> model = AceStepConditionGenerationModel(configuration)
-
-     >>> # Accessing the model configuration
-     >>> configuration = model.config
-     ```"""
-
-     model_type = "acestep"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     # Default tensor parallel plan for the base model
134
- base_model_tp_plan = {
135
- "layers.*.self_attn.q_proj": "colwise",
136
- "layers.*.self_attn.k_proj": "colwise",
137
- "layers.*.self_attn.v_proj": "colwise",
138
- "layers.*.self_attn.o_proj": "rowwise",
139
- "layers.*.mlp.gate_proj": "colwise",
140
- "layers.*.mlp.up_proj": "colwise",
141
- "layers.*.mlp.down_proj": "rowwise",
142
- }
143
- base_model_pp_plan = {
144
- "embed_tokens": (["input_ids"], ["inputs_embeds"]),
145
- "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
146
- "norm": (["hidden_states"], ["hidden_states"]),
147
- }
148
- def __init__(
149
- self,
150
- vocab_size=64003,
151
- fsq_dim=2048,
152
- fsq_input_levels=[8, 8, 8, 5, 5, 5],
153
- fsq_input_num_quantizers=1,
154
- hidden_size=2048,
155
- intermediate_size=6144,
156
- num_hidden_layers=24,
157
- num_attention_heads=16,
158
- num_key_value_heads=8,
159
- head_dim=128,
160
- hidden_act="silu",
161
- max_position_embeddings=32768,
162
- initializer_range=0.02,
163
- rms_norm_eps=1e-6,
164
- use_cache=True,
165
- tie_word_embeddings=True,
166
- rope_theta=1000000,
167
- rope_scaling=None,
168
- attention_bias=False,
169
- use_sliding_window=True,
170
- sliding_window=128,
171
- layer_types=None,
172
- attention_dropout=0.0,
173
- num_lyric_encoder_hidden_layers=8,
174
- audio_acoustic_hidden_dim=64,
175
- pool_window_size=5,
176
- text_hidden_dim=1024,
177
- in_channels=192,
178
- data_proportion=0.5,
179
- timestep_mu=-0.4,
180
- timestep_sigma=1.0,
181
- timbre_hidden_dim=64,
182
- num_timbre_encoder_hidden_layers=4,
183
- timbre_fix_frame=750,
184
- patch_size=2,
185
- num_attention_pooler_hidden_layers=2,
186
- num_audio_decoder_hidden_layers=24,
187
- model_version="turbo",
188
- **kwargs,
189
- ):
190
- self.max_position_embeddings = max_position_embeddings
191
- self.hidden_size = hidden_size
192
- self.intermediate_size = intermediate_size
193
- self.num_hidden_layers = num_hidden_layers
194
- self.num_attention_heads = num_attention_heads
195
- self.use_sliding_window = use_sliding_window
196
- self.sliding_window = sliding_window if self.use_sliding_window else None
197
-
198
- # Text encoder configuration
199
- self.text_hidden_dim = text_hidden_dim
200
-
201
- # Lyric encoder configuration
202
- self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
203
- self.patch_size = patch_size
204
-
205
- # Audio semantic token generation configuration
206
- self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
207
- self.pool_window_size = pool_window_size
208
- self.in_channels = in_channels
209
- self.data_proportion = data_proportion
210
- self.timestep_mu = timestep_mu
211
- self.timestep_sigma = timestep_sigma
212
-
213
- # FSQ (Finite Scalar Quantization) configuration
214
- self.fsq_dim = fsq_dim
215
- self.fsq_input_levels = fsq_input_levels
216
- self.fsq_input_num_quantizers = fsq_input_num_quantizers
217
-
218
- # Timbre encoder configuration
219
- self.timbre_hidden_dim = timbre_hidden_dim
220
- self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
221
- self.timbre_fix_frame = timbre_fix_frame
222
- self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
223
- self.num_audio_decoder_hidden_layers = num_audio_decoder_hidden_layers
224
- self.vocab_size = vocab_size
225
-
226
- # Backward compatibility: ensure num_key_value_heads is set
227
- if num_key_value_heads is None:
228
- num_key_value_heads = num_attention_heads
229
-
230
- self.num_key_value_heads = num_key_value_heads
231
- self.head_dim = head_dim
232
- self.hidden_act = hidden_act
233
- self.initializer_range = initializer_range
234
- self.rms_norm_eps = rms_norm_eps
235
- self.use_cache = use_cache
236
- self.rope_theta = rope_theta
237
- self.rope_scaling = rope_scaling
238
- self.attention_bias = attention_bias
239
- self.attention_dropout = attention_dropout
240
- self.model_version = model_version
241
-
242
- # Validate rotary position embeddings parameters
243
- # Backward compatibility: if there is a 'type' field, move it to 'rope_type'
244
- if self.rope_scaling is not None and "type" in self.rope_scaling:
245
- self.rope_scaling["rope_type"] = self.rope_scaling["type"]
246
- rope_config_validation(self)
247
-
248
- self.layer_types = layer_types
249
-
250
- # Set default layer types if not specified
251
- if self.layer_types is None:
252
- self.layer_types = [
253
- "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers)
254
- ]
255
- layer_type_validation(self.layer_types)
256
-
257
- super().__init__(
258
- tie_word_embeddings=tie_word_embeddings,
259
- **kwargs,
260
- )
261
-
262
-
263
- __all__ = ["AceStepConfig"]
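
A quick sanity check on the quantizer settings in this deleted config: the product of `fsq_input_levels` gives the FSQ codebook size, which lines up with `vocab_size` once a few extra tokens are set aside (reading the remainder as reserved/special tokens is an assumption, not something the file states). A minimal sketch in plain Python:

import math

# FSQ codebook size implied by the default fsq_input_levels above.
fsq_input_levels = [8, 8, 8, 5, 5, 5]
codebook_size = math.prod(fsq_input_levels)   # 8*8*8*5*5*5 = 64000

vocab_size = 64003                            # default in AceStepConfig
print(codebook_size)                          # 64000
print(vocab_size - codebook_size)             # 3 -- presumably reserved/special tokens
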
ace-step/acestep-v15-sft/modeling_acestep_v15_base.py DELETED
The diff for this file is too large to render. See raw diff
 
ace-step/acestep-v15-sft/silence_latent.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a778e9dd942f5e8b2c09c55370782d318834432b03dabbcdf70e6ed49ad6358b
- size 3841215
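
The three deleted lines above are not the tensor itself but a Git LFS pointer file; note that this sft `silence_latent.pt` and the turbo one further down carry the same `oid`, so they referenced identical content. A minimal sketch of reading such a pointer, assuming only the spec-v1 key/value layout shown:

def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS spec-v1 pointer file into a key/value dict."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = parse_lfs_pointer(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:a778e9dd942f5e8b2c09c55370782d318834432b03dabbcdf70e6ed49ad6358b\n"
    "size 3841215\n"
)
print(pointer["oid"], pointer["size"])  # the real tensor lives in LFS storage, not in git
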
 
 
 
 
ace-step/acestep-v15-turbo/config.json DELETED
@@ -1,82 +0,0 @@
- {
-   "architectures": [
-     "AceStepConditionGenerationModel"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "audio_acoustic_hidden_dim": 64,
-   "auto_map": {
-     "AutoConfig": "configuration_acestep_v15.AceStepConfig",
-     "AutoModel": "modeling_acestep_v15_turbo.AceStepConditionGenerationModel"
-   },
-   "data_proportion": 0.5,
-   "dtype": "bfloat16",
-   "fsq_dim": 2048,
-   "fsq_input_levels": [
-     8,
-     8,
-     8,
-     5,
-     5,
-     5
-   ],
-   "fsq_input_num_quantizers": 1,
-   "head_dim": 128,
-   "hidden_act": "silu",
-   "hidden_size": 2048,
-   "in_channels": 192,
-   "initializer_range": 0.02,
-   "intermediate_size": 6144,
-   "is_turbo": true,
-   "layer_types": [
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention"
-   ],
-   "max_position_embeddings": 32768,
-   "model_type": "acestep",
-   "model_version": "turbo",
-   "num_attention_heads": 16,
-   "num_attention_pooler_hidden_layers": 2,
-   "num_audio_decoder_hidden_layers": 24,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 8,
-   "num_lyric_encoder_hidden_layers": 8,
-   "num_timbre_encoder_hidden_layers": 4,
-   "patch_size": 2,
-   "pool_window_size": 5,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 1000000,
-   "sliding_window": 128,
-   "text_hidden_dim": 1024,
-   "timbre_fix_frame": 750,
-   "timbre_hidden_dim": 64,
-   "timestep_mu": -0.4,
-   "timestep_sigma": 1.0,
-   "transformers_version": "4.57.0.dev0",
-   "use_cache": true,
-   "use_sliding_window": true,
-   "vocab_size": 64003
- }
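
The 24-entry `layer_types` list serialized in this config is exactly the alternating pattern that `AceStepConfig.__init__` generates when `layer_types` is `None`; a short check in plain Python:

# Reproduce the serialized layer_types with the default rule from
# AceStepConfig.__init__: even 0-based layer indices use sliding-window attention.
num_hidden_layers = 24
layer_types = [
    "sliding_attention" if (i + 1) % 2 else "full_attention"
    for i in range(num_hidden_layers)
]
assert layer_types[0] == "sliding_attention"
assert layer_types.count("sliding_attention") == layer_types.count("full_attention") == 12
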
ace-step/acestep-v15-turbo/configuration_acestep_v15.py DELETED
@@ -1,263 +0,0 @@
- # coding=utf-8
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """AceStep model configuration"""
-
- from transformers.configuration_utils import PretrainedConfig, layer_type_validation
- from transformers.modeling_rope_utils import rope_config_validation
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
-
- class AceStepConfig(PretrainedConfig):
-     r"""
-     This is the configuration class to store the configuration of an [`AceStepModel`]. It is used to instantiate an
-     AceStep model according to the specified arguments, defining the model architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-     Args:
-         vocab_size (`int`, *optional*, defaults to 64003):
-             Vocabulary size of the AceStep model. Defines the number of different tokens that can be represented by
-             the `input_ids` passed when calling the model.
-         hidden_size (`int`, *optional*, defaults to 2048):
-             Dimension of the hidden representations.
-         intermediate_size (`int`, *optional*, defaults to 6144):
-             Dimension of the MLP representations.
-         num_hidden_layers (`int`, *optional*, defaults to 24):
-             Number of hidden layers in the Transformer encoder.
-         num_attention_heads (`int`, *optional*, defaults to 16):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         num_key_value_heads (`int`, *optional*, defaults to 8):
-             This is the number of key/value heads that should be used to implement Grouped Query Attention. If
-             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
-             `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
-             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
-             constructed by mean-pooling all the original heads within that group. For more details, check out [this
-             paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
-         head_dim (`int`, *optional*, defaults to 128):
-             The attention head dimension.
-         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-             The non-linear activation function (function or string) in the decoder.
-         max_position_embeddings (`int`, *optional*, defaults to 32768):
-             The maximum sequence length that this model might ever be used with.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated-normal initializer for initializing all weight matrices.
-         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-             The epsilon used by the RMS normalization layers.
-         use_cache (`bool`, *optional*, defaults to `True`):
-             Whether or not the model should return the last key/value attentions (not used by all models). Only
-             relevant if `config.is_decoder=True`.
-         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-             Whether the model's input and output word embeddings should be tied.
-         rope_theta (`float`, *optional*, defaults to 1000000.0):
-             The base period of the RoPE embeddings.
-         rope_scaling (`Dict`, *optional*):
-             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope
-             type and expect the model to work on a longer `max_position_embeddings`, we recommend updating this
-             value accordingly.
-             Expected contents:
-                 `rope_type` (`str`):
-                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn',
-                     'longrope', 'llama3'], with 'default' being the original RoPE implementation.
-                 `factor` (`float`, *optional*):
-                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
-                     In most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                     original maximum pre-trained length.
-                 `original_max_position_embeddings` (`int`, *optional*):
-                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                     pretraining.
-                 `attention_factor` (`float`, *optional*):
-                     Used with 'yarn' and 'longrope'. The scaling factor to be applied to the attention
-                     computation. If unspecified, it defaults to the value recommended by the implementation, using
-                     the `factor` field to infer the suggested value.
-                 `beta_fast` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 32.
-                 `beta_slow` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 1.
-                 `short_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the
-                     hidden size divided by the number of attention heads divided by 2.
-                 `long_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to long contexts (>
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the
-                     hidden size divided by the number of attention heads divided by 2.
-                 `low_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to low-frequency components of the RoPE.
-                 `high_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to high-frequency components of the RoPE.
-         attention_bias (`bool`, *optional*, defaults to `False`):
-             Whether to use a bias in the query, key, value and output projection layers during self-attention.
-         use_sliding_window (`bool`, *optional*, defaults to `True`):
-             Whether to use sliding window attention.
-         sliding_window (`int`, *optional*, defaults to 128):
-             Sliding window attention (SWA) window size.
-         layer_types (`list`, *optional*):
-             Attention pattern for each layer. Defaults to alternating sliding and full attention.
-         attention_dropout (`float`, *optional*, defaults to 0.0):
-             The dropout ratio for the attention probabilities.
-
-     ```python
-     >>> from acestep.models import AceStepConfig, AceStepConditionGenerationModel
-
-     >>> # Initializing an AceStep configuration
-     >>> configuration = AceStepConfig()
-
-     >>> # Initializing a model from the configuration
-     >>> model = AceStepConditionGenerationModel(configuration)
-
-     >>> # Accessing the model configuration
-     >>> configuration = model.config
-     ```"""
-
-     model_type = "acestep"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     # Default tensor parallel plan for the base model
-     base_model_tp_plan = {
-         "layers.*.self_attn.q_proj": "colwise",
-         "layers.*.self_attn.k_proj": "colwise",
-         "layers.*.self_attn.v_proj": "colwise",
-         "layers.*.self_attn.o_proj": "rowwise",
-         "layers.*.mlp.gate_proj": "colwise",
-         "layers.*.mlp.up_proj": "colwise",
-         "layers.*.mlp.down_proj": "rowwise",
-     }
-     base_model_pp_plan = {
-         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-         "norm": (["hidden_states"], ["hidden_states"]),
-     }
-
-     def __init__(
-         self,
-         vocab_size=64003,
-         fsq_dim=2048,
-         fsq_input_levels=[8, 8, 8, 5, 5, 5],
-         fsq_input_num_quantizers=1,
-         hidden_size=2048,
-         intermediate_size=6144,
-         num_hidden_layers=24,
-         num_attention_heads=16,
-         num_key_value_heads=8,
-         head_dim=128,
-         hidden_act="silu",
-         max_position_embeddings=32768,
-         initializer_range=0.02,
-         rms_norm_eps=1e-6,
-         use_cache=True,
-         tie_word_embeddings=True,
-         rope_theta=1000000,
-         rope_scaling=None,
-         attention_bias=False,
-         use_sliding_window=True,
-         sliding_window=128,
-         layer_types=None,
-         attention_dropout=0.0,
-         num_lyric_encoder_hidden_layers=8,
-         audio_acoustic_hidden_dim=64,
-         pool_window_size=5,
-         text_hidden_dim=1024,
-         in_channels=192,
-         data_proportion=0.5,
-         timestep_mu=-0.4,
-         timestep_sigma=1.0,
-         timbre_hidden_dim=64,
-         num_timbre_encoder_hidden_layers=4,
-         timbre_fix_frame=750,
-         patch_size=2,
-         num_attention_pooler_hidden_layers=2,
-         num_audio_decoder_hidden_layers=24,
-         model_version="turbo",
-         **kwargs,
-     ):
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.use_sliding_window = use_sliding_window
-         self.sliding_window = sliding_window if self.use_sliding_window else None
-
-         # Text encoder configuration
-         self.text_hidden_dim = text_hidden_dim
-
-         # Lyric encoder configuration
-         self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
-         self.patch_size = patch_size
-
-         # Audio semantic token generation configuration
-         self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
-         self.pool_window_size = pool_window_size
-         self.in_channels = in_channels
-         self.data_proportion = data_proportion
-         self.timestep_mu = timestep_mu
-         self.timestep_sigma = timestep_sigma
-
-         # FSQ (Finite Scalar Quantization) configuration
-         self.fsq_dim = fsq_dim
-         self.fsq_input_levels = fsq_input_levels
-         self.fsq_input_num_quantizers = fsq_input_num_quantizers
-
-         # Timbre encoder configuration
-         self.timbre_hidden_dim = timbre_hidden_dim
-         self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
-         self.timbre_fix_frame = timbre_fix_frame
-         self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
-         self.num_audio_decoder_hidden_layers = num_audio_decoder_hidden_layers
-         self.vocab_size = vocab_size
-
-         # Backward compatibility: ensure num_key_value_heads is set
-         if num_key_value_heads is None:
-             num_key_value_heads = num_attention_heads
-
-         self.num_key_value_heads = num_key_value_heads
-         self.head_dim = head_dim
-         self.hidden_act = hidden_act
-         self.initializer_range = initializer_range
-         self.rms_norm_eps = rms_norm_eps
-         self.use_cache = use_cache
-         self.rope_theta = rope_theta
-         self.rope_scaling = rope_scaling
-         self.attention_bias = attention_bias
-         self.attention_dropout = attention_dropout
-         self.model_version = model_version
-
-         # Validate rotary position embedding parameters
-         # Backward compatibility: if there is a 'type' field, move it to 'rope_type'
-         if self.rope_scaling is not None and "type" in self.rope_scaling:
-             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
-         rope_config_validation(self)
-
-         self.layer_types = layer_types
-
-         # Set default layer types if not specified: layers at even 0-based indices use
-         # sliding-window attention, alternating with full attention
-         if self.layer_types is None:
-             self.layer_types = [
-                 "sliding_attention" if bool((i + 1) % 2) else "full_attention"
-                 for i in range(self.num_hidden_layers)
-             ]
-         layer_type_validation(self.layer_types)
-
-         super().__init__(
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
-
-
- __all__ = ["AceStepConfig"]
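
Because config.json maps `AutoConfig`/`AutoModel` onto these custom modules via `auto_map`, the checkpoint loads through the standard auto classes with `trust_remote_code=True`. A sketch, assuming the snapshot sits at the local path shown (swap in the Hub repo id when loading remotely):

from transformers import AutoConfig, AutoModel

path = "./ace-step/acestep-v15-turbo"  # local snapshot path (assumption)

# trust_remote_code=True lets transformers import configuration_acestep_v15.py
# and modeling_acestep_v15_turbo.py as referenced by "auto_map" in config.json.
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(path, trust_remote_code=True)
print(type(config).__name__, type(model).__name__)
# AceStepConfig AceStepConditionGenerationModel
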
ace-step/acestep-v15-turbo/modeling_acestep_v15_turbo.py DELETED
The diff for this file is too large to render. See raw diff
 
ace-step/acestep-v15-turbo/silence_latent.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a778e9dd942f5e8b2c09c55370782d318834432b03dabbcdf70e6ed49ad6358b
- size 3841215
 
 
 
 
ace-step/config.json DELETED
@@ -1,82 +0,0 @@
- {
-   "architectures": [
-     "AceStepConditionGenerationModel"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "audio_acoustic_hidden_dim": 64,
-   "auto_map": {
-     "AutoConfig": "configuration_acestep_v15.AceStepConfig",
-     "AutoModel": "modeling_acestep_v15_turbo.AceStepConditionGenerationModel"
-   },
-   "data_proportion": 0.5,
-   "dtype": "bfloat16",
-   "fsq_dim": 2048,
-   "fsq_input_levels": [
-     8,
-     8,
-     8,
-     5,
-     5,
-     5
-   ],
-   "fsq_input_num_quantizers": 1,
-   "head_dim": 128,
-   "hidden_act": "silu",
-   "hidden_size": 2048,
-   "in_channels": 192,
-   "initializer_range": 0.02,
-   "intermediate_size": 6144,
-   "is_turbo": true,
-   "layer_types": [
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention"
-   ],
-   "max_position_embeddings": 32768,
-   "model_type": "acestep",
-   "model_version": "turbo",
-   "num_attention_heads": 16,
-   "num_attention_pooler_hidden_layers": 2,
-   "num_audio_decoder_hidden_layers": 24,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 8,
-   "num_lyric_encoder_hidden_layers": 8,
-   "num_timbre_encoder_hidden_layers": 4,
-   "patch_size": 2,
-   "pool_window_size": 5,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 1000000,
-   "sliding_window": 128,
-   "text_hidden_dim": 1024,
-   "timbre_fix_frame": 750,
-   "timbre_hidden_dim": 64,
-   "timestep_mu": -0.4,
-   "timestep_sigma": 1.0,
-   "transformers_version": "4.57.0.dev0",
-   "use_cache": true,
-   "use_sliding_window": true,
-   "vocab_size": 64003
- }
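
`rope_scaling` is `null` in this root copy of the config, but `AceStepConfig.__init__` also accepts legacy dicts that use the old `"type"` key and renames it to `"rope_type"` before validation. A sketch with a hypothetical legacy payload (the values are illustrative, not from this repo):

# Hypothetical legacy rope_scaling payload using the deprecated "type" key.
rope_scaling = {"type": "yarn", "factor": 4.0}

# Same backward-compatibility shim as in AceStepConfig.__init__.
if rope_scaling is not None and "type" in rope_scaling:
    rope_scaling["rope_type"] = rope_scaling["type"]

print(rope_scaling["rope_type"])  # yarn -- now in the form rope_config_validation expects
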
ace-step/vae/config.json DELETED
@@ -1,24 +0,0 @@
- {
-   "_class_name": "AutoencoderOobleck",
-   "_diffusers_version": "0.34.0",
-   "_name_or_path": "/root/data/repo/gongjunmin/ACE-Step-1.5/checkpoints/vae/",
-   "audio_channels": 2,
-   "channel_multiples": [
-     1,
-     2,
-     4,
-     8,
-     16
-   ],
-   "decoder_channels": 128,
-   "decoder_input_channels": 64,
-   "downsampling_ratios": [
-     2,
-     4,
-     4,
-     6,
-     10
-   ],
-   "encoder_hidden_size": 128,
-   "sampling_rate": 48000
- }
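
The `downsampling_ratios` multiply out to the VAE's total hop per latent frame, which fixes the latent frame rate at the stated 48 kHz sampling rate. A sketch (the local weights path is an assumption, and `AutoencoderOobleck` requires a recent diffusers release):

import math
from diffusers import AutoencoderOobleck

downsampling_ratios = [2, 4, 4, 6, 10]     # from the config above
hop = math.prod(downsampling_ratios)       # 1920 audio samples per latent frame
print(48000 / hop)                         # 25.0 latent frames per second

vae = AutoencoderOobleck.from_pretrained("./ace-step/vae")  # local path assumed
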
ace-step/vae/diffusion_pytorch_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:da17edb604c40deaf09e9b24974e590d1ca83a374070e5d0884cfa4bed9a99b0
- size 337431388
 
 
 
 
depth/dpt-large/.no_exist/bc15f29aa3a80d532f2ed650b5e16ac48d8958f9/processor_config.json DELETED
File without changes
depth/dpt-large/refs/main DELETED
@@ -1 +0,0 @@
- bc15f29aa3a80d532f2ed650b5e16ac48d8958f9
 
 
depth/dpt-large/snapshots/bc15f29aa3a80d532f2ed650b5e16ac48d8958f9/config.json DELETED
@@ -1,47 +0,0 @@
- {
-   "architectures": [
-     "DPTForDepthEstimation"
-   ],
-   "attention_probs_dropout_prob": 0.0,
-   "auxiliary_loss_weight": 0.4,
-   "backbone_out_indices": [
-     5,
-     11,
-     17,
-     23
-   ],
-   "fusion_hidden_size": 256,
-   "head_in_index": -1,
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.0,
-   "hidden_size": 1024,
-   "image_size": 384,
-   "initializer_range": 0.02,
-   "intermediate_size": 4096,
-   "layer_norm_eps": 1e-12,
-   "model_type": "dpt",
-   "neck_hidden_sizes": [
-     256,
-     512,
-     1024,
-     1024
-   ],
-   "num_attention_heads": 16,
-   "num_channels": 3,
-   "num_hidden_layers": 24,
-   "patch_size": 16,
-   "qkv_bias": true,
-   "readout_type": "project",
-   "reassemble_factors": [
-     4,
-     2,
-     1,
-     0.5
-   ],
-   "semantic_classifier_dropout": 0.1,
-   "semantic_loss_ignore_index": 255,
-   "torch_dtype": "float32",
-   "transformers_version": "4.18.0.dev0",
-   "use_auxiliary_head": true,
-   "use_batch_norm_in_fusion_residual": false
- }
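
This snapshot directory uses the Hugging Face cache layout for what appears to be the Intel/dpt-large checkpoint (the repo id is inferred from the directory name; the config matches), pinned by `refs/main` to revision bc15f29…. A sketch of running depth estimation from that pinned revision, assuming transformers, torch, and Pillow are installed and an `example.jpg` exists:

import torch
from PIL import Image
from transformers import DPTForDepthEstimation, DPTImageProcessor

rev = "bc15f29aa3a80d532f2ed650b5e16ac48d8958f9"  # from refs/main above
processor = DPTImageProcessor.from_pretrained("Intel/dpt-large", revision=rev)
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large", revision=rev)

image = Image.open("example.jpg")                 # any RGB image (assumption)
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    depth = model(**inputs).predicted_depth       # (1, H', W') relative depth map
print(depth.shape)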