MrMattV committed
Commit c90ac06 · verified · 1 parent: 0c60560

Clear root for models subfolder reorganization

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +0 -74
  2. ace-step/.gitattributes +0 -38
  3. ace-step/Qwen3-Embedding-0.6B/added_tokens.json +0 -28
  4. ace-step/Qwen3-Embedding-0.6B/chat_template.jinja +0 -85
  5. ace-step/Qwen3-Embedding-0.6B/config.json +0 -60
  6. ace-step/Qwen3-Embedding-0.6B/merges.txt +0 -0
  7. ace-step/Qwen3-Embedding-0.6B/model.safetensors +0 -3
  8. ace-step/Qwen3-Embedding-0.6B/special_tokens_map.json +0 -31
  9. ace-step/Qwen3-Embedding-0.6B/tokenizer.json +0 -3
  10. ace-step/Qwen3-Embedding-0.6B/tokenizer_config.json +0 -239
  11. ace-step/Qwen3-Embedding-0.6B/vocab.json +0 -0
  12. ace-step/README.md +0 -99
  13. ace-step/acestep-5Hz-lm-1.7B/added_tokens.json +0 -0
  14. ace-step/acestep-5Hz-lm-1.7B/chat_template.jinja +0 -89
  15. ace-step/acestep-5Hz-lm-1.7B/config.json +0 -61
  16. ace-step/acestep-5Hz-lm-1.7B/merges.txt +0 -0
  17. ace-step/acestep-5Hz-lm-1.7B/model.safetensors +0 -3
  18. ace-step/acestep-5Hz-lm-1.7B/special_tokens_map.json +0 -0
  19. ace-step/acestep-5Hz-lm-1.7B/tokenizer.json +0 -3
  20. ace-step/acestep-5Hz-lm-1.7B/tokenizer_config.json +0 -3
  21. ace-step/acestep-5Hz-lm-1.7B/vocab.json +0 -0
  22. ace-step/acestep-5Hz-lm-4B/Unconfirmed 786712.crdownload +0 -3
  23. ace-step/acestep-5Hz-lm-4B/added_tokens.json +0 -0
  24. ace-step/acestep-5Hz-lm-4B/config.json +0 -69
  25. ace-step/acestep-5Hz-lm-4B/merges.txt +0 -0
  26. ace-step/acestep-5Hz-lm-4B/model.safetensors.index.json +0 -405
  27. ace-step/acestep-5Hz-lm-4B/special_tokens_map.json +0 -0
  28. ace-step/acestep-5Hz-lm-4B/tokenizer.json +0 -3
  29. ace-step/acestep-5Hz-lm-4B/tokenizer_config.json +0 -3
  30. ace-step/acestep-5Hz-lm-4B/vocab.json +0 -0
  31. ace-step/acestep-v15-base/apg_guidance.py +0 -220
  32. ace-step/acestep-v15-base/config.json +0 -81
  33. ace-step/acestep-v15-base/configuration_acestep_v15.py +0 -263
  34. ace-step/acestep-v15-base/modeling_acestep_v15_base.py +0 -0
  35. ace-step/acestep-v15-base/silence_latent.pt +0 -3
  36. ace-step/acestep-v15-sft/apg_guidance.py +0 -220
  37. ace-step/acestep-v15-sft/config.json +0 -81
  38. ace-step/acestep-v15-sft/configuration_acestep_v15.py +0 -263
  39. ace-step/acestep-v15-sft/modeling_acestep_v15_base.py +0 -0
  40. ace-step/acestep-v15-sft/silence_latent.pt +0 -3
  41. ace-step/acestep-v15-turbo/config.json +0 -82
  42. ace-step/acestep-v15-turbo/configuration_acestep_v15.py +0 -263
  43. ace-step/acestep-v15-turbo/modeling_acestep_v15_turbo.py +0 -0
  44. ace-step/acestep-v15-turbo/silence_latent.pt +0 -3
  45. ace-step/config.json +0 -82
  46. ace-step/vae/config.json +0 -24
  47. ace-step/vae/diffusion_pytorch_model.safetensors +0 -3
  48. depth/dpt-large/.no_exist/bc15f29aa3a80d532f2ed650b5e16ac48d8958f9/processor_config.json +0 -0
  49. depth/dpt-large/refs/main +0 -1
  50. depth/dpt-large/snapshots/bc15f29aa3a80d532f2ed650b5e16ac48d8958f9/config.json +0 -47
.gitattributes DELETED
@@ -1,74 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- llm/mmproj-F16.gguf filter=lfs diff=lfs merge=lfs -text
- voice-presets/anna.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/bertrand.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/cate.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/coralie.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/corrado.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/daniela.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/denzel.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/estelle.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/fabio.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/gerald.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/marion.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/mel.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/rita.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/roberto.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/ruggero.wav filter=lfs diff=lfs merge=lfs -text
- voice-presets/stefania.wav filter=lfs diff=lfs merge=lfs -text
- ace-step/acestep-5Hz-lm-1.7B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- ace-step/acestep-5Hz-lm-1.7B/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
- ace-step/acestep-5Hz-lm-4B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- ace-step/acestep-5Hz-lm-4B/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
- ace-step/Qwen3-Embedding-0.6B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- llm/Qwen3.5-4B-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
- flux2-klein/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- llm/Llama-3.2-3B-Instruct-uncensored-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
- ace-step/acestep-5Hz-lm-4B/Unconfirmed[[:space:]]786712.crdownload filter=lfs diff=lfs merge=lfs -text
- stylemaster/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/anna.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/bertrand.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/coralie.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/corrado.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/daniela.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/estelle.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/fabio.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/marion.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/rita.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/roberto.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/ruggero.mp3 filter=lfs diff=lfs merge=lfs -text
- voice-presets/convert/stefania.mp3 filter=lfs diff=lfs merge=lfs -text
 
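Note: each rule above tells Git to route matching paths through the LFS filter instead of storing the blobs directly. A rough Python sketch of the matching logic (fnmatch only approximates git's wildmatch semantics, and the helper name is ours):

```python
from fnmatch import fnmatch
from pathlib import PurePosixPath

# A few rules copied from the deleted .gitattributes above.
LFS_PATTERNS = ["*.safetensors", "*.pt", "voice-presets/anna.wav"]

def routed_through_lfs(path: str) -> bool:
    """Approximate check: does any LFS rule capture this path?

    In gitattributes, a pattern without "/" is matched against the
    basename only; fnmatch is a simplification of git's wildmatch.
    """
    name = PurePosixPath(path).name
    return any(
        fnmatch(name if "/" not in pat else path, pat)
        for pat in LFS_PATTERNS
    )

print(routed_through_lfs("ace-step/vae/diffusion_pytorch_model.safetensors"))  # True
print(routed_through_lfs("ace-step/config.json"))                              # False
```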
ace-step/.gitattributes DELETED
@@ -1,38 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- acestep-5Hz-lm-1.7B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
- acestep-5Hz-lm-1.7B/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
- Qwen3-Embedding-0.6B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
ace-step/Qwen3-Embedding-0.6B/added_tokens.json DELETED
@@ -1,28 +0,0 @@
- {
- "</think>": 151668,
- "</tool_call>": 151658,
- "</tool_response>": 151666,
- "<think>": 151667,
- "<tool_call>": 151657,
- "<tool_response>": 151665,
- "<|box_end|>": 151649,
- "<|box_start|>": 151648,
- "<|endoftext|>": 151643,
- "<|file_sep|>": 151664,
- "<|fim_middle|>": 151660,
- "<|fim_pad|>": 151662,
- "<|fim_prefix|>": 151659,
- "<|fim_suffix|>": 151661,
- "<|im_end|>": 151645,
- "<|im_start|>": 151644,
- "<|image_pad|>": 151655,
- "<|object_ref_end|>": 151647,
- "<|object_ref_start|>": 151646,
- "<|quad_end|>": 151651,
- "<|quad_start|>": 151650,
- "<|repo_name|>": 151663,
- "<|video_pad|>": 151656,
- "<|vision_end|>": 151653,
- "<|vision_pad|>": 151654,
- "<|vision_start|>": 151652
- }
 
ace-step/Qwen3-Embedding-0.6B/chat_template.jinja DELETED
@@ -1,85 +0,0 @@
- {%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0].role == 'system' %}
- {{- messages[0].content + '\n\n' }}
- {%- endif %}
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
- {%- else %}
- {%- if messages[0].role == 'system' %}
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
- {%- for message in messages[::-1] %}
- {%- set index = (messages|length - 1) - loop.index0 %}
- {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
- {%- set ns.multi_step_tool = false %}
- {%- set ns.last_query_index = index %}
- {%- endif %}
- {%- endfor %}
- {%- for message in messages %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {%- set content = message.content %}
- {%- set reasoning_content = '' %}
- {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
- {%- set reasoning_content = message.reasoning_content %}
- {%- else %}
- {%- if '</think>' in message.content %}
- {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
- {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
- {%- endif %}
- {%- endif %}
- {%- if loop.index0 > ns.last_query_index %}
- {%- if loop.last or (not loop.last and reasoning_content) %}
- {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
- {%- else %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- endif %}
- {%- else %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- endif %}
- {%- if message.tool_calls %}
- {%- for tool_call in message.tool_calls %}
- {%- if (loop.first and content) or (not loop.first) %}
- {{- '\n' }}
- {%- endif %}
- {%- if tool_call.function %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '<tool_call>\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {%- if tool_call.arguments is string %}
- {{- tool_call.arguments }}
- {%- else %}
- {{- tool_call.arguments | tojson }}
- {%- endif %}
- {{- '}\n</tool_call>' }}
- {%- endfor %}
- {%- endif %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n<tool_response>\n' }}
- {{- message.content }}
- {{- '\n</tool_response>' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- endfor %}
- {%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
- {%- if enable_thinking is defined and enable_thinking is false %}
- {{- '<think>\n\n</think>\n\n' }}
- {%- endif %}
- {%- endif %}
 
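Note: the deleted template above is the standard ChatML-style Qwen3 chat template that transformers renders via `tokenizer.apply_chat_template`. A minimal sketch, assuming a local checkout at the pre-reorganization path:

```python
from transformers import AutoTokenizer

# Illustrative path: the folder as it existed before this commit
# cleared the repository root.
tok = AutoTokenizer.from_pretrained("ace-step/Qwen3-Embedding-0.6B")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize this track in one line."},
]

# Renders the messages through the Jinja template above, wrapping each
# turn in <|im_start|>...<|im_end|> markers.
prompt = tok.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)  # ends with the "<|im_start|>assistant" generation header
```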
ace-step/Qwen3-Embedding-0.6B/config.json DELETED
@@ -1,60 +0,0 @@
- {
- "architectures": [
- "Qwen3Model"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": 151643,
- "dtype": "bfloat16",
- "eos_token_id": 151643,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 1024,
- "initializer_range": 0.02,
- "intermediate_size": 3072,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 32768,
- "max_window_layers": 28,
- "model_type": "qwen3",
- "num_attention_heads": 16,
- "num_hidden_layers": 28,
- "num_key_value_heads": 8,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 1000000,
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "4.57.0.dev0",
- "use_cache": true,
- "use_sliding_window": false,
- "vocab_size": 151669
- }
 
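Note: the config above fully pins down the geometry of the embedding model. A small sketch of how the fields compose (values read straight from the JSON; the path is illustrative):

```python
import json

with open("ace-step/Qwen3-Embedding-0.6B/config.json") as f:
    cfg = json.load(f)

# Grouped-query attention: 16 query heads share 8 key/value heads,
# each of width head_dim = 128.
q_width = cfg["num_attention_heads"] * cfg["head_dim"]   # 16 * 128 = 2048
kv_width = cfg["num_key_value_heads"] * cfg["head_dim"]  # 8 * 128 = 1024

# All 28 layers use full (non-sliding-window) attention.
assert set(cfg["layer_types"]) == {"full_attention"}
assert len(cfg["layer_types"]) == cfg["num_hidden_layers"] == 28
print(cfg["model_type"], q_width, kv_width)  # qwen3 2048 1024
```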
ace-step/Qwen3-Embedding-0.6B/merges.txt DELETED
The diff for this file is too large to render.
 
ace-step/Qwen3-Embedding-0.6B/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0437e45c94563b09e13cb7a64478fc406947a93cb34a7e05870fc8dcd48e23fd
- size 1191586416
 
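Note: the three-line stanza above is a Git LFS pointer, not the tensor data itself; it records only the object's SHA-256 and byte size. A minimal parser sketch for that key-value format (the function name is ours):

```python
def parse_lfs_pointer(text: str) -> dict[str, str]:
    """Split a Git LFS pointer stanza into its key/value fields."""
    return dict(line.split(" ", 1) for line in text.strip().splitlines())

pointer = """\
version https://git-lfs.github.com/spec/v1
oid sha256:0437e45c94563b09e13cb7a64478fc406947a93cb34a7e05870fc8dcd48e23fd
size 1191586416
"""
fields = parse_lfs_pointer(pointer)
print(fields["oid"])                    # sha256:0437e4...
print(int(fields["size"]) / 1e9, "GB")  # ~1.19 GB of weights behind this pointer
```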
ace-step/Qwen3-Embedding-0.6B/special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
- {
- "additional_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "eos_token": {
- "content": "<|im_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
- }
 
ace-step/Qwen3-Embedding-0.6B/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:def76fb086971c7867b829c23a26261e38d9d74e02139253b38aeb9df8b4b50a
- size 11423705
 
ace-step/Qwen3-Embedding-0.6B/tokenizer_config.json DELETED
@@ -1,239 +0,0 @@
- {
- "add_bos_token": false,
- "add_prefix_space": false,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|im_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151645": {
- "content": "<|im_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151646": {
- "content": "<|object_ref_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|object_ref_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151648": {
- "content": "<|box_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151649": {
- "content": "<|box_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "<tool_call>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "</tool_call>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151665": {
- "content": "<tool_response>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151666": {
- "content": "</tool_response>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151667": {
- "content": "<think>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151668": {
- "content": "</think>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "additional_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": {},
- "model_max_length": 131072,
- "pad_token": "<|endoftext|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
- }
 
ace-step/Qwen3-Embedding-0.6B/vocab.json DELETED
The diff for this file is too large to render.
 
ace-step/README.md DELETED
@@ -1,99 +0,0 @@
- ---
- library_name: transformers
- license: mit
- pipeline_tag: text-to-audio
- tags:
- - audio
- - music
- - text2music
- ---
-
- <h1 align="center">ACE-Step 1.5</h1>
- <h1 align="center">Pushing the Boundaries of Open-Source Music Generation</h1>
- <p align="center">
- <a href="https://ace-step.github.io/ace-step-v1.5.github.io/">Project</a> |
- <a href="https://huggingface.co/collections/ACE-Step/ace-step-15">Hugging Face</a> |
- <a href="https://modelscope.cn/models/ACE-Step/Ace-Step1.5">ModelScope</a> |
- <a href="https://huggingface.co/spaces/ACE-Step/Ace-Step-v1.5">Space Demo</a> |
- <a href="https://discord.gg/PeWDxrkdj7">Discord</a>
- <a href="https://arxiv.org/abs/2602.00744">Tech Report</a>
- </p>
-
-
- ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/b84r7t0viIw7rKSr_ja9_.png)
-
- ## Model Details
-
- 🚀 **ACE-Step v1.5** is a highly efficient open-source music foundation model designed to bring commercial-grade music generation to consumer hardware.
-
- ### Key Features
-
- * **💰 Commercial-Ready:** Unlike many models trained on ambiguous datasets, ACE-Step v1.5 is designed for creators. You can strictly use the generated music for **commercial purposes**.
- * **📚 Safe & Robust Training Data:** The model is trained on a massive, legally compliant dataset consisting of:
- * **Licensed Data:** Professionally licensed music tracks.
- * **Royalty-Free / No-Copyright Data:** A vast collection of public domain and royalty-free music.
- * **Synthetic Data:** High-quality audio generated via advanced MIDI-to-Audio conversion.
- * **⚡ Extreme Speed:** Generates a full song in under 2 seconds on an A100 and under 10 seconds on an RTX 3090.
- * **🖥️ Consumer Hardware Friendly:** Runs locally with less than 4GB of VRAM.
-
- ### Technical Capabilities
-
- 🌉 At its core lies a novel hybrid architecture where the Language Model (LM) functions as an omni-capable planner: it transforms simple user queries into comprehensive song blueprints—scaling from short loops to 10-minute compositions—while synthesizing metadata, lyrics, and captions via Chain-of-Thought to guide the Diffusion Transformer (DiT). ⚡ Uniquely, this alignment is achieved through intrinsic reinforcement learning relying solely on the model's internal mechanisms, thereby eliminating the biases inherent in external reward models or human preferences. 🎚️
-
- 🔮 Beyond standard synthesis, ACE-Step v1.5 unifies precise stylistic control with versatile editing capabilities—such as cover generation, repainting, and vocal-to-BGM conversion—while maintaining strict adherence to prompts across 50+ languages. This paves the way for powerful tools that seamlessly integrate into the creative workflows of music artists, producers, and content creators. 🎸
-
- - **Developed by:** [ACE-STEP]
- - **Model type:** [Text2Music]
- - **Language(s):** [50+ languages]
- - **License:** [MIT]
-
- ## Evaluation
-
- ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/n9aKi_NhSmlMOgmGzahZi.png)
-
- ## 🏗️ Architecture
-
-
- ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/V_d1rTdqkQyoSM8td7OWl.png)
-
-
- ## 🦁 Model Zoo
-
-
- ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/B49V0OTKse_FRefTmTPsQ.png)
-
- ### DiT Models
-
- | DiT Model | Pre-Training | SFT | RL | CFG | Step | Refer audio | Text2Music | Cover | Repaint | Extract | Lego | Complete | Quality | Diversity | Fine-Tunability | Hugging Face |
- |-----------|:------------:|:---:|:--:|:---:|:----:|:-----------:|:----------:|:-----:|:-------:|:-------:|:----:|:--------:|:-------:|:---------:|:---------------:|--------------|
- | `acestep-v15-base` | ✅ | ❌ | ❌ | ✅ | 50 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | High | Easy | [Link](https://huggingface.co/ACE-Step/acestep-v15-base) |
- | `acestep-v15-sft` | ✅ | ✅ | ❌ | ✅ | 50 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | High | Medium | Easy | [Link](https://huggingface.co/ACE-Step/acestep-v15-sft) |
- | `acestep-v15-turbo` | ✅ | ✅ | ❌ | ❌ | 8 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | Very High | Medium | Medium | [Link](https://huggingface.co/ACE-Step/Ace-Step1.5) |
- | `acestep-v15-turbo-rl` | ✅ | ✅ | ✅ | ❌ | 8 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | Very High | Medium | Medium | To be released |
-
- ### LM Models
-
- | LM Model | Pretrain from | Pre-Training | SFT | RL | CoT metas | Query rewrite | Audio Understanding | Composition Capability | Copy Melody | Hugging Face |
- |----------|---------------|:------------:|:---:|:--:|:---------:|:-------------:|:-------------------:|:----------------------:|:-----------:|--------------|
- | `acestep-5Hz-lm-0.6B` | Qwen3-0.6B | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | Medium | Weak | ✅ |
- | `acestep-5Hz-lm-1.7B` | Qwen3-1.7B | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | Medium | Medium | ✅ |
- | `acestep-5Hz-lm-4B` | Qwen3-4B | ✅ | ✅ | ✅ | ✅ | ✅ | Strong | Strong | Strong | ✅ |
-
-
- ## 🙏 Acknowledgements
-
- This project is co-led by ACE Studio and StepFun.
-
-
- ## 📖 Citation
-
- If you find this project useful for your research, please consider citing:
-
- ```BibTeX
- @misc{gong2026acestep,
- title={ACE-Step 1.5: Pushing the Boundaries of Open-Source Music Generation},
- author={Junmin Gong, Yulin Song, Wenxiao Zhao, Sen Wang, Shengyuan Xu, Jing Guo},
- howpublished={\url{https://github.com/ace-step/ACE-Step-1.5}},
- year={2026},
- note={GitHub repository}
- }
 
ace-step/acestep-5Hz-lm-1.7B/added_tokens.json DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-1.7B/chat_template.jinja DELETED
@@ -1,89 +0,0 @@
- {%- if tools %}
- {{- '<|im_start|>system\n' }}
- {%- if messages[0].role == 'system' %}
- {{- messages[0].content + '\n\n' }}
- {%- endif %}
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
- {%- for tool in tools %}
- {{- "\n" }}
- {{- tool | tojson }}
- {%- endfor %}
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
- {%- else %}
- {%- if messages[0].role == 'system' %}
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
- {%- for message in messages[::-1] %}
- {%- set index = (messages|length - 1) - loop.index0 %}
- {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
- {%- set ns.multi_step_tool = false %}
- {%- set ns.last_query_index = index %}
- {%- endif %}
- {%- endfor %}
- {%- for message in messages %}
- {%- if message.content is string %}
- {%- set content = message.content %}
- {%- else %}
- {%- set content = '' %}
- {%- endif %}
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
- {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
- {%- elif message.role == "assistant" %}
- {%- set reasoning_content = '' %}
- {%- if message.reasoning_content is string %}
- {%- set reasoning_content = message.reasoning_content %}
- {%- else %}
- {%- if '</think>' in content %}
- {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
- {%- set content = content.split('</think>')[-1].lstrip('\n') %}
- {%- endif %}
- {%- endif %}
- {%- if loop.index0 > ns.last_query_index %}
- {%- if loop.last or (not loop.last and reasoning_content) %}
- {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
- {%- else %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- endif %}
- {%- else %}
- {{- '<|im_start|>' + message.role + '\n' + content }}
- {%- endif %}
- {%- if message.tool_calls %}
- {%- for tool_call in message.tool_calls %}
- {%- if (loop.first and content) or (not loop.first) %}
- {{- '\n' }}
- {%- endif %}
- {%- if tool_call.function %}
- {%- set tool_call = tool_call.function %}
- {%- endif %}
- {{- '<tool_call>\n{"name": "' }}
- {{- tool_call.name }}
- {{- '", "arguments": ' }}
- {%- if tool_call.arguments is string %}
- {{- tool_call.arguments }}
- {%- else %}
- {{- tool_call.arguments | tojson }}
- {%- endif %}
- {{- '}\n</tool_call>' }}
- {%- endfor %}
- {%- endif %}
- {{- '<|im_end|>\n' }}
- {%- elif message.role == "tool" %}
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
- {{- '<|im_start|>user' }}
- {%- endif %}
- {{- '\n<tool_response>\n' }}
- {{- content }}
- {{- '\n</tool_response>' }}
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
- {{- '<|im_end|>\n' }}
- {%- endif %}
- {%- endif %}
- {%- endfor %}
- {%- if add_generation_prompt %}
- {{- '<|im_start|>assistant\n' }}
- {%- if enable_thinking is defined and enable_thinking is false %}
- {{- '<think>\n\n</think>\n\n' }}
- {%- endif %}
- {%- endif %}
 
ace-step/acestep-5Hz-lm-1.7B/config.json DELETED
@@ -1,61 +0,0 @@
- {
- "architectures": [
- "Qwen3Model"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": 151643,
- "dtype": "bfloat16",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2048,
- "initializer_range": 0.02,
- "intermediate_size": 6144,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 40960,
- "max_window_layers": 28,
- "model_type": "qwen3",
- "num_attention_heads": 16,
- "num_hidden_layers": 28,
- "num_key_value_heads": 8,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 1000000,
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "4.57.0.dev0",
- "use_cache": true,
- "use_sliding_window": false,
- "vocab_size": 217204
- }
 
ace-step/acestep-5Hz-lm-1.7B/merges.txt DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-1.7B/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f161689da73e5ecefa28ff780d51c2d92a00f056d021d7933c779ed5c6cd7db8
- size 3708521528
 
ace-step/acestep-5Hz-lm-1.7B/special_tokens_map.json DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-1.7B/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:35af56c3f5cb3ea2cc578aa28a8937770981d504f183ac5c8c38baf4bbd4af4d
- size 24321939
 
ace-step/acestep-5Hz-lm-1.7B/tokenizer_config.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6cd70cdd89425971794f5235562edcc608b0629a6c4686ae51a8b8c8b8ba5e95
- size 14072925
 
ace-step/acestep-5Hz-lm-1.7B/vocab.json DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-4B/Unconfirmed 786712.crdownload DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:75f193be8e6ec67e0cd154b6b8891af451f248458058ae6589c64cbdd78d8601
- size 3161911734
 
ace-step/acestep-5Hz-lm-4B/added_tokens.json DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-4B/config.json DELETED
@@ -1,69 +0,0 @@
- {
- "architectures": [
- "Qwen3ForCausalLM"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": 151643,
- "dtype": "bfloat16",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2560,
- "initializer_range": 0.02,
- "intermediate_size": 9728,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 40960,
- "max_window_layers": 36,
- "model_type": "qwen3",
- "num_attention_heads": 32,
- "num_hidden_layers": 36,
- "num_key_value_heads": 8,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_scaling": null,
- "rope_theta": 1000000,
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "4.57.1",
- "use_cache": true,
- "use_sliding_window": false,
- "vocab_size": 217204
- }
 
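Note: as a sanity check, the parameter count implied by this config matches both the model's name and the `total_size` recorded in the shard index below (8,379,108,352 bytes ≈ 4.19 B bfloat16 parameters). A back-of-envelope sketch that ignores the small norm weights:

```python
# Fields copied from the 4B config above (embeddings tied with the
# output head, so counted once).
hidden, inter, layers = 2560, 9728, 36
heads, kv_heads, head_dim = 32, 8, 128
vocab = 217204

attn = (
    hidden * heads * head_dim           # q_proj
    + 2 * hidden * kv_heads * head_dim  # k_proj + v_proj
    + heads * head_dim * hidden         # o_proj
)
mlp = 3 * hidden * inter                # gate_proj + up_proj + down_proj
embed = vocab * hidden

total = layers * (attn + mlp) + embed
print(f"{total / 1e9:.2f}B params")         # ~4.19B
print(f"{2 * total / 1e9:.2f} GB in bf16")  # ~8.38 GB, matching total_size
```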
ace-step/acestep-5Hz-lm-4B/merges.txt DELETED
The diff for this file is too large to render.
 
ace-step/acestep-5Hz-lm-4B/model.safetensors.index.json DELETED
@@ -1,405 +0,0 @@
- {
- "metadata": {
- "total_size": 8379108352
- },
- "weight_map": {
- "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
257
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
258
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
259
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
260
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
261
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
262
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
263
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
264
- "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
265
- "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
266
- "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
267
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
268
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
269
- "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
270
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
271
- "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
272
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
273
- "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
274
- "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
275
- "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
276
- "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
277
- "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
278
- "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
279
- "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
280
- "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
281
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
282
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
283
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
284
- "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
285
- "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
286
- "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
287
- "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
288
- "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
289
- "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
290
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
291
- "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
292
- "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
293
- "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
294
- "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
295
- "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
296
- "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
297
- "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
298
- "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
299
- "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
300
- "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
301
- "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
302
- "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
303
- "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
304
- "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
305
- "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
306
- "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
307
- "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
308
- "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
309
- "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
310
- "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
311
- "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
312
- "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
313
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
314
- "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
315
- "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
316
- "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
317
- "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
318
- "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
319
- "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
320
- "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
321
- "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
322
- "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
323
- "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
324
- "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
325
- "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
326
- "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
327
- "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
328
- "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
329
- "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
330
- "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
331
- "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
332
- "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
333
- "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
334
- "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
335
- "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
336
- "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
337
- "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
338
- "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
339
- "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
340
- "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
341
- "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
342
- "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
343
- "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
344
- "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
345
- "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
346
- "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
347
- "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
348
- "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
349
- "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
350
- "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
351
- "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
352
- "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
353
- "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
354
- "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
355
- "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
356
- "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
357
- "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
358
- "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
359
- "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
360
- "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
361
- "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
362
- "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
363
- "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
364
- "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
365
- "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
366
- "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
367
- "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
368
- "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
369
- "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
370
- "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
371
- "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
372
- "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
373
- "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
374
- "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
375
- "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
376
- "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
377
- "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
378
- "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
379
- "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
380
- "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
381
- "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
382
- "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
383
- "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
384
- "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
385
- "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
386
- "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
387
- "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
388
- "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
389
- "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
390
- "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
391
- "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
392
- "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
393
- "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
394
- "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
395
- "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
396
- "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
397
- "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
398
- "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
399
- "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
400
- "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
401
- "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
402
- "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
403
- "model.norm.weight": "model-00002-of-00002.safetensors"
404
- }
405
- }
 
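Note: the `weight_map` tail above shows the checkpoint split mid-layer — layer 19's attention weights sit in `model-00001-of-00002.safetensors` while its MLP weights begin `model-00002-of-00002.safetensors`, and layers 20-35 plus the final norm follow in shard 2. A minimal sketch of how a loader resolves a tensor through such an index; the local paths are an illustrative assumption, and `safetensors` is the standard reader, not repo-specific API:

```python
import json
from safetensors import safe_open

# Load the shard index deleted above (path assumes a local copy of the folder).
with open("acestep-5Hz-lm-4B/model.safetensors.index.json") as f:
    index = json.load(f)

# Layer 19 straddles the shard boundary: its MLP weights map to shard 2.
name = "model.layers.19.mlp.gate_proj.weight"
shard = index["weight_map"][name]  # -> "model-00002-of-00002.safetensors"

with safe_open(f"acestep-5Hz-lm-4B/{shard}", framework="pt") as shard_file:
    tensor = shard_file.get_tensor(name)
print(name, "->", shard, tuple(tensor.shape))
```
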
ace-step/acestep-5Hz-lm-4B/special_tokens_map.json DELETED
The diff for this file is too large to render. See raw diff
 
ace-step/acestep-5Hz-lm-4B/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:35af56c3f5cb3ea2cc578aa28a8937770981d504f183ac5c8c38baf4bbd4af4d
- size 24321939
 
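Note: for LFS-tracked files such as `tokenizer.json`, the diff records only the three-line Git LFS pointer (spec version, sha256 oid, byte size), not the roughly 24 MB payload it stands for. A small self-contained sketch of how such a pointer decomposes; the parser below is illustrative, not part of this repo:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Split a Git LFS pointer into its spec version, oid algorithm/digest, and size."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "algo": algo,
            "digest": digest, "size_bytes": int(fields["size"])}

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:35af56c3f5cb3ea2cc578aa28a8937770981d504f183ac5c8c38baf4bbd4af4d\n"
    "size 24321939\n"
)
print(parse_lfs_pointer(pointer))  # size_bytes == 24321939 (~24 MB blob)
```
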
ace-step/acestep-5Hz-lm-4B/tokenizer_config.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6cd70cdd89425971794f5235562edcc608b0629a6c4686ae51a8b8c8b8ba5e95
- size 14072925
 
ace-step/acestep-5Hz-lm-4B/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
ace-step/acestep-v15-base/apg_guidance.py DELETED
@@ -1,220 +0,0 @@
- import torch
- import torch.nn.functional as F
-
-
- class MomentumBuffer:
-
-     def __init__(self, momentum: float = -0.75):
-         self.momentum = momentum
-         self.running_average = 0
-
-     def update(self, update_value: torch.Tensor):
-         new_average = self.momentum * self.running_average
-         self.running_average = update_value + new_average
-
-
- def project(
-     v0: torch.Tensor,  # [B, C, T]
-     v1: torch.Tensor,  # [B, C, T]
-     dims=[-1],
- ):
-     dtype = v0.dtype
-     device_type = v0.device.type
-     if device_type == "mps":
-         v0, v1 = v0.cpu(), v1.cpu()
-
-     v0, v1 = v0.double(), v1.double()
-     v1 = torch.nn.functional.normalize(v1, dim=dims)
-     v0_parallel = (v0 * v1).sum(dim=dims, keepdim=True) * v1
-     v0_orthogonal = v0 - v0_parallel
-     return v0_parallel.to(dtype).to(device_type), v0_orthogonal.to(dtype).to(device_type)
-
-
- def apg_forward(
-     pred_cond: torch.Tensor,  # [B, C, T]
-     pred_uncond: torch.Tensor,  # [B, C, T]
-     guidance_scale: float,
-     momentum_buffer: MomentumBuffer = None,
-     eta: float = 0.0,
-     norm_threshold: float = 2.5,
-     dims=[-1],
- ):
-     diff = pred_cond - pred_uncond
-     if momentum_buffer is not None:
-         momentum_buffer.update(diff)
-         diff = momentum_buffer.running_average
-
-     if norm_threshold > 0:
-         ones = torch.ones_like(diff)
-         diff_norm = diff.norm(p=2, dim=dims, keepdim=True)
-         scale_factor = torch.minimum(ones, norm_threshold / diff_norm)
-         diff = diff * scale_factor
-
-     diff_parallel, diff_orthogonal = project(diff, pred_cond, dims)
-     normalized_update = diff_orthogonal + eta * diff_parallel
-     pred_guided = pred_cond + (guidance_scale - 1) * normalized_update
-     return pred_guided
-
-
- def cfg_forward(cond_output, uncond_output, cfg_strength):
-     return uncond_output + cfg_strength * (cond_output - uncond_output)
-
-
- def call_cos_tensor(tensor1, tensor2):
-     """
-     Calculate cosine similarity between two normalized tensors.
-
-     Args:
-         tensor1: First tensor [B, ...]
-         tensor2: Second tensor [B, ...]
-
-     Returns:
-         Cosine similarity value [B, 1]
-     """
-     tensor1 = tensor1 / torch.linalg.norm(tensor1, dim=1, keepdim=True)
-     tensor2 = tensor2 / torch.linalg.norm(tensor2, dim=1, keepdim=True)
-     cosvalue = torch.sum(tensor1 * tensor2, dim=1, keepdim=True)
-     return cosvalue
-
-
- def compute_perpendicular_component(latent_diff, latent_hat_uncond):
-     """
-     Decompose latent_diff into parallel and perpendicular components relative to latent_hat_uncond.
-
-     Args:
-         latent_diff: Difference tensor [B, C, ...]
-         latent_hat_uncond: Unconditional prediction tensor [B, C, ...]
-
-     Returns:
-         projection: Parallel component
-         perpendicular_component: Perpendicular component
-     """
-     n, t, c = latent_diff.shape
-     latent_diff = latent_diff.view(n * t, c).float()
-     latent_hat_uncond = latent_hat_uncond.view(n * t, c).float()
-
-     if latent_diff.size() != latent_hat_uncond.size():
-         raise ValueError("latent_diff and latent_hat_uncond must have the same shape [n, d].")
-
-     dot_product = torch.sum(latent_diff * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
-     norm_square = torch.sum(latent_hat_uncond * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
-     projection = (dot_product / (norm_square + 1e-8)) * latent_hat_uncond
-     perpendicular_component = latent_diff - projection
-
-     return projection.view(n, t, c), perpendicular_component.reshape(n, t, c)
-
-
- def adg_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: torch.Tensor,
-     guidance_scale: float,
-     angle_clip: float = 3.14 / 6,  # pi/6 by default
-     apply_norm: bool = False,
-     apply_clip: bool = True,
- ):
-     """
-     ADG (Angle-based Dynamic Guidance) forward pass for Flow Matching.
-
-     In flow matching (including SD3), sigma represents the current timestep t_curr.
-     The predictions are velocity fields v(x_t, t).
-
-     Args:
-         latents: Current state x_t [N, T, d] where d=64
-         noise_pred_cond: Conditional velocity prediction v_cond [N, T, d]
-         noise_pred_uncond: Unconditional velocity prediction v_uncond [N, T, d]
-         sigma: Current timestep t_curr (not t_prev!)
-         guidance_scale: Guidance strength
-         angle_clip: Maximum angle for clipping (default: pi/6)
-         apply_norm: Whether to normalize the result (ADG_w_norm variant)
-         apply_clip: Whether to clip the angle (ADG_wo_clip when False)
-
-     Returns:
-         Guided velocity prediction [N, T, d]
-     """
-     # Get batch size
-     n = noise_pred_cond.shape[0]
-     noise_pred_text = noise_pred_cond
-     n, t, c = noise_pred_text.shape
-
-     # Ensure sigma/t has the right shape for broadcasting [N, 1, 1]
-     if isinstance(sigma, (int, float)):
-         sigma = torch.tensor(sigma, device=latents.device, dtype=latents.dtype)
-         sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
-     elif torch.is_tensor(sigma):
-         if sigma.numel() == 1:
-             sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
-         elif sigma.numel() == n:
-             sigma = sigma.view(n, 1, 1)
-         else:
-             raise ValueError(f"sigma has incompatible shape. Expected scalar or size {n}, got {sigma.shape}")
-     else:
-         raise TypeError(f"sigma must be a number or tensor, got {type(sigma)}")
-
-     # Adjust guidance weight
-     weight = guidance_scale - 1
-     weight = weight * (weight > 0) + 1e-3
-
-     latent_hat_text = latents - sigma * noise_pred_text
-     latent_hat_uncond = latents - sigma * noise_pred_uncond
-     latent_diff = latent_hat_text - latent_hat_uncond
-
-     # Calculate angle between conditional and unconditional predicted data
-     latent_theta = torch.acos(
-         call_cos_tensor(latent_hat_text.view(-1, c).to(float),
-                         latent_hat_uncond.reshape(-1, c).contiguous().to(float)))
-     latent_theta_new = torch.clip(weight * latent_theta, -angle_clip, angle_clip) if apply_clip else weight * latent_theta
-     proj, perp = compute_perpendicular_component(latent_diff, latent_hat_uncond)
-     latent_v_new = torch.cos(latent_theta_new) * latent_hat_text
-
-     latent_p_new = perp * torch.sin(latent_theta_new) / torch.sin(latent_theta) * (
-         torch.sin(latent_theta) > 1e-3) + perp * weight * (torch.sin(latent_theta) <= 1e-3)
-     latent_new = latent_v_new + latent_p_new
-     if apply_norm:
-         latent_new = latent_new * torch.linalg.norm(latent_hat_text, dim=1, keepdim=True) / torch.linalg.norm(
-             latent_new, dim=1, keepdim=True)
-
-     noise_pred = (latents - latent_new) / sigma
-     noise_pred = noise_pred.reshape(n, t, c).to(latents.dtype)
-     return noise_pred
-
-
- def adg_w_norm_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: float,
-     guidance_scale: float,
-     angle_clip: float = 3.14 / 3,
- ):
-     """
-     ADG with normalization - preserves the magnitude of latent predictions.
-
-     This variant normalizes the final latent to maintain the same norm as the
-     conditional prediction, which can help preserve image quality.
-     """
-     return adg_forward(latents,
-                        noise_pred_cond,
-                        noise_pred_uncond,
-                        sigma,
-                        guidance_scale,
-                        angle_clip=angle_clip,
-                        apply_norm=True,
-                        apply_clip=True)
-
-
- def adg_wo_clip_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: float,
-     guidance_scale: float,
- ):
-     """
-     ADG without angle clipping - allows unbounded angle adjustments.
-
-     This variant doesn't clip the angle, which may result in more aggressive
-     guidance but could be less stable.
-     """
-     return adg_forward(latents, noise_pred_cond, noise_pred_uncond, sigma, guidance_scale, apply_norm=False, apply_clip=False)
 
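Note: the deleted `apg_guidance.py` bundles three guidance rules — plain CFG (`cfg_forward`), Adaptive Projected Guidance (`apg_forward`, which rescales the cond/uncond difference and keeps mostly its component orthogonal to the conditional prediction), and angle-based ADG (`adg_forward`). A minimal usage sketch for the APG path, assuming the module is importable from a local copy; the toy denoiser, step count, and Euler update below are stand-ins, not the repo's sampler:

```python
import torch
from apg_guidance import MomentumBuffer, apg_forward  # the module shown above

def toy_denoiser(x: torch.Tensor, conditioned: bool) -> torch.Tensor:
    # Stand-in for the real velocity model; returns a random prediction.
    return torch.randn_like(x)

momentum = MomentumBuffer(momentum=-0.75)  # reused across all sampling steps
latents = torch.randn(1, 64, 750)          # [B, C, T]; shape is an assumption

for _ in range(8):                         # stand-in for the sampler's timestep loop
    pred_cond = toy_denoiser(latents, conditioned=True)
    pred_uncond = toy_denoiser(latents, conditioned=False)
    guided = apg_forward(pred_cond, pred_uncond, guidance_scale=7.0,
                         momentum_buffer=momentum, eta=0.0, norm_threshold=2.5)
    latents = latents - 0.1 * guided       # toy Euler step, not the repo's scheduler
```
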
ace-step/acestep-v15-base/config.json DELETED
@@ -1,81 +0,0 @@
- {
-   "architectures": [
-     "AceStepConditionGenerationModel"
-   ],
-   "auto_map": {
-     "AutoConfig": "configuration_acestep_v15.AceStepConfig",
-     "AutoModel": "modeling_acestep_v15_base.AceStepConditionGenerationModel"
-   },
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "audio_acoustic_hidden_dim": 64,
-   "data_proportion": 0.5,
-   "dtype": "bfloat16",
-   "fsq_dim": 2048,
-   "fsq_input_levels": [
-     8,
-     8,
-     8,
-     5,
-     5,
-     5
-   ],
-   "fsq_input_num_quantizers": 1,
-   "head_dim": 128,
-   "hidden_act": "silu",
-   "hidden_size": 2048,
-   "in_channels": 192,
-   "initializer_range": 0.02,
-   "intermediate_size": 6144,
-   "layer_types": [
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention"
-   ],
-   "max_position_embeddings": 32768,
-   "model_type": "acestep",
-   "num_attention_heads": 16,
-   "num_attention_pooler_hidden_layers": 2,
-   "num_audio_decoder_hidden_layers": 24,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 8,
-   "num_lyric_encoder_hidden_layers": 8,
-   "num_timbre_encoder_hidden_layers": 4,
-   "patch_size": 2,
-   "pool_window_size": 5,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 1000000,
-   "sliding_window": 128,
-   "text_hidden_dim": 1024,
-   "timbre_fix_frame": 750,
-   "timbre_hidden_dim": 64,
-   "timestep_mu": -0.4,
-   "timestep_sigma": 1.0,
-   "transformers_version": "4.57.0.dev0",
-   "use_cache": true,
-   "use_sliding_window": true,
-   "vocab_size": 64003,
-   "is_turbo": false
- }
 
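Note: the `auto_map` block above is what let `AutoConfig`/`AutoModel` resolve the custom `AceStepConfig` and `AceStepConditionGenerationModel` classes shipped alongside the config. A minimal loading sketch, assuming a local copy of the (now removed) `acestep-v15-base` folder; `trust_remote_code=True` is required because the classes live in the repo rather than in Transformers:

```python
from transformers import AutoConfig, AutoModel

# Path assumes a local copy of the deleted folder.
config = AutoConfig.from_pretrained("./acestep-v15-base", trust_remote_code=True)
assert config.model_type == "acestep"
assert config.num_hidden_layers == 24 and config.hidden_size == 2048

model = AutoModel.from_pretrained("./acestep-v15-base", trust_remote_code=True)
```
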
ace-step/acestep-v15-base/configuration_acestep_v15.py DELETED
@@ -1,263 +0,0 @@
- # coding=utf-8
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """AceStep model configuration"""
-
- from transformers.configuration_utils import PretrainedConfig, layer_type_validation
- from transformers.modeling_rope_utils import rope_config_validation
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
-
- class AceStepConfig(PretrainedConfig):
-     r"""
-     This is the configuration class to store the configuration of a [`AceStepModel`]. It is used to instantiate an
-     AceStep model according to the specified arguments, defining the model architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-     Args:
-         vocab_size (`int`, *optional*, defaults to 64003):
-             Vocabulary size of the AceStep model. Defines the number of different tokens that can be represented by the
-             `inputs_ids` passed when calling the model.
-         hidden_size (`int`, *optional*, defaults to 4096):
-             Dimension of the hidden representations.
-         intermediate_size (`int`, *optional*, defaults to 22016):
-             Dimension of the MLP representations.
-         num_hidden_layers (`int`, *optional*, defaults to 32):
-             Number of hidden layers in the Transformer encoder.
-         num_attention_heads (`int`, *optional*, defaults to 32):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         num_key_value_heads (`int`, *optional*, defaults to 32):
-             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-             by meanpooling all the original heads within that group. For more details, check out [this
-             paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
-         head_dim (`int`, *optional*, defaults to 128):
-             The attention head dimension.
-         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-             The non-linear activation function (function or string) in the decoder.
-         max_position_embeddings (`int`, *optional*, defaults to 32768):
-             The maximum sequence length that this model might ever be used with.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-             The epsilon used by the rms normalization layers.
-         use_cache (`bool`, *optional*, defaults to `True`):
-             Whether or not the model should return the last key/values attentions (not used by all models). Only
-             relevant if `config.is_decoder=True`.
-         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-             Whether the model's input and output word embeddings should be tied.
-         rope_theta (`float`, *optional*, defaults to 10000.0):
-             The base period of the RoPE embeddings.
-         rope_scaling (`Dict`, *optional*):
-             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
-             and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
-             accordingly.
-             Expected contents:
-                 `rope_type` (`str`):
-                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-                     'llama3'], with 'default' being the original RoPE implementation.
-                 `factor` (`float`, *optional*):
-                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
-                     most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                     original maximum pre-trained length.
-                 `original_max_position_embeddings` (`int`, *optional*):
-                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                     pretraining.
-                 `attention_factor` (`float`, *optional*):
-                     Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
-                     computation. If unspecified, it defaults to value recommended by the implementation, using the
-                     `factor` field to infer the suggested value.
-                 `beta_fast` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 32.
-                 `beta_slow` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 1.
-                 `short_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                     size divided by the number of attention heads divided by 2
-                 `long_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to long contexts (<
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                     size divided by the number of attention heads divided by 2
-                 `low_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
-                 `high_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
-         attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
-             Whether to use a bias in the query, key, value and output projection layers during self-attention.
-         use_sliding_window (`bool`, *optional*, defaults to `False`):
-             Whether to use sliding window attention.
-         sliding_window (`int`, *optional*, defaults to 4096):
-             Sliding window attention (SWA) window size. If not specified, will default to `4096`.
-         layer_types (`list`, *optional*):
-             Attention pattern for each layer.
-         attention_dropout (`float`, *optional*, defaults to 0.0):
-             The dropout ratio for the attention probabilities.
-
-     ```python
-     >>> from acestep.models import AceStepConfig
-
-     >>> # Initializing an AceStep configuration
-     >>> configuration = AceStepConfig()
-
-     >>> # Initializing a model from the configuration
-     >>> model = AceStepConditionGenerationModel(configuration)
-
-     >>> # Accessing the model configuration
-     >>> configuration = model.config
-     ```"""
-
-     model_type = "acestep"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     # Default tensor parallel plan for the base model
-     base_model_tp_plan = {
-         "layers.*.self_attn.q_proj": "colwise",
-         "layers.*.self_attn.k_proj": "colwise",
-         "layers.*.self_attn.v_proj": "colwise",
-         "layers.*.self_attn.o_proj": "rowwise",
-         "layers.*.mlp.gate_proj": "colwise",
-         "layers.*.mlp.up_proj": "colwise",
-         "layers.*.mlp.down_proj": "rowwise",
-     }
-     base_model_pp_plan = {
-         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-         "norm": (["hidden_states"], ["hidden_states"]),
-     }
-     def __init__(
-         self,
-         vocab_size=64003,
-         fsq_dim=2048,
-         fsq_input_levels=[8, 8, 8, 5, 5, 5],
-         fsq_input_num_quantizers=1,
-         hidden_size=2048,
-         intermediate_size=6144,
-         num_hidden_layers=24,
-         num_attention_heads=16,
-         num_key_value_heads=8,
-         head_dim=128,
-         hidden_act="silu",
-         max_position_embeddings=32768,
-         initializer_range=0.02,
-         rms_norm_eps=1e-6,
-         use_cache=True,
-         tie_word_embeddings=True,
-         rope_theta=1000000,
-         rope_scaling=None,
-         attention_bias=False,
-         use_sliding_window=True,
-         sliding_window=128,
-         layer_types=None,
-         attention_dropout=0.0,
-         num_lyric_encoder_hidden_layers=8,
-         audio_acoustic_hidden_dim=64,
-         pool_window_size=5,
-         text_hidden_dim=1024,
-         in_channels=192,
-         data_proportion=0.5,
-         timestep_mu=-0.4,
-         timestep_sigma=1.0,
-         timbre_hidden_dim=64,
-         num_timbre_encoder_hidden_layers=4,
-         timbre_fix_frame=750,
-         patch_size=2,
-         num_attention_pooler_hidden_layers=2,
-         num_audio_decoder_hidden_layers=24,
-         model_version="turbo",
-         **kwargs,
-     ):
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.use_sliding_window = use_sliding_window
-         self.sliding_window = sliding_window if self.use_sliding_window else None
-
-         # Text encoder configuration
-         self.text_hidden_dim = text_hidden_dim
-
-         # Lyric encoder configuration
-         self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
-         self.patch_size = patch_size
-
-         # Audio semantic token generation configuration
-         self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
-         self.pool_window_size = pool_window_size
-         self.in_channels = in_channels
-         self.data_proportion = data_proportion
-         self.timestep_mu = timestep_mu
-         self.timestep_sigma = timestep_sigma
-
-         # FSQ (Finite Scalar Quantization) configuration
-         self.fsq_dim = fsq_dim
-         self.fsq_input_levels = fsq_input_levels
-         self.fsq_input_num_quantizers = fsq_input_num_quantizers
-
-         # Timbre encoder configuration
-         self.timbre_hidden_dim = timbre_hidden_dim
-         self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
-         self.timbre_fix_frame = timbre_fix_frame
-         self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
-         self.num_audio_decoder_hidden_layers = num_audio_decoder_hidden_layers
-         self.vocab_size = vocab_size
-
-         # Backward compatibility: ensure num_key_value_heads is set
-         if num_key_value_heads is None:
-             num_key_value_heads = num_attention_heads
-
-         self.num_key_value_heads = num_key_value_heads
-         self.head_dim = head_dim
-         self.hidden_act = hidden_act
-         self.initializer_range = initializer_range
-         self.rms_norm_eps = rms_norm_eps
-         self.use_cache = use_cache
-         self.rope_theta = rope_theta
-         self.rope_scaling = rope_scaling
-         self.attention_bias = attention_bias
-         self.attention_dropout = attention_dropout
-         self.model_version = model_version
-
-         # Validate rotary position embeddings parameters
-         # Backward compatibility: if there is a 'type' field, move it to 'rope_type'
-         if self.rope_scaling is not None and "type" in self.rope_scaling:
-             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
-         rope_config_validation(self)
-
-         self.layer_types = layer_types
-
-         # Set default layer types if not specified
-         if self.layer_types is None:
-             self.layer_types = [
-                 "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers)
-             ]
-         layer_type_validation(self.layer_types)
-
-         super().__init__(
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
-
-
- __all__ = ["AceStepConfig"]
 
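Note: the `layer_types` fallback in `__init__` above alternates sliding-window and full attention, which is exactly the 24-entry pattern pinned explicitly in `config.json`. A quick, self-contained check of that equivalence:

```python
# Reproduce the default computed in AceStepConfig.__init__ for 24 layers.
layer_types = [
    "sliding_attention" if (i + 1) % 2 else "full_attention"
    for i in range(24)
]
# Odd-numbered layers (1st, 3rd, ...) use the 128-token sliding window;
# even-numbered layers attend over the full sequence.
assert layer_types[:2] == ["sliding_attention", "full_attention"]
assert layer_types == ["sliding_attention", "full_attention"] * 12
```
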
ace-step/acestep-v15-base/modeling_acestep_v15_base.py DELETED
The diff for this file is too large to render. See raw diff
 
ace-step/acestep-v15-base/silence_latent.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a778e9dd942f5e8b2c09c55370782d318834432b03dabbcdf70e6ed49ad6358b
- size 3841215
 
ace-step/acestep-v15-sft/apg_guidance.py DELETED
@@ -1,220 +0,0 @@
- import torch
- import torch.nn.functional as F
-
-
- class MomentumBuffer:
-
-     def __init__(self, momentum: float = -0.75):
-         self.momentum = momentum
-         self.running_average = 0
-
-     def update(self, update_value: torch.Tensor):
-         new_average = self.momentum * self.running_average
-         self.running_average = update_value + new_average
-
-
- def project(
-     v0: torch.Tensor,  # [B, C, T]
-     v1: torch.Tensor,  # [B, C, T]
-     dims=[-1],
- ):
-     dtype = v0.dtype
-     device_type = v0.device.type
-     if device_type == "mps":
-         v0, v1 = v0.cpu(), v1.cpu()
-
-     v0, v1 = v0.double(), v1.double()
-     v1 = torch.nn.functional.normalize(v1, dim=dims)
-     v0_parallel = (v0 * v1).sum(dim=dims, keepdim=True) * v1
-     v0_orthogonal = v0 - v0_parallel
-     return v0_parallel.to(dtype).to(device_type), v0_orthogonal.to(dtype).to(device_type)
-
-
- def apg_forward(
-     pred_cond: torch.Tensor,  # [B, C, T]
-     pred_uncond: torch.Tensor,  # [B, C, T]
-     guidance_scale: float,
-     momentum_buffer: MomentumBuffer = None,
-     eta: float = 0.0,
-     norm_threshold: float = 2.5,
-     dims=[-1],
- ):
-     diff = pred_cond - pred_uncond
-     if momentum_buffer is not None:
-         momentum_buffer.update(diff)
-         diff = momentum_buffer.running_average
-
-     if norm_threshold > 0:
-         ones = torch.ones_like(diff)
-         diff_norm = diff.norm(p=2, dim=dims, keepdim=True)
-         scale_factor = torch.minimum(ones, norm_threshold / diff_norm)
-         diff = diff * scale_factor
-
-     diff_parallel, diff_orthogonal = project(diff, pred_cond, dims)
-     normalized_update = diff_orthogonal + eta * diff_parallel
-     pred_guided = pred_cond + (guidance_scale - 1) * normalized_update
-     return pred_guided
-
-
- def cfg_forward(cond_output, uncond_output, cfg_strength):
-     return uncond_output + cfg_strength * (cond_output - uncond_output)
-
-
- def call_cos_tensor(tensor1, tensor2):
-     """
-     Calculate cosine similarity between two normalized tensors.
-
-     Args:
-         tensor1: First tensor [B, ...]
-         tensor2: Second tensor [B, ...]
-
-     Returns:
-         Cosine similarity value [B, 1]
-     """
-     tensor1 = tensor1 / torch.linalg.norm(tensor1, dim=1, keepdim=True)
-     tensor2 = tensor2 / torch.linalg.norm(tensor2, dim=1, keepdim=True)
-     cosvalue = torch.sum(tensor1 * tensor2, dim=1, keepdim=True)
-     return cosvalue
-
-
- def compute_perpendicular_component(latent_diff, latent_hat_uncond):
-     """
-     Decompose latent_diff into parallel and perpendicular components relative to latent_hat_uncond.
-
-     Args:
-         latent_diff: Difference tensor [B, C, ...]
-         latent_hat_uncond: Unconditional prediction tensor [B, C, ...]
-
-     Returns:
-         projection: Parallel component
-         perpendicular_component: Perpendicular component
-     """
-     n, t, c = latent_diff.shape
-     latent_diff = latent_diff.view(n * t, c).float()
-     latent_hat_uncond = latent_hat_uncond.view(n * t, c).float()
-
-     if latent_diff.size() != latent_hat_uncond.size():
-         raise ValueError("latent_diff and latent_hat_uncond must have the same shape [n, d].")
-
-     dot_product = torch.sum(latent_diff * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
-     norm_square = torch.sum(latent_hat_uncond * latent_hat_uncond, dim=1, keepdim=True)  # [n, 1]
-     projection = (dot_product / (norm_square + 1e-8)) * latent_hat_uncond
-     perpendicular_component = latent_diff - projection
-
-     return projection.view(n, t, c), perpendicular_component.reshape(n, t, c)
-
-
- def adg_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: torch.Tensor,
-     guidance_scale: float,
-     angle_clip: float = 3.14 / 6,  # pi/6 by default
-     apply_norm: bool = False,
-     apply_clip: bool = True,
- ):
-     """
-     ADG (Angle-based Dynamic Guidance) forward pass for Flow Matching.
-
-     In flow matching (including SD3), sigma represents the current timestep t_curr.
-     The predictions are velocity fields v(x_t, t).
-
-     Args:
-         latents: Current state x_t [N, T, d] where d=64
-         noise_pred_cond: Conditional velocity prediction v_cond [N, T, d]
-         noise_pred_uncond: Unconditional velocity prediction v_uncond [N, T, d]
-         sigma: Current timestep t_curr (not t_prev!)
-         guidance_scale: Guidance strength
-         angle_clip: Maximum angle for clipping (default: pi/6)
-         apply_norm: Whether to normalize the result (ADG_w_norm variant)
-         apply_clip: Whether to clip the angle (ADG_wo_clip when False)
-
-     Returns:
-         Guided velocity prediction [N, T, d]
-     """
-     # Get batch size
-     n = noise_pred_cond.shape[0]
-     noise_pred_text = noise_pred_cond
-     n, t, c = noise_pred_text.shape
-
-     # Ensure sigma/t has the right shape for broadcasting [N, 1, 1]
-     if isinstance(sigma, (int, float)):
-         sigma = torch.tensor(sigma, device=latents.device, dtype=latents.dtype)
-         sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
-     elif torch.is_tensor(sigma):
-         if sigma.numel() == 1:
-             sigma = sigma.view(1, 1, 1).expand(n, 1, 1)
-         elif sigma.numel() == n:
-             sigma = sigma.view(n, 1, 1)
-         else:
-             raise ValueError(f"sigma has incompatible shape. Expected scalar or size {n}, got {sigma.shape}")
-     else:
-         raise TypeError(f"sigma must be a number or tensor, got {type(sigma)}")
-
-     # Adjust guidance weight
-     weight = guidance_scale - 1
-     weight = weight * (weight > 0) + 1e-3
-
-     latent_hat_text = latents - sigma * noise_pred_text
-     latent_hat_uncond = latents - sigma * noise_pred_uncond
-     latent_diff = latent_hat_text - latent_hat_uncond
-
-     # Calculate angle between conditional and unconditional predicted data
-     latent_theta = torch.acos(
-         call_cos_tensor(latent_hat_text.view(-1, c).to(float),
-                         latent_hat_uncond.reshape(-1, c).contiguous().to(float)))
-     latent_theta_new = torch.clip(weight * latent_theta, -angle_clip, angle_clip) if apply_clip else weight * latent_theta
-     proj, perp = compute_perpendicular_component(latent_diff, latent_hat_uncond)
-     latent_v_new = torch.cos(latent_theta_new) * latent_hat_text
-
-     latent_p_new = perp * torch.sin(latent_theta_new) / torch.sin(latent_theta) * (
-         torch.sin(latent_theta) > 1e-3) + perp * weight * (torch.sin(latent_theta) <= 1e-3)
-     latent_new = latent_v_new + latent_p_new
-     if apply_norm:
-         latent_new = latent_new * torch.linalg.norm(latent_hat_text, dim=1, keepdim=True) / torch.linalg.norm(
-             latent_new, dim=1, keepdim=True)
-
-     noise_pred = (latents - latent_new) / sigma
-     noise_pred = noise_pred.reshape(n, t, c).to(latents.dtype)
-     return noise_pred
-
-
- def adg_w_norm_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: float,
-     guidance_scale: float,
-     angle_clip: float = 3.14 / 3,
- ):
-     """
-     ADG with normalization - preserves the magnitude of latent predictions.
-
-     This variant normalizes the final latent to maintain the same norm as the
-     conditional prediction, which can help preserve image quality.
-     """
-     return adg_forward(latents,
-                        noise_pred_cond,
-                        noise_pred_uncond,
-                        sigma,
-                        guidance_scale,
-                        angle_clip=angle_clip,
-                        apply_norm=True,
-                        apply_clip=True)
-
-
- def adg_wo_clip_forward(
-     latents: torch.Tensor,
-     noise_pred_cond: torch.Tensor,
-     noise_pred_uncond: torch.Tensor,
-     sigma: float,
-     guidance_scale: float,
- ):
-     """
-     ADG without angle clipping - allows unbounded angle adjustments.
-
-     This variant doesn't clip the angle, which may result in more aggressive
-     guidance but could be less stable.
-     """
-     return adg_forward(latents, noise_pred_cond, noise_pred_uncond, sigma, guidance_scale, apply_norm=False, apply_clip=False)
 
ace-step/acestep-v15-sft/config.json DELETED
@@ -1,81 +0,0 @@
- {
-   "architectures": [
-     "AceStepConditionGenerationModel"
-   ],
-   "auto_map": {
-     "AutoConfig": "configuration_acestep_v15.AceStepConfig",
-     "AutoModel": "modeling_acestep_v15_base.AceStepConditionGenerationModel"
-   },
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "audio_acoustic_hidden_dim": 64,
-   "data_proportion": 0.5,
-   "dtype": "bfloat16",
-   "fsq_dim": 2048,
-   "fsq_input_levels": [
-     8,
-     8,
-     8,
-     5,
-     5,
-     5
-   ],
-   "fsq_input_num_quantizers": 1,
-   "head_dim": 128,
-   "hidden_act": "silu",
-   "hidden_size": 2048,
-   "in_channels": 192,
-   "initializer_range": 0.02,
-   "intermediate_size": 6144,
-   "layer_types": [
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention"
-   ],
-   "max_position_embeddings": 32768,
-   "model_type": "acestep",
-   "num_attention_heads": 16,
-   "num_attention_pooler_hidden_layers": 2,
-   "num_audio_decoder_hidden_layers": 24,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 8,
-   "num_lyric_encoder_hidden_layers": 8,
-   "num_timbre_encoder_hidden_layers": 4,
-   "patch_size": 2,
-   "pool_window_size": 5,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 1000000,
-   "sliding_window": 128,
-   "text_hidden_dim": 1024,
-   "timbre_fix_frame": 750,
-   "timbre_hidden_dim": 64,
-   "timestep_mu": -0.4,
-   "timestep_sigma": 1.0,
-   "transformers_version": "4.57.0.dev0",
-   "use_cache": true,
-   "use_sliding_window": true,
-   "vocab_size": 64003,
-   "is_turbo": false
- }
 
ace-step/acestep-v15-sft/configuration_acestep_v15.py DELETED
@@ -1,263 +0,0 @@
- # coding=utf-8
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """AceStep model configuration"""
-
- from transformers.configuration_utils import PretrainedConfig, layer_type_validation
- from transformers.modeling_rope_utils import rope_config_validation
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
-
- class AceStepConfig(PretrainedConfig):
-     r"""
-     This is the configuration class to store the configuration of a [`AceStepModel`]. It is used to instantiate an
-     AceStep model according to the specified arguments, defining the model architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-     Args:
-         vocab_size (`int`, *optional*, defaults to 64003):
-             Vocabulary size of the AceStep model. Defines the number of different tokens that can be represented by the
-             `inputs_ids` passed when calling the model.
-         hidden_size (`int`, *optional*, defaults to 4096):
-             Dimension of the hidden representations.
-         intermediate_size (`int`, *optional*, defaults to 22016):
-             Dimension of the MLP representations.
-         num_hidden_layers (`int`, *optional*, defaults to 32):
-             Number of hidden layers in the Transformer encoder.
-         num_attention_heads (`int`, *optional*, defaults to 32):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         num_key_value_heads (`int`, *optional*, defaults to 32):
-             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-             by meanpooling all the original heads within that group. For more details, check out [this
-             paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
-         head_dim (`int`, *optional*, defaults to 128):
-             The attention head dimension.
-         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-             The non-linear activation function (function or string) in the decoder.
-         max_position_embeddings (`int`, *optional*, defaults to 32768):
-             The maximum sequence length that this model might ever be used with.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-             The epsilon used by the rms normalization layers.
-         use_cache (`bool`, *optional*, defaults to `True`):
-             Whether or not the model should return the last key/values attentions (not used by all models). Only
-             relevant if `config.is_decoder=True`.
-         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-             Whether the model's input and output word embeddings should be tied.
-         rope_theta (`float`, *optional*, defaults to 10000.0):
-             The base period of the RoPE embeddings.
-         rope_scaling (`Dict`, *optional*):
-             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
-             and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
-             accordingly.
-             Expected contents:
-                 `rope_type` (`str`):
-                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-                     'llama3'], with 'default' being the original RoPE implementation.
-                 `factor` (`float`, *optional*):
-                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
-                     most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                     original maximum pre-trained length.
-                 `original_max_position_embeddings` (`int`, *optional*):
-                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                     pretraining.
-                 `attention_factor` (`float`, *optional*):
-                     Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
-                     computation. If unspecified, it defaults to value recommended by the implementation, using the
-                     `factor` field to infer the suggested value.
-                 `beta_fast` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 32.
-                 `beta_slow` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 1.
-                 `short_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                     size divided by the number of attention heads divided by 2
-                 `long_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to long contexts (<
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                     size divided by the number of attention heads divided by 2
-                 `low_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
-                 `high_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
-         attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
-             Whether to use a bias in the query, key, value and output projection layers during self-attention.
-         use_sliding_window (`bool`, *optional*, defaults to `False`):
-             Whether to use sliding window attention.
-         sliding_window (`int`, *optional*, defaults to 4096):
-             Sliding window attention (SWA) window size. If not specified, will default to `4096`.
-         layer_types (`list`, *optional*):
-             Attention pattern for each layer.
-         attention_dropout (`float`, *optional*, defaults to 0.0):
-             The dropout ratio for the attention probabilities.
-
-     ```python
-     >>> from acestep.models import AceStepConfig
-
-     >>> # Initializing an AceStep configuration
-     >>> configuration = AceStepConfig()
-
-     >>> # Initializing a model from the configuration
-     >>> model = AceStepConditionGenerationModel(configuration)
-
-     >>> # Accessing the model configuration
-     >>> configuration = model.config
-     ```"""
-
-     model_type = "acestep"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     # Default tensor parallel plan for the base model
134
- base_model_tp_plan = {
135
- "layers.*.self_attn.q_proj": "colwise",
136
- "layers.*.self_attn.k_proj": "colwise",
137
- "layers.*.self_attn.v_proj": "colwise",
138
- "layers.*.self_attn.o_proj": "rowwise",
139
- "layers.*.mlp.gate_proj": "colwise",
140
- "layers.*.mlp.up_proj": "colwise",
141
- "layers.*.mlp.down_proj": "rowwise",
142
- }
143
- base_model_pp_plan = {
144
- "embed_tokens": (["input_ids"], ["inputs_embeds"]),
145
- "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
146
- "norm": (["hidden_states"], ["hidden_states"]),
147
- }
148
- def __init__(
149
- self,
150
- vocab_size=64003,
151
- fsq_dim=2048,
152
- fsq_input_levels=[8, 8, 8, 5, 5, 5],
153
- fsq_input_num_quantizers=1,
154
- hidden_size=2048,
155
- intermediate_size=6144,
156
- num_hidden_layers=24,
157
- num_attention_heads=16,
158
- num_key_value_heads=8,
159
- head_dim=128,
160
- hidden_act="silu",
161
- max_position_embeddings=32768,
162
- initializer_range=0.02,
163
- rms_norm_eps=1e-6,
164
- use_cache=True,
165
- tie_word_embeddings=True,
166
- rope_theta=1000000,
167
- rope_scaling=None,
168
- attention_bias=False,
169
- use_sliding_window=True,
170
- sliding_window=128,
171
- layer_types=None,
172
- attention_dropout=0.0,
173
- num_lyric_encoder_hidden_layers=8,
174
- audio_acoustic_hidden_dim=64,
175
- pool_window_size=5,
176
- text_hidden_dim=1024,
177
- in_channels=192,
178
- data_proportion=0.5,
179
- timestep_mu=-0.4,
180
- timestep_sigma=1.0,
181
- timbre_hidden_dim=64,
182
- num_timbre_encoder_hidden_layers=4,
183
- timbre_fix_frame=750,
184
- patch_size=2,
185
- num_attention_pooler_hidden_layers=2,
186
- num_audio_decoder_hidden_layers=24,
187
- model_version="turbo",
188
- **kwargs,
189
- ):
190
- self.max_position_embeddings = max_position_embeddings
191
- self.hidden_size = hidden_size
192
- self.intermediate_size = intermediate_size
193
- self.num_hidden_layers = num_hidden_layers
194
- self.num_attention_heads = num_attention_heads
195
- self.use_sliding_window = use_sliding_window
196
- self.sliding_window = sliding_window if self.use_sliding_window else None
197
-
198
- # Text encoder configuration
199
- self.text_hidden_dim = text_hidden_dim
200
-
201
- # Lyric encoder configuration
202
- self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
203
- self.patch_size = patch_size
204
-
205
- # Audio semantic token generation configuration
206
- self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
207
- self.pool_window_size = pool_window_size
208
- self.in_channels = in_channels
209
- self.data_proportion = data_proportion
210
- self.timestep_mu = timestep_mu
211
- self.timestep_sigma = timestep_sigma
212
-
213
- # FSQ (Finite Scalar Quantization) configuration
214
- self.fsq_dim = fsq_dim
215
- self.fsq_input_levels = fsq_input_levels
216
- self.fsq_input_num_quantizers = fsq_input_num_quantizers
217
-
218
- # Timbre encoder configuration
219
- self.timbre_hidden_dim = timbre_hidden_dim
220
- self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
221
- self.timbre_fix_frame = timbre_fix_frame
222
- self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
223
- self.num_audio_decoder_hidden_layers = num_audio_decoder_hidden_layers
224
- self.vocab_size = vocab_size
225
-
226
- # Backward compatibility: ensure num_key_value_heads is set
227
- if num_key_value_heads is None:
228
- num_key_value_heads = num_attention_heads
229
-
230
- self.num_key_value_heads = num_key_value_heads
231
- self.head_dim = head_dim
232
- self.hidden_act = hidden_act
233
- self.initializer_range = initializer_range
234
- self.rms_norm_eps = rms_norm_eps
235
- self.use_cache = use_cache
236
- self.rope_theta = rope_theta
237
- self.rope_scaling = rope_scaling
238
- self.attention_bias = attention_bias
239
- self.attention_dropout = attention_dropout
240
- self.model_version = model_version
241
-
242
- # Validate rotary position embeddings parameters
243
- # Backward compatibility: if there is a 'type' field, move it to 'rope_type'
244
- if self.rope_scaling is not None and "type" in self.rope_scaling:
245
- self.rope_scaling["rope_type"] = self.rope_scaling["type"]
246
- rope_config_validation(self)
247
-
248
- self.layer_types = layer_types
249
-
250
- # Set default layer types if not specified
251
- if self.layer_types is None:
252
- self.layer_types = [
253
- "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers)
254
- ]
255
- layer_type_validation(self.layer_types)
256
-
257
- super().__init__(
258
- tie_word_embeddings=tie_word_embeddings,
259
- **kwargs,
260
- )
261
-
262
-
263
- __all__ = ["AceStepConfig"]
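
A quick sanity check on the quantizer settings in this deleted config: the product of `fsq_input_levels` gives the FSQ codebook size, which lines up with `vocab_size` once a few extra tokens are set aside (reading the remainder as reserved/special tokens is an assumption, not something the file states). A minimal sketch in plain Python:

import math

# FSQ codebook size implied by the default fsq_input_levels above.
fsq_input_levels = [8, 8, 8, 5, 5, 5]
codebook_size = math.prod(fsq_input_levels)   # 8*8*8*5*5*5 = 64000

vocab_size = 64003                            # default in AceStepConfig
print(codebook_size)                          # 64000
print(vocab_size - codebook_size)             # 3 -- presumably reserved/special tokens
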
ace-step/acestep-v15-sft/modeling_acestep_v15_base.py DELETED
The diff for this file is too large to render. See raw diff
 
ace-step/acestep-v15-sft/silence_latent.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a778e9dd942f5e8b2c09c55370782d318834432b03dabbcdf70e6ed49ad6358b
- size 3841215
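
The three deleted lines above are not the tensor itself but a Git LFS pointer file; note that this sft `silence_latent.pt` and the turbo one further down carry the same `oid`, so they referenced identical content. A minimal sketch of reading such a pointer, assuming only the spec-v1 key/value layout shown:

def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS spec-v1 pointer file into a key/value dict."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = parse_lfs_pointer(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:a778e9dd942f5e8b2c09c55370782d318834432b03dabbcdf70e6ed49ad6358b\n"
    "size 3841215\n"
)
print(pointer["oid"], pointer["size"])  # the real tensor lives in LFS storage, not in git
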
 
 
 
 
ace-step/acestep-v15-turbo/config.json DELETED
@@ -1,82 +0,0 @@
- {
-   "architectures": [
-     "AceStepConditionGenerationModel"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "audio_acoustic_hidden_dim": 64,
-   "auto_map": {
-     "AutoConfig": "configuration_acestep_v15.AceStepConfig",
-     "AutoModel": "modeling_acestep_v15_turbo.AceStepConditionGenerationModel"
-   },
-   "data_proportion": 0.5,
-   "dtype": "bfloat16",
-   "fsq_dim": 2048,
-   "fsq_input_levels": [
-     8,
-     8,
-     8,
-     5,
-     5,
-     5
-   ],
-   "fsq_input_num_quantizers": 1,
-   "head_dim": 128,
-   "hidden_act": "silu",
-   "hidden_size": 2048,
-   "in_channels": 192,
-   "initializer_range": 0.02,
-   "intermediate_size": 6144,
-   "is_turbo": true,
-   "layer_types": [
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention"
-   ],
-   "max_position_embeddings": 32768,
-   "model_type": "acestep",
-   "model_version": "turbo",
-   "num_attention_heads": 16,
-   "num_attention_pooler_hidden_layers": 2,
-   "num_audio_decoder_hidden_layers": 24,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 8,
-   "num_lyric_encoder_hidden_layers": 8,
-   "num_timbre_encoder_hidden_layers": 4,
-   "patch_size": 2,
-   "pool_window_size": 5,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 1000000,
-   "sliding_window": 128,
-   "text_hidden_dim": 1024,
-   "timbre_fix_frame": 750,
-   "timbre_hidden_dim": 64,
-   "timestep_mu": -0.4,
-   "timestep_sigma": 1.0,
-   "transformers_version": "4.57.0.dev0",
-   "use_cache": true,
-   "use_sliding_window": true,
-   "vocab_size": 64003
- }
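
The 24-entry `layer_types` list serialized in this config is exactly the alternating pattern that `AceStepConfig.__init__` generates when `layer_types` is `None`; a short check in plain Python:

# Reproduce the serialized layer_types with the default rule from
# AceStepConfig.__init__: even 0-based layer indices use sliding-window attention.
num_hidden_layers = 24
layer_types = [
    "sliding_attention" if (i + 1) % 2 else "full_attention"
    for i in range(num_hidden_layers)
]
assert layer_types[0] == "sliding_attention"
assert layer_types.count("sliding_attention") == layer_types.count("full_attention") == 12
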
ace-step/acestep-v15-turbo/configuration_acestep_v15.py DELETED
@@ -1,263 +0,0 @@
- # coding=utf-8
- # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """AceStep model configuration"""
-
- from transformers.configuration_utils import PretrainedConfig, layer_type_validation
- from transformers.modeling_rope_utils import rope_config_validation
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
-
- class AceStepConfig(PretrainedConfig):
-     r"""
-     This is the configuration class to store the configuration of an [`AceStepModel`]. It is used to instantiate an
-     AceStep model according to the specified arguments, defining the model architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-     Args:
-         vocab_size (`int`, *optional*, defaults to 64003):
-             Vocabulary size of the AceStep model. Defines the number of different tokens that can be represented by
-             the `input_ids` passed when calling the model.
-         hidden_size (`int`, *optional*, defaults to 2048):
-             Dimension of the hidden representations.
-         intermediate_size (`int`, *optional*, defaults to 6144):
-             Dimension of the MLP representations.
-         num_hidden_layers (`int`, *optional*, defaults to 24):
-             Number of hidden layers in the Transformer encoder.
-         num_attention_heads (`int`, *optional*, defaults to 16):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         num_key_value_heads (`int`, *optional*, defaults to 8):
-             This is the number of key/value heads that should be used to implement Grouped Query Attention. If
-             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
-             `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
-             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
-             constructed by mean-pooling all the original heads within that group. For more details, check out [this
-             paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
-         head_dim (`int`, *optional*, defaults to 128):
-             The attention head dimension.
-         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-             The non-linear activation function (function or string) in the decoder.
-         max_position_embeddings (`int`, *optional*, defaults to 32768):
-             The maximum sequence length that this model might ever be used with.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated-normal initializer for initializing all weight matrices.
-         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-             The epsilon used by the RMS normalization layers.
-         use_cache (`bool`, *optional*, defaults to `True`):
-             Whether or not the model should return the last key/value attentions (not used by all models). Only
-             relevant if `config.is_decoder=True`.
-         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-             Whether the model's input and output word embeddings should be tied.
-         rope_theta (`float`, *optional*, defaults to 1000000.0):
-             The base period of the RoPE embeddings.
-         rope_scaling (`Dict`, *optional*):
-             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope
-             type and expect the model to work on a longer `max_position_embeddings`, we recommend updating this
-             value accordingly.
-             Expected contents:
-                 `rope_type` (`str`):
-                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn',
-                     'longrope', 'llama3'], with 'default' being the original RoPE implementation.
-                 `factor` (`float`, *optional*):
-                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
-                     In most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                     original maximum pre-trained length.
-                 `original_max_position_embeddings` (`int`, *optional*):
-                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                     pretraining.
-                 `attention_factor` (`float`, *optional*):
-                     Used with 'yarn' and 'longrope'. The scaling factor to be applied to the attention
-                     computation. If unspecified, it defaults to the value recommended by the implementation, using
-                     the `factor` field to infer the suggested value.
-                 `beta_fast` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 32.
-                 `beta_slow` (`float`, *optional*):
-                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                     ramp function. If unspecified, it defaults to 1.
-                 `short_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the
-                     hidden size divided by the number of attention heads divided by 2.
-                 `long_factor` (`list[float]`, *optional*):
-                     Only used with 'longrope'. The scaling factor to be applied to long contexts (>
-                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the
-                     hidden size divided by the number of attention heads divided by 2.
-                 `low_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to low-frequency components of the RoPE.
-                 `high_freq_factor` (`float`, *optional*):
-                     Only used with 'llama3'. Scaling factor applied to high-frequency components of the RoPE.
-         attention_bias (`bool`, *optional*, defaults to `False`):
-             Whether to use a bias in the query, key, value and output projection layers during self-attention.
-         use_sliding_window (`bool`, *optional*, defaults to `True`):
-             Whether to use sliding window attention.
-         sliding_window (`int`, *optional*, defaults to 128):
-             Sliding window attention (SWA) window size.
-         layer_types (`list`, *optional*):
-             Attention pattern for each layer. Defaults to alternating sliding and full attention.
-         attention_dropout (`float`, *optional*, defaults to 0.0):
-             The dropout ratio for the attention probabilities.
-
-     ```python
-     >>> from acestep.models import AceStepConfig, AceStepConditionGenerationModel
-
-     >>> # Initializing an AceStep configuration
-     >>> configuration = AceStepConfig()
-
-     >>> # Initializing a model from the configuration
-     >>> model = AceStepConditionGenerationModel(configuration)
-
-     >>> # Accessing the model configuration
-     >>> configuration = model.config
-     ```"""
-
-     model_type = "acestep"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     # Default tensor parallel plan for the base model
-     base_model_tp_plan = {
-         "layers.*.self_attn.q_proj": "colwise",
-         "layers.*.self_attn.k_proj": "colwise",
-         "layers.*.self_attn.v_proj": "colwise",
-         "layers.*.self_attn.o_proj": "rowwise",
-         "layers.*.mlp.gate_proj": "colwise",
-         "layers.*.mlp.up_proj": "colwise",
-         "layers.*.mlp.down_proj": "rowwise",
-     }
-     base_model_pp_plan = {
-         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-         "norm": (["hidden_states"], ["hidden_states"]),
-     }
-
-     def __init__(
-         self,
-         vocab_size=64003,
-         fsq_dim=2048,
-         fsq_input_levels=[8, 8, 8, 5, 5, 5],
-         fsq_input_num_quantizers=1,
-         hidden_size=2048,
-         intermediate_size=6144,
-         num_hidden_layers=24,
-         num_attention_heads=16,
-         num_key_value_heads=8,
-         head_dim=128,
-         hidden_act="silu",
-         max_position_embeddings=32768,
-         initializer_range=0.02,
-         rms_norm_eps=1e-6,
-         use_cache=True,
-         tie_word_embeddings=True,
-         rope_theta=1000000,
-         rope_scaling=None,
-         attention_bias=False,
-         use_sliding_window=True,
-         sliding_window=128,
-         layer_types=None,
-         attention_dropout=0.0,
-         num_lyric_encoder_hidden_layers=8,
-         audio_acoustic_hidden_dim=64,
-         pool_window_size=5,
-         text_hidden_dim=1024,
-         in_channels=192,
-         data_proportion=0.5,
-         timestep_mu=-0.4,
-         timestep_sigma=1.0,
-         timbre_hidden_dim=64,
-         num_timbre_encoder_hidden_layers=4,
-         timbre_fix_frame=750,
-         patch_size=2,
-         num_attention_pooler_hidden_layers=2,
-         num_audio_decoder_hidden_layers=24,
-         model_version="turbo",
-         **kwargs,
-     ):
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.use_sliding_window = use_sliding_window
-         self.sliding_window = sliding_window if self.use_sliding_window else None
-
-         # Text encoder configuration
-         self.text_hidden_dim = text_hidden_dim
-
-         # Lyric encoder configuration
-         self.num_lyric_encoder_hidden_layers = num_lyric_encoder_hidden_layers
-         self.patch_size = patch_size
-
-         # Audio semantic token generation configuration
-         self.audio_acoustic_hidden_dim = audio_acoustic_hidden_dim
-         self.pool_window_size = pool_window_size
-         self.in_channels = in_channels
-         self.data_proportion = data_proportion
-         self.timestep_mu = timestep_mu
-         self.timestep_sigma = timestep_sigma
-
-         # FSQ (Finite Scalar Quantization) configuration
-         self.fsq_dim = fsq_dim
-         self.fsq_input_levels = fsq_input_levels
-         self.fsq_input_num_quantizers = fsq_input_num_quantizers
-
-         # Timbre encoder configuration
-         self.timbre_hidden_dim = timbre_hidden_dim
-         self.num_timbre_encoder_hidden_layers = num_timbre_encoder_hidden_layers
-         self.timbre_fix_frame = timbre_fix_frame
-         self.num_attention_pooler_hidden_layers = num_attention_pooler_hidden_layers
-         self.num_audio_decoder_hidden_layers = num_audio_decoder_hidden_layers
-         self.vocab_size = vocab_size
-
-         # Backward compatibility: ensure num_key_value_heads is set
-         if num_key_value_heads is None:
-             num_key_value_heads = num_attention_heads
-
-         self.num_key_value_heads = num_key_value_heads
-         self.head_dim = head_dim
-         self.hidden_act = hidden_act
-         self.initializer_range = initializer_range
-         self.rms_norm_eps = rms_norm_eps
-         self.use_cache = use_cache
-         self.rope_theta = rope_theta
-         self.rope_scaling = rope_scaling
-         self.attention_bias = attention_bias
-         self.attention_dropout = attention_dropout
-         self.model_version = model_version
-
-         # Validate rotary position embedding parameters
-         # Backward compatibility: if there is a 'type' field, move it to 'rope_type'
-         if self.rope_scaling is not None and "type" in self.rope_scaling:
-             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
-         rope_config_validation(self)
-
-         self.layer_types = layer_types
-
-         # Set default layer types if not specified: layers at even 0-based indices use
-         # sliding-window attention, alternating with full attention
-         if self.layer_types is None:
-             self.layer_types = [
-                 "sliding_attention" if bool((i + 1) % 2) else "full_attention"
-                 for i in range(self.num_hidden_layers)
-             ]
-         layer_type_validation(self.layer_types)
-
-         super().__init__(
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
-
-
- __all__ = ["AceStepConfig"]
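
Because config.json maps `AutoConfig`/`AutoModel` onto these custom modules via `auto_map`, the checkpoint loads through the standard auto classes with `trust_remote_code=True`. A sketch, assuming the snapshot sits at the local path shown (swap in the Hub repo id when loading remotely):

from transformers import AutoConfig, AutoModel

path = "./ace-step/acestep-v15-turbo"  # local snapshot path (assumption)

# trust_remote_code=True lets transformers import configuration_acestep_v15.py
# and modeling_acestep_v15_turbo.py as referenced by "auto_map" in config.json.
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(path, trust_remote_code=True)
print(type(config).__name__, type(model).__name__)
# AceStepConfig AceStepConditionGenerationModel
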
ace-step/acestep-v15-turbo/modeling_acestep_v15_turbo.py DELETED
The diff for this file is too large to render. See raw diff
 
ace-step/acestep-v15-turbo/silence_latent.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a778e9dd942f5e8b2c09c55370782d318834432b03dabbcdf70e6ed49ad6358b
- size 3841215
 
 
 
 
ace-step/config.json DELETED
@@ -1,82 +0,0 @@
- {
-   "architectures": [
-     "AceStepConditionGenerationModel"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "audio_acoustic_hidden_dim": 64,
-   "auto_map": {
-     "AutoConfig": "configuration_acestep_v15.AceStepConfig",
-     "AutoModel": "modeling_acestep_v15_turbo.AceStepConditionGenerationModel"
-   },
-   "data_proportion": 0.5,
-   "dtype": "bfloat16",
-   "fsq_dim": 2048,
-   "fsq_input_levels": [
-     8,
-     8,
-     8,
-     5,
-     5,
-     5
-   ],
-   "fsq_input_num_quantizers": 1,
-   "head_dim": 128,
-   "hidden_act": "silu",
-   "hidden_size": 2048,
-   "in_channels": 192,
-   "initializer_range": 0.02,
-   "intermediate_size": 6144,
-   "is_turbo": true,
-   "layer_types": [
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention",
-     "sliding_attention",
-     "full_attention"
-   ],
-   "max_position_embeddings": 32768,
-   "model_type": "acestep",
-   "model_version": "turbo",
-   "num_attention_heads": 16,
-   "num_attention_pooler_hidden_layers": 2,
-   "num_audio_decoder_hidden_layers": 24,
-   "num_hidden_layers": 24,
-   "num_key_value_heads": 8,
-   "num_lyric_encoder_hidden_layers": 8,
-   "num_timbre_encoder_hidden_layers": 4,
-   "patch_size": 2,
-   "pool_window_size": 5,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 1000000,
-   "sliding_window": 128,
-   "text_hidden_dim": 1024,
-   "timbre_fix_frame": 750,
-   "timbre_hidden_dim": 64,
-   "timestep_mu": -0.4,
-   "timestep_sigma": 1.0,
-   "transformers_version": "4.57.0.dev0",
-   "use_cache": true,
-   "use_sliding_window": true,
-   "vocab_size": 64003
- }
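
`rope_scaling` is `null` in this root copy of the config, but `AceStepConfig.__init__` also accepts legacy dicts that use the old `"type"` key and renames it to `"rope_type"` before validation. A sketch with a hypothetical legacy payload (the values are illustrative, not from this repo):

# Hypothetical legacy rope_scaling payload using the deprecated "type" key.
rope_scaling = {"type": "yarn", "factor": 4.0}

# Same backward-compatibility shim as in AceStepConfig.__init__.
if rope_scaling is not None and "type" in rope_scaling:
    rope_scaling["rope_type"] = rope_scaling["type"]

print(rope_scaling["rope_type"])  # yarn -- now in the form rope_config_validation expects
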
ace-step/vae/config.json DELETED
@@ -1,24 +0,0 @@
- {
-   "_class_name": "AutoencoderOobleck",
-   "_diffusers_version": "0.34.0",
-   "_name_or_path": "/root/data/repo/gongjunmin/ACE-Step-1.5/checkpoints/vae/",
-   "audio_channels": 2,
-   "channel_multiples": [
-     1,
-     2,
-     4,
-     8,
-     16
-   ],
-   "decoder_channels": 128,
-   "decoder_input_channels": 64,
-   "downsampling_ratios": [
-     2,
-     4,
-     4,
-     6,
-     10
-   ],
-   "encoder_hidden_size": 128,
-   "sampling_rate": 48000
- }
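
The `downsampling_ratios` multiply out to the VAE's total hop per latent frame, which fixes the latent frame rate at the stated 48 kHz sampling rate. A sketch (the local weights path is an assumption, and `AutoencoderOobleck` requires a recent diffusers release):

import math
from diffusers import AutoencoderOobleck

downsampling_ratios = [2, 4, 4, 6, 10]     # from the config above
hop = math.prod(downsampling_ratios)       # 1920 audio samples per latent frame
print(48000 / hop)                         # 25.0 latent frames per second

vae = AutoencoderOobleck.from_pretrained("./ace-step/vae")  # local path assumed
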
ace-step/vae/diffusion_pytorch_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:da17edb604c40deaf09e9b24974e590d1ca83a374070e5d0884cfa4bed9a99b0
- size 337431388
 
 
 
 
depth/dpt-large/.no_exist/bc15f29aa3a80d532f2ed650b5e16ac48d8958f9/processor_config.json DELETED
File without changes
depth/dpt-large/refs/main DELETED
@@ -1 +0,0 @@
- bc15f29aa3a80d532f2ed650b5e16ac48d8958f9
 
 
depth/dpt-large/snapshots/bc15f29aa3a80d532f2ed650b5e16ac48d8958f9/config.json DELETED
@@ -1,47 +0,0 @@
- {
-   "architectures": [
-     "DPTForDepthEstimation"
-   ],
-   "attention_probs_dropout_prob": 0.0,
-   "auxiliary_loss_weight": 0.4,
-   "backbone_out_indices": [
-     5,
-     11,
-     17,
-     23
-   ],
-   "fusion_hidden_size": 256,
-   "head_in_index": -1,
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.0,
-   "hidden_size": 1024,
-   "image_size": 384,
-   "initializer_range": 0.02,
-   "intermediate_size": 4096,
-   "layer_norm_eps": 1e-12,
-   "model_type": "dpt",
-   "neck_hidden_sizes": [
-     256,
-     512,
-     1024,
-     1024
-   ],
-   "num_attention_heads": 16,
-   "num_channels": 3,
-   "num_hidden_layers": 24,
-   "patch_size": 16,
-   "qkv_bias": true,
-   "readout_type": "project",
-   "reassemble_factors": [
-     4,
-     2,
-     1,
-     0.5
-   ],
-   "semantic_classifier_dropout": 0.1,
-   "semantic_loss_ignore_index": 255,
-   "torch_dtype": "float32",
-   "transformers_version": "4.18.0.dev0",
-   "use_auxiliary_head": true,
-   "use_batch_norm_in_fusion_residual": false
- }
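
This snapshot directory uses the Hugging Face cache layout for what appears to be the Intel/dpt-large checkpoint (the repo id is inferred from the directory name; the config matches), pinned by `refs/main` to revision bc15f29…. A sketch of running depth estimation from that pinned revision, assuming transformers, torch, and Pillow are installed and an `example.jpg` exists:

import torch
from PIL import Image
from transformers import DPTForDepthEstimation, DPTImageProcessor

rev = "bc15f29aa3a80d532f2ed650b5e16ac48d8958f9"  # from refs/main above
processor = DPTImageProcessor.from_pretrained("Intel/dpt-large", revision=rev)
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large", revision=rev)

image = Image.open("example.jpg")                 # any RGB image (assumption)
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    depth = model(**inputs).predicted_depth       # (1, H', W') relative depth map
print(depth.shape)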