atMrMattV commited on
Commit
9933d98
·
verified ·
1 Parent(s): dde1697

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. models/ace-step/.gitattributes +38 -0
  2. models/ace-step/README.md +99 -0
  3. models/ace-step/acestep-5Hz-lm-1.7B/added_tokens.json +0 -0
  4. models/ace-step/acestep-5Hz-lm-1.7B/chat_template.jinja +89 -0
  5. models/ace-step/acestep-5Hz-lm-1.7B/config.json +61 -0
  6. models/ace-step/acestep-5Hz-lm-1.7B/merges.txt +0 -0
  7. models/ace-step/acestep-5Hz-lm-1.7B/special_tokens_map.json +0 -0
  8. models/ace-step/acestep-5Hz-lm-1.7B/vocab.json +0 -0
  9. models/ace-step/acestep-5Hz-lm-4B/added_tokens.json +0 -0
  10. models/ace-step/acestep-5Hz-lm-4B/config.json +69 -0
  11. models/ace-step/acestep-5Hz-lm-4B/merges.txt +0 -0
  12. models/ace-step/acestep-5Hz-lm-4B/model.safetensors.index.json +405 -0
  13. models/ace-step/config.json +82 -0
  14. models/dettaglio-restyle/styles.json +1352 -0
  15. models/hunyuan-foley/config_xl.yaml +48 -0
  16. models/kiwi-edit/.gitattributes +36 -0
  17. models/kiwi-edit/README.md +48 -0
  18. models/kiwi-edit/__init__.py +4 -0
  19. models/kiwi-edit/conditional_embedder.py +33 -0
  20. models/kiwi-edit/mllm_encoder.py +0 -0
  21. models/kiwi-edit/model_index.json +35 -0
  22. models/kiwi-edit/pipeline_kiwi_edit.py +510 -0
  23. models/kiwi-edit/wan_video_vae.py +1486 -0
  24. models/rife/.DS_Store +0 -0
  25. models/rife/._.DS_Store +0 -0
  26. models/rife/._IFNet_HDv3.cpython-311.pyc +0 -0
  27. models/rife/._IFNet_HDv3.py +0 -0
  28. models/rife/._RIFE_HDv3.cpython-311.pyc +0 -0
  29. models/rife/._RIFE_HDv3.py +0 -0
  30. models/rife/._RIFEv4.26_0921 +0 -0
  31. models/rife/.___pycache__ +0 -0
  32. models/rife/._flownet.pkl +0 -0
  33. models/rife/._refine.py +0 -0
  34. models/rife/IFNet_HDv3.py +169 -0
  35. models/rife/RIFE_HDv3.py +89 -0
  36. models/rife/refine.py +90 -0
  37. models/seedvr2/.validation_cache.json +12 -0
  38. models/seedvr2/config.json +0 -0
  39. models/voice-presets/bruce.wav +0 -0
  40. models/voice-presets/christian.wav +0 -0
  41. models/voice-presets/hal.wav +0 -0
  42. models/voice-presets/heath.wav +0 -0
  43. models/voice-presets/ian.wav +0 -0
  44. models/voice-presets/johnny.wav +0 -0
  45. models/voice-presets/patrick.wav +0 -0
  46. models/voice-presets/robert.wav +0 -0
  47. models/voice-presets/russel.wav +0 -0
  48. models/voice-presets/sean.wav +0 -0
  49. models/voice-presets/sigourney.wav +0 -0
  50. models/z-image-ControlNet-Union/config.json +39 -0
models/ace-step/.gitattributes ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ acestep-5Hz-lm-1.7B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ acestep-5Hz-lm-1.7B/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
38
+ Qwen3-Embedding-0.6B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
models/ace-step/README.md ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: mit
4
+ pipeline_tag: text-to-audio
5
+ tags:
6
+ - audio
7
+ - music
8
+ - text2music
9
+ ---
10
+
11
+ <h1 align="center">ACE-Step 1.5</h1>
12
+ <h1 align="center">Pushing the Boundaries of Open-Source Music Generation</h1>
13
+ <p align="center">
14
+ <a href="https://ace-step.github.io/ace-step-v1.5.github.io/">Project</a> |
15
+ <a href="https://huggingface.co/collections/ACE-Step/ace-step-15">Hugging Face</a> |
16
+ <a href="https://modelscope.cn/models/ACE-Step/Ace-Step1.5">ModelScope</a> |
17
+ <a href="https://huggingface.co/spaces/ACE-Step/Ace-Step-v1.5">Space Demo</a> |
18
+ <a href="https://discord.gg/PeWDxrkdj7">Discord</a>
19
+ <a href="https://arxiv.org/abs/2602.00744">Tech Report</a>
20
+ </p>
21
+
22
+
23
+ ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/b84r7t0viIw7rKSr_ja9_.png)
24
+
25
+ ## Model Details
26
+
27
+ 🚀 **ACE-Step v1.5** is a highly efficient open-source music foundation model designed to bring commercial-grade music generation to consumer hardware.
28
+
29
+ ### Key Features
30
+
31
+ * **💰 Commercial-Ready:** Unlike many models trained on ambiguous datasets, ACE-Step v1.5 is designed for creators. You can strictly use the generated music for **commercial purposes**.
32
+ * **📚 Safe & Robust Training Data:** The model is trained on a massive, legally compliant dataset consisting of:
33
+ * **Licensed Data:** Professionally licensed music tracks.
34
+ * **Royalty-Free / No-Copyright Data:** A vast collection of public domain and royalty-free music.
35
+ * **Synthetic Data:** High-quality audio generated via advanced MIDI-to-Audio conversion.
36
+ * **⚡ Extreme Speed:** Generates a full song in under 2 seconds on an A100 and under 10 seconds on an RTX 3090.
37
+ * **🖥️ Consumer Hardware Friendly:** Runs locally with less than 4GB of VRAM.
38
+
39
+ ### Technical Capabilities
40
+
41
+ 🌉 At its core lies a novel hybrid architecture where the Language Model (LM) functions as an omni-capable planner: it transforms simple user queries into comprehensive song blueprints—scaling from short loops to 10-minute compositions—while synthesizing metadata, lyrics, and captions via Chain-of-Thought to guide the Diffusion Transformer (DiT). ⚡ Uniquely, this alignment is achieved through intrinsic reinforcement learning relying solely on the model's internal mechanisms, thereby eliminating the biases inherent in external reward models or human preferences. 🎚️
42
+
43
+ 🔮 Beyond standard synthesis, ACE-Step v1.5 unifies precise stylistic control with versatile editing capabilities—such as cover generation, repainting, and vocal-to-BGM conversion—while maintaining strict adherence to prompts across 50+ languages. This paves the way for powerful tools that seamlessly integrate into the creative workflows of music artists, producers, and content creators. 🎸
44
+
45
+ - **Developed by:** [ACE-STEP]
46
+ - **Model type:** [Text2Music]
47
+ - **Language(s):** [50+ languages]
48
+ - **License:** [MIT]
49
+
50
+ ## Evaluation
51
+
52
+ ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/n9aKi_NhSmlMOgmGzahZi.png)
53
+
54
+ ## 🏗️ Architecture
55
+
56
+
57
+ ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/V_d1rTdqkQyoSM8td7OWl.png)
58
+
59
+
60
+ ## 🦁 Model Zoo
61
+
62
+
63
+ ![image](https://cdn-uploads.huggingface.co/production/uploads/62dfaf90c42558bcbd0a4f6f/B49V0OTKse_FRefTmTPsQ.png)
64
+
65
+ ### DiT Models
66
+
67
+ | DiT Model | Pre-Training | SFT | RL | CFG | Step | Refer audio | Text2Music | Cover | Repaint | Extract | Lego | Complete | Quality | Diversity | Fine-Tunability | Hugging Face |
68
+ |-----------|:------------:|:---:|:--:|:---:|:----:|:-----------:|:----------:|:-----:|:-------:|:-------:|:----:|:--------:|:-------:|:---------:|:---------------:|--------------|
69
+ | `acestep-v15-base` | ✅ | ❌ | ❌ | ✅ | 50 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | High | Easy | [Link](https://huggingface.co/ACE-Step/acestep-v15-base) |
70
+ | `acestep-v15-sft` | ✅ | ✅ | ❌ | ✅ | 50 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | High | Medium | Easy | [Link](https://huggingface.co/ACE-Step/acestep-v15-sft) |
71
+ | `acestep-v15-turbo` | ✅ | ✅ | ❌ | ❌ | 8 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | Very High | Medium | Medium | [Link](https://huggingface.co/ACE-Step/Ace-Step1.5) |
72
+ | `acestep-v15-turbo-rl` | ✅ | ✅ | ✅ | ❌ | 8 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | Very High | Medium | Medium | To be released |
73
+
74
+ ### LM Models
75
+
76
+ | LM Model | Pretrain from | Pre-Training | SFT | RL | CoT metas | Query rewrite | Audio Understanding | Composition Capability | Copy Melody | Hugging Face |
77
+ |----------|---------------|:------------:|:---:|:--:|:---------:|:-------------:|:-------------------:|:----------------------:|:-----------:|--------------|
78
+ | `acestep-5Hz-lm-0.6B` | Qwen3-0.6B | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | Medium | Weak | ✅ |
79
+ | `acestep-5Hz-lm-1.7B` | Qwen3-1.7B | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | Medium | Medium | ✅ |
80
+ | `acestep-5Hz-lm-4B` | Qwen3-4B | ✅ | ✅ | ✅ | ✅ | ✅ | Strong | Strong | Strong | ✅ |
81
+
82
+
83
+ ## 🙏 Acknowledgements
84
+
85
+ This project is co-led by ACE Studio and StepFun.
86
+
87
+
88
+ ## 📖 Citation
89
+
90
+ If you find this project useful for your research, please consider citing:
91
+
92
+ ```BibTeX
93
+ @misc{gong2026acestep,
94
+ title={ACE-Step 1.5: Pushing the Boundaries of Open-Source Music Generation},
95
+ author={Junmin Gong, Yulin Song, Wenxiao Zhao, Sen Wang, Shengyuan Xu, Jing Guo},
96
+ howpublished={\url{https://github.com/ace-step/ACE-Step-1.5}},
97
+ year={2026},
98
+ note={GitHub repository}
99
+ }
models/ace-step/acestep-5Hz-lm-1.7B/added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
models/ace-step/acestep-5Hz-lm-1.7B/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
models/ace-step/acestep-5Hz-lm-1.7B/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3Model"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 6144,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention"
44
+ ],
45
+ "max_position_embeddings": 40960,
46
+ "max_window_layers": 28,
47
+ "model_type": "qwen3",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 28,
50
+ "num_key_value_heads": 8,
51
+ "pad_token_id": 151643,
52
+ "rms_norm_eps": 1e-06,
53
+ "rope_scaling": null,
54
+ "rope_theta": 1000000,
55
+ "sliding_window": null,
56
+ "tie_word_embeddings": true,
57
+ "transformers_version": "4.57.0.dev0",
58
+ "use_cache": true,
59
+ "use_sliding_window": false,
60
+ "vocab_size": 217204
61
+ }
models/ace-step/acestep-5Hz-lm-1.7B/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/ace-step/acestep-5Hz-lm-1.7B/special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
models/ace-step/acestep-5Hz-lm-1.7B/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/ace-step/acestep-5Hz-lm-4B/added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
models/ace-step/acestep-5Hz-lm-4B/config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2560,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 9728,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention"
52
+ ],
53
+ "max_position_embeddings": 40960,
54
+ "max_window_layers": 36,
55
+ "model_type": "qwen3",
56
+ "num_attention_heads": 32,
57
+ "num_hidden_layers": 36,
58
+ "num_key_value_heads": 8,
59
+ "pad_token_id": 151643,
60
+ "rms_norm_eps": 1e-06,
61
+ "rope_scaling": null,
62
+ "rope_theta": 1000000,
63
+ "sliding_window": null,
64
+ "tie_word_embeddings": true,
65
+ "transformers_version": "4.57.1",
66
+ "use_cache": true,
67
+ "use_sliding_window": false,
68
+ "vocab_size": 217204
69
+ }
models/ace-step/acestep-5Hz-lm-4B/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/ace-step/acestep-5Hz-lm-4B/model.safetensors.index.json ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 8379108352
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
7
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
76
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
78
+ "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
100
+ "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
114
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
124
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
150
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
153
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
160
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
162
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
163
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
164
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
166
+ "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
172
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
174
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
175
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
176
+ "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
177
+ "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
180
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
181
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
182
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
184
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
186
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
188
+ "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
189
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
190
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
191
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
192
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
193
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
194
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
195
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
196
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
197
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
198
+ "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
199
+ "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
200
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
201
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
202
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
203
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
204
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
205
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
206
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
207
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
208
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
209
+ "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
210
+ "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
211
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
212
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
213
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
214
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
215
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
216
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
217
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
218
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
220
+ "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
221
+ "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
222
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
223
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
224
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
225
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
226
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
227
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
228
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
229
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
230
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
232
+ "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
233
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
234
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
235
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
236
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
237
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
238
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
239
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
240
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
241
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
242
+ "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
243
+ "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
244
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
245
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
246
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
247
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
248
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
249
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
250
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
251
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
252
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
253
+ "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
254
+ "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
255
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
256
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
257
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
258
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
259
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
260
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
261
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
262
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
263
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
264
+ "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
265
+ "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
266
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
267
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
268
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
269
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
270
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
271
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
272
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
273
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
274
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
276
+ "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
277
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
278
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
279
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
280
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
281
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
282
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
283
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
284
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
285
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
286
+ "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
287
+ "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
288
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
289
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
290
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
291
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
292
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
293
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
294
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
295
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
296
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
297
+ "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
298
+ "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
299
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
300
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
301
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
302
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
303
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
304
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
305
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
306
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
307
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
308
+ "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
309
+ "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
310
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
311
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
312
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
313
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
314
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
315
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
316
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
317
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
318
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
319
+ "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
320
+ "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
321
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
322
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
323
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
324
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
325
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
326
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
328
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
329
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
331
+ "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
333
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
334
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
337
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
338
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
339
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
340
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
341
+ "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
342
+ "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
343
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
344
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
345
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
346
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
347
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
348
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
349
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
350
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
351
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
352
+ "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
353
+ "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
354
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
355
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
356
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
357
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
358
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
359
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
360
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
361
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
362
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
363
+ "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
364
+ "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
365
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
366
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
367
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
368
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
369
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
370
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
371
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
372
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
373
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
374
+ "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
375
+ "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
376
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
377
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
378
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
379
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
380
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
381
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
382
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
383
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
384
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
385
+ "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
386
+ "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
387
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
388
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
389
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
390
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
391
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
392
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
393
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
394
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
395
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
396
+ "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
397
+ "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
398
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
399
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
400
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
401
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
402
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
403
+ "model.norm.weight": "model-00002-of-00002.safetensors"
404
+ }
405
+ }
models/ace-step/config.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "AceStepConditionGenerationModel"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "audio_acoustic_hidden_dim": 64,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_acestep_v15.AceStepConfig",
10
+ "AutoModel": "modeling_acestep_v15_turbo.AceStepConditionGenerationModel"
11
+ },
12
+ "data_proportion": 0.5,
13
+ "dtype": "bfloat16",
14
+ "fsq_dim": 2048,
15
+ "fsq_input_levels": [
16
+ 8,
17
+ 8,
18
+ 8,
19
+ 5,
20
+ 5,
21
+ 5
22
+ ],
23
+ "fsq_input_num_quantizers": 1,
24
+ "head_dim": 128,
25
+ "hidden_act": "silu",
26
+ "hidden_size": 2048,
27
+ "in_channels": 192,
28
+ "initializer_range": 0.02,
29
+ "intermediate_size": 6144,
30
+ "is_turbo": true,
31
+ "layer_types": [
32
+ "sliding_attention",
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "full_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "full_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "full_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "full_attention",
54
+ "sliding_attention",
55
+ "full_attention"
56
+ ],
57
+ "max_position_embeddings": 32768,
58
+ "model_type": "acestep",
59
+ "model_version": "turbo",
60
+ "num_attention_heads": 16,
61
+ "num_attention_pooler_hidden_layers": 2,
62
+ "num_audio_decoder_hidden_layers": 24,
63
+ "num_hidden_layers": 24,
64
+ "num_key_value_heads": 8,
65
+ "num_lyric_encoder_hidden_layers": 8,
66
+ "num_timbre_encoder_hidden_layers": 4,
67
+ "patch_size": 2,
68
+ "pool_window_size": 5,
69
+ "rms_norm_eps": 1e-06,
70
+ "rope_scaling": null,
71
+ "rope_theta": 1000000,
72
+ "sliding_window": 128,
73
+ "text_hidden_dim": 1024,
74
+ "timbre_fix_frame": 750,
75
+ "timbre_hidden_dim": 64,
76
+ "timestep_mu": -0.4,
77
+ "timestep_sigma": 1.0,
78
+ "transformers_version": "4.57.0.dev0",
79
+ "use_cache": true,
80
+ "use_sliding_window": true,
81
+ "vocab_size": 64003
82
+ }
models/dettaglio-restyle/styles.json ADDED
@@ -0,0 +1,1352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "style_description": "cinematic still, emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous, film grain, grainy",
4
+ "name": "Fooocus Sharp",
5
+ "thumbnail": "fooocus_sharp.webp"
6
+ },
7
+ {
8
+ "style_description": "(masterpiece), (best quality), (ultra-detailed), illustration, disheveled hair, detailed eyes, perfect composition, moist skin, intricate details, earrings, by wlop",
9
+ "name": "Fooocus Masterpiece",
10
+ "thumbnail": "fooocus_masterpiece.webp"
11
+ },
12
+ {
13
+ "style_description": "photograph, 50mm, cinematic 4k epic detailed 4k epic detailed photograph shot on kodak detailed cinematic hbo dark moody, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage",
14
+ "name": "Fooocus Photograph",
15
+ "thumbnail": "fooocus_photograph.webp"
16
+ },
17
+ {
18
+ "style_description": "cinematic still, emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
19
+ "name": "Fooocus Cinematic",
20
+ "thumbnail": "fooocus_cinematic.webp"
21
+ },
22
+ {
23
+ "style_description": "professional 3d model, octane render, highly detailed, volumetric, dramatic lighting",
24
+ "name": "sai-3d-model",
25
+ "thumbnail": "sai-3d-model.webp"
26
+ },
27
+ {
28
+ "style_description": "analog film photo, faded film, desaturated, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage",
29
+ "name": "sai-analog film",
30
+ "thumbnail": "sai-analog_film.webp"
31
+ },
32
+ {
33
+ "style_description": "anime artwork, anime style, key visual, vibrant, studio anime, highly detailed",
34
+ "name": "sai-anime",
35
+ "thumbnail": "sai-anime.webp"
36
+ },
37
+ {
38
+ "style_description": "cinematic film still, shallow depth of field, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
39
+ "name": "sai-cinematic",
40
+ "thumbnail": "sai-cinematic.webp"
41
+ },
42
+ {
43
+ "style_description": "comic, graphic illustration, comic art, graphic novel art, vibrant, highly detailed",
44
+ "name": "sai-comic book",
45
+ "thumbnail": "sai-comic_book.webp"
46
+ },
47
+ {
48
+ "style_description": "play-doh style, sculpture, clay art, centered composition, Claymation",
49
+ "name": "sai-craft clay",
50
+ "thumbnail": "sai-craft_clay.webp"
51
+ },
52
+ {
53
+ "style_description": "concept art, digital artwork, illustrative, painterly, matte painting, highly detailed",
54
+ "name": "sai-digital art",
55
+ "thumbnail": "sai-digital_art.webp"
56
+ },
57
+ {
58
+ "style_description": "breathtaking, award-winning, professional, highly detailed",
59
+ "name": "sai-enhance",
60
+ "thumbnail": "sai-enhance.webp"
61
+ },
62
+ {
63
+ "style_description": "ethereal fantasy concept art of , magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
64
+ "name": "sai-fantasy art",
65
+ "thumbnail": "sai-fantasy_art.webp"
66
+ },
67
+ {
68
+ "style_description": "isometric style, vibrant, beautiful, crisp, detailed, ultra detailed, intricate",
69
+ "name": "sai-isometric",
70
+ "thumbnail": "sai-isometric.webp"
71
+ },
72
+ {
73
+ "style_description": "line art drawing, professional, sleek, modern, minimalist, graphic, line art, vector graphics",
74
+ "name": "sai-line art",
75
+ "thumbnail": "sai-line_art.webp"
76
+ },
77
+ {
78
+ "style_description": "low-poly style, low-poly game art, polygon mesh, jagged, blocky, wireframe edges, centered composition",
79
+ "name": "sai-lowpoly",
80
+ "thumbnail": "sai-lowpoly.webp"
81
+ },
82
+ {
83
+ "style_description": "neonpunk style, cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
84
+ "name": "sai-neonpunk",
85
+ "thumbnail": "sai-neonpunk.webp"
86
+ },
87
+ {
88
+ "style_description": "origami style, paper art, pleated paper, folded, origami art, pleats, cut and fold, centered composition",
89
+ "name": "sai-origami",
90
+ "thumbnail": "sai-origami.webp"
91
+ },
92
+ {
93
+ "style_description": "cinematic photo, 35mm photograph, film, bokeh, professional, 4k, highly detailed",
94
+ "name": "sai-photographic",
95
+ "thumbnail": "sai-photographic.webp"
96
+ },
97
+ {
98
+ "style_description": "pixel-art, low-res, blocky, pixel art style, 8-bit graphics",
99
+ "name": "sai-pixel art",
100
+ "thumbnail": "sai-pixel_art.webp"
101
+ },
102
+ {
103
+ "style_description": "epic cinematic shot of dynamic in motion. main subject of high budget action movie. raw photo, motion blur. best quality, high resolution",
104
+ "name": "mre-cinematic-dynamic",
105
+ "thumbnail": "mre-cinematic-dynamic.webp"
106
+ },
107
+ {
108
+ "style_description": "spontaneous picture of, taken by talented amateur. best quality, high resolution. magical moment, natural look. simple but good looking",
109
+ "name": "mre-spontaneous-picture",
110
+ "thumbnail": "mre-spontaneous-picture.webp"
111
+ },
112
+ {
113
+ "style_description": "powerful artistic vision of, breathtaking masterpiece made by great artist. best quality, high resolution",
114
+ "name": "mre-artistic-vision",
115
+ "thumbnail": "mre-artistic-vision.webp"
116
+ },
117
+ {
118
+ "style_description": "dark and unsettling dream showing, best quality, high resolution. created by genius but depressed mad artist. grim beauty",
119
+ "name": "mre-dark-dream",
120
+ "thumbnail": "mre-dark-dream.webp"
121
+ },
122
+ {
123
+ "style_description": "astonishing gloomy art made mainly of shadows and lighting, forming, masterful usage of lighting, shadows and chiaroscuro. made by black-hearted artist, drawing from darkness. best quality, high resolution",
124
+ "name": "mre-gloomy-art",
125
+ "thumbnail": "mre-gloomy-art.webp"
126
+ },
127
+ {
128
+ "style_description": "picture from really bad dream about terrifying, true horror. bone-chilling vision. mad world that shouldn't exist. best quality, high resolution",
129
+ "name": "mre-bad-dream",
130
+ "thumbnail": "mre-bad-dream.webp"
131
+ },
132
+ {
133
+ "style_description": "uncanny caliginous vision of, created by remarkable underground artist. best quality, high resolution. raw and brutal art, careless but impressive style. inspired by darkness and chaos",
134
+ "name": "mre-underground",
135
+ "thumbnail": "mre-underground.webp"
136
+ },
137
+ {
138
+ "style_description": "surreal painting representing strange vision of, harmonious madness, synergy with chance. unique artstyle, mindbending art, magical surrealism. best quality, high resolution",
139
+ "name": "mre-surreal-painting",
140
+ "thumbnail": "mre-surreal-painting.webp"
141
+ },
142
+ {
143
+ "style_description": "insanely dynamic illustration of, best quality, high resolution. crazy artstyle, careless brushstrokes, emotional and fun",
144
+ "name": "mre-dynamic-illustration",
145
+ "thumbnail": "mre-dynamic-illustration.webp"
146
+ },
147
+ {
148
+ "style_description": "long forgotten art created by undead artist illustrating, tribute to the death and decay. miserable art of the damned. wretched and decaying world. best quality, high resolution",
149
+ "name": "mre-undead-art",
150
+ "thumbnail": "mre-undead-art.webp"
151
+ },
152
+ {
153
+ "style_description": "art illustrating insane amounts of raging elemental energy turning into, avatar of elements. magical surrealism, wizardry. best quality, high resolution",
154
+ "name": "mre-elemental-art",
155
+ "thumbnail": "mre-elemental-art.webp"
156
+ },
157
+ {
158
+ "style_description": "winner of inter-galactic art contest illustrating, symbol of the interstellar singularity. best quality, high resolution. artstyle previously unseen in the whole galaxy",
159
+ "name": "mre-space-art",
160
+ "thumbnail": "mre-space-art.webp"
161
+ },
162
+ {
163
+ "style_description": "sublime ancient illustration of, predating human civilization. crude and simple, but also surprisingly beautiful artwork, made by genius primeval artist. best quality, high resolution",
164
+ "name": "mre-ancient-illustration",
165
+ "thumbnail": "mre-ancient-illustration.webp"
166
+ },
167
+ {
168
+ "style_description": "brave, shocking, and brutally true art showing, inspired by courage and unlimited creativity. truth found in chaos. best quality, high resolution",
169
+ "name": "mre-brave-art",
170
+ "thumbnail": "mre-brave-art.webp"
171
+ },
172
+ {
173
+ "style_description": "heroic fantasy painting of, in the dangerous fantasy world. airbrush over oil on canvas. best quality, high resolution",
174
+ "name": "mre-heroic-fantasy",
175
+ "thumbnail": "mre-heroic-fantasy.webp"
176
+ },
177
+ {
178
+ "style_description": "dark cyberpunk illustration of brutal in a world without hope, ruled by ruthless criminal corporations. best quality, high resolution",
179
+ "name": "mre-dark-cyberpunk",
180
+ "thumbnail": "mre-dark-cyberpunk.webp"
181
+ },
182
+ {
183
+ "style_description": "geometric and lyrical abstraction painting presenting, oil on metal. best quality, high resolution",
184
+ "name": "mre-lyrical-geometry",
185
+ "thumbnail": "mre-lyrical-geometry.webp"
186
+ },
187
+ {
188
+ "style_description": "big long brushstrokes of deep black sumi-e turning into symbolic painting of, master level raw art. best quality, high resolution",
189
+ "name": "mre-sumi-e-symbolic",
190
+ "thumbnail": "mre-sumi-e-symbolic.webp"
191
+ },
192
+ {
193
+ "style_description": "highly detailed black sumi-e painting of, in-depth study of perfection, created by a master. best quality, high resolution",
194
+ "name": "mre-sumi-e-detailed",
195
+ "thumbnail": "mre-sumi-e-detailed.webp"
196
+ },
197
+ {
198
+ "style_description": "manga artwork presenting, created by japanese manga artist. highly emotional. best quality, high resolution",
199
+ "name": "mre-manga",
200
+ "thumbnail": "mre-manga.webp"
201
+ },
202
+ {
203
+ "style_description": "anime artwork illustrating, created by japanese anime studio. highly emotional. best quality, high resolution",
204
+ "name": "mre-anime",
205
+ "thumbnail": "mre-anime.webp"
206
+ },
207
+ {
208
+ "style_description": "breathtaking illustration from adult comic book presenting, fabulous artwork. best quality, high resolution",
209
+ "name": "mre-comic",
210
+ "thumbnail": "mre-comic.webp"
211
+ },
212
+ {
213
+ "style_description": "advertising poster style, Professional, modern, product-focused, commercial, eye-catching, highly detailed",
214
+ "name": "ads-advertising",
215
+ "thumbnail": "ads-advertising.webp"
216
+ },
217
+ {
218
+ "style_description": "automotive advertisement style, sleek, dynamic, professional, commercial, vehicle-focused, high-resolution, highly detailed",
219
+ "name": "ads-automotive",
220
+ "thumbnail": "ads-automotive.webp"
221
+ },
222
+ {
223
+ "style_description": "corporate branding style, professional, clean, modern, sleek, minimalist, business-oriented, highly detailed",
224
+ "name": "ads-corporate",
225
+ "thumbnail": "ads-corporate.webp"
226
+ },
227
+ {
228
+ "style_description": "fashion editorial style, high fashion, trendy, stylish, editorial, magazine style, professional, highly detailed",
229
+ "name": "ads-fashion editorial",
230
+ "thumbnail": "ads-fashion_editorial.webp"
231
+ },
232
+ {
233
+ "style_description": "food photography style, appetizing, professional, culinary, high-resolution, commercial, highly detailed",
234
+ "name": "ads-food photography",
235
+ "thumbnail": "ads-food_photography.webp"
236
+ },
237
+ {
238
+ "style_description": "gourmet food photo of, soft natural lighting, macro details, vibrant colors, fresh ingredients, glistening textures, bokeh background, styled plating, wooden tabletop, garnished, tantalizing, editorial quality",
239
+ "name": "ads-gourmet food photography",
240
+ "thumbnail": "ads-gourmet_food_photography.webp"
241
+ },
242
+ {
243
+ "style_description": "luxury product style, elegant, sophisticated, high-end, luxurious, professional, highly detailed",
244
+ "name": "ads-luxury",
245
+ "thumbnail": "ads-luxury.webp"
246
+ },
247
+ {
248
+ "style_description": "retail packaging style, vibrant, enticing, commercial, product-focused, eye-catching, professional, highly detailed",
249
+ "name": "ads-retail",
250
+ "thumbnail": "ads-retail.webp"
251
+ },
252
+ {
253
+ "style_description": "abstract style, non-representational, colors and shapes, expression of feelings, imaginative, highly detailed",
254
+ "name": "artstyle-abstract",
255
+ "thumbnail": "artstyle-abstract.webp"
256
+ },
257
+ {
258
+ "style_description": "abstract expressionist painting, energetic brushwork, bold colors, abstract forms, expressive, emotional",
259
+ "name": "artstyle-abstract expressionism",
260
+ "thumbnail": "artstyle-abstract_expressionism.webp"
261
+ },
262
+ {
263
+ "style_description": "art deco style, geometric shapes, bold colors, luxurious, elegant, decorative, symmetrical, ornate, detailed",
264
+ "name": "artstyle-art deco",
265
+ "thumbnail": "artstyle-art_deco.webp"
266
+ },
267
+ {
268
+ "style_description": "art nouveau style, elegant, decorative, curvilinear forms, nature-inspired, ornate, detailed",
269
+ "name": "artstyle-art nouveau",
270
+ "thumbnail": "artstyle-art_nouveau.webp"
271
+ },
272
+ {
273
+ "style_description": "constructivist style, geometric shapes, bold colors, dynamic composition, propaganda art style",
274
+ "name": "artstyle-constructivist",
275
+ "thumbnail": "artstyle-constructivist.webp"
276
+ },
277
+ {
278
+ "style_description": "cubist artwork, geometric shapes, abstract, innovative, revolutionary",
279
+ "name": "artstyle-cubist",
280
+ "thumbnail": "artstyle-cubist.webp"
281
+ },
282
+ {
283
+ "style_description": "expressionist, raw, emotional, dynamic, distortion for emotional effect, vibrant, use of unusual colors, detailed",
284
+ "name": "artstyle-expressionist",
285
+ "thumbnail": "artstyle-expressionist.webp"
286
+ },
287
+ {
288
+ "style_description": "graffiti style, street art, vibrant, urban, detailed, tag, mural",
289
+ "name": "artstyle-graffiti",
290
+ "thumbnail": "artstyle-graffiti.webp"
291
+ },
292
+ {
293
+ "style_description": "hyperrealistic art, extremely high-resolution details, photographic, realism pushed to extreme, fine texture, incredibly lifelike",
294
+ "name": "artstyle-hyperrealism",
295
+ "thumbnail": "artstyle-hyperrealism.webp"
296
+ },
297
+ {
298
+ "style_description": "impressionist painting, loose brushwork, vibrant color, light and shadow play, captures feeling over form",
299
+ "name": "artstyle-impressionist",
300
+ "thumbnail": "artstyle-impressionist.webp"
301
+ },
302
+ {
303
+ "style_description": "pointillism style, composed entirely of small, distinct dots of color, vibrant, highly detailed",
304
+ "name": "artstyle-pointillism",
305
+ "thumbnail": "artstyle-pointillism.webp"
306
+ },
307
+ {
308
+ "style_description": "pop Art style, bright colors, bold outlines, popular culture themes, ironic or kitsch",
309
+ "name": "artstyle-pop art",
310
+ "thumbnail": "artstyle-pop_art.webp"
311
+ },
312
+ {
313
+ "style_description": "psychedelic style, vibrant colors, swirling patterns, abstract forms, surreal, trippy",
314
+ "name": "artstyle-psychedelic",
315
+ "thumbnail": "artstyle-psychedelic.webp"
316
+ },
317
+ {
318
+ "style_description": "renaissance style, realistic, perspective, light and shadow, religious or mythological themes, highly detailed",
319
+ "name": "artstyle-renaissance",
320
+ "thumbnail": "artstyle-renaissance.webp"
321
+ },
322
+ {
323
+ "style_description": "steampunk style, antique, mechanical, brass and copper tones, gears, intricate, detailed",
324
+ "name": "artstyle-steampunk",
325
+ "thumbnail": "artstyle-steampunk.webp"
326
+ },
327
+ {
328
+ "style_description": "surrealist art, dreamlike, mysterious, provocative, symbolic, intricate, detailed",
329
+ "name": "artstyle-surrealist",
330
+ "thumbnail": "artstyle-surrealist.webp"
331
+ },
332
+ {
333
+ "style_description": "typographic art, stylized, intricate, detailed, artistic, text-based",
334
+ "name": "artstyle-typography",
335
+ "thumbnail": "artstyle-typography.webp"
336
+ },
337
+ {
338
+ "style_description": "watercolor painting, vibrant, beautiful, painterly, detailed, textural, artistic",
339
+ "name": "artstyle-watercolor",
340
+ "thumbnail": "artstyle-watercolor.webp"
341
+ },
342
+ {
343
+ "style_description": "biomechanical style, blend of organic and mechanical elements, futuristic, cybernetic, detailed, intricate",
344
+ "name": "futuristic-biomechanical",
345
+ "thumbnail": "futuristic-biomechanical.webp"
346
+ },
347
+ {
348
+ "style_description": "biomechanical cyberpunk, cybernetics, human-machine fusion, dystopian, organic meets artificial, dark, intricate, highly detailed",
349
+ "name": "futuristic-biomechanical cyberpunk",
350
+ "thumbnail": "futuristic-biomechanical_cyberpunk.webp"
351
+ },
352
+ {
353
+ "style_description": "cybernetic style, futuristic, technological, cybernetic enhancements, robotics, artificial intelligence themes",
354
+ "name": "futuristic-cybernetic",
355
+ "thumbnail": "futuristic-cybernetic.webp"
356
+ },
357
+ {
358
+ "style_description": "cybernetic robot, android, AI, machine, metal, wires, tech, futuristic, highly detailed",
359
+ "name": "futuristic-cybernetic robot",
360
+ "thumbnail": "futuristic-cybernetic_robot.webp"
361
+ },
362
+ {
363
+ "style_description": "cyberpunk cityscape, neon lights, dark alleys, skyscrapers, futuristic, vibrant colors, high contrast, highly detailed",
364
+ "name": "futuristic-cyberpunk cityscape",
365
+ "thumbnail": "futuristic-cyberpunk_cityscape.webp"
366
+ },
367
+ {
368
+ "style_description": "futuristic style, sleek, modern, ultramodern, high tech, detailed",
369
+ "name": "futuristic-futuristic",
370
+ "thumbnail": "futuristic-futuristic.webp"
371
+ },
372
+ {
373
+ "style_description": "retro cyberpunk, 80's inspired, synthwave, neon, vibrant, detailed, retro futurism",
374
+ "name": "futuristic-retro cyberpunk",
375
+ "thumbnail": "futuristic-retro_cyberpunk.webp"
376
+ },
377
+ {
378
+ "style_description": "retro-futuristic, vintage sci-fi, 50s and 60s style, atomic age, vibrant, highly detailed",
379
+ "name": "futuristic-retro futurism",
380
+ "thumbnail": "futuristic-retro_futurism.webp"
381
+ },
382
+ {
383
+ "style_description": "sci-fi style, futuristic, technological, alien worlds, space themes, advanced civilizations",
384
+ "name": "futuristic-sci-fi",
385
+ "thumbnail": "futuristic-sci-fi.webp"
386
+ },
387
+ {
388
+ "style_description": "vaporwave style, retro aesthetic, cyberpunk, vibrant, neon colors, vintage 80s and 90s style, highly detailed",
389
+ "name": "futuristic-vaporwave",
390
+ "thumbnail": "futuristic-vaporwave.webp"
391
+ },
392
+ {
393
+ "style_description": "Bubble Bobble style, 8-bit, cute, pixelated, fantasy, vibrant, reminiscent of Bubble Bobble game",
394
+ "name": "game-bubble bobble",
395
+ "thumbnail": "game-bubble_bobble.webp"
396
+ },
397
+ {
398
+ "style_description": "cyberpunk game style, neon, dystopian, futuristic, digital, vibrant, detailed, high contrast, reminiscent of cyberpunk genre video games",
399
+ "name": "game-cyberpunk game",
400
+ "thumbnail": "game-cyberpunk_game.webp"
401
+ },
402
+ {
403
+ "style_description": "fighting game style, dynamic, vibrant, action-packed, detailed character design, reminiscent of fighting video games",
404
+ "name": "game-fighting game",
405
+ "thumbnail": "game-fighting_game.webp"
406
+ },
407
+ {
408
+ "style_description": "GTA-style artwork, satirical, exaggerated, pop art style, vibrant colors, iconic characters, action-packed",
409
+ "name": "game-gta",
410
+ "thumbnail": "game-gta.webp"
411
+ },
412
+ {
413
+ "style_description": "Super Mario style, vibrant, cute, cartoony, fantasy, playful, reminiscent of Super Mario series",
414
+ "name": "game-mario",
415
+ "thumbnail": "game-mario.webp"
416
+ },
417
+ {
418
+ "style_description": "Minecraft style, blocky, pixelated, vibrant colors, recognizable characters and objects, game assets",
419
+ "name": "game-minecraft",
420
+ "thumbnail": "game-minecraft.webp"
421
+ },
422
+ {
423
+ "style_description": "Pokémon style, vibrant, cute, anime, fantasy, reminiscent of Pokémon series",
424
+ "name": "game-pokemon",
425
+ "thumbnail": "game-pokemon.webp"
426
+ },
427
+ {
428
+ "style_description": "retro arcade style, 8-bit, pixelated, vibrant, classic video game, old school gaming, reminiscent of 80s and 90s arcade games",
429
+ "name": "game-retro arcade",
430
+ "thumbnail": "game-retro_arcade.webp"
431
+ },
432
+ {
433
+ "style_description": "retro game art, 16-bit, vibrant colors, pixelated, nostalgic, charming, fun",
434
+ "name": "game-retro game",
435
+ "thumbnail": "game-retro_game.webp"
436
+ },
437
+ {
438
+ "style_description": "strategy game style, overhead view, detailed map, units, reminiscent of real-time strategy video games",
439
+ "name": "game-strategy game",
440
+ "thumbnail": "game-strategy_game.webp"
441
+ },
442
+ {
443
+ "style_description": "Street Fighter style, vibrant, dynamic, arcade, 2D fighting game, highly detailed, reminiscent of Street Fighter series",
444
+ "name": "game-streetfighter",
445
+ "thumbnail": "game-streetfighter.webp"
446
+ },
447
+ {
448
+ "style_description": "Legend of Zelda style, vibrant, fantasy, detailed, epic, heroic, reminiscent of The Legend of Zelda series",
449
+ "name": "game-zelda",
450
+ "thumbnail": "game-zelda.webp"
451
+ },
452
+ {
453
+ "style_description": "architectural style, clean lines, geometric shapes, minimalist, modern, architectural drawing, highly detailed",
454
+ "name": "misc-architectural",
455
+ "thumbnail": "misc-architectural.webp"
456
+ },
457
+ {
458
+ "style_description": "disco-themed, vibrant, groovy, retro 70s style, shiny disco balls, neon lights, dance floor, highly detailed",
459
+ "name": "misc-disco",
460
+ "thumbnail": "misc-disco.webp"
461
+ },
462
+ {
463
+ "style_description": "dreamscape, surreal, ethereal, dreamy, mysterious, fantasy, highly detailed",
464
+ "name": "misc-dreamscape",
465
+ "thumbnail": "misc-dreamscape.webp"
466
+ },
467
+ {
468
+ "style_description": "dystopian style, bleak, post-apocalyptic, somber, dramatic, highly detailed",
469
+ "name": "misc-dystopian",
470
+ "thumbnail": "misc-dystopian.webp"
471
+ },
472
+ {
473
+ "style_description": "fairy tale, magical, fantastical, enchanting, storybook style, highly detailed",
474
+ "name": "misc-fairy tale",
475
+ "thumbnail": "misc-fairy_tale.webp"
476
+ },
477
+ {
478
+ "style_description": "gothic style, dark, mysterious, haunting, dramatic, ornate, detailed",
479
+ "name": "misc-gothic",
480
+ "thumbnail": "misc-gothic.webp"
481
+ },
482
+ {
483
+ "style_description": "grunge style, textured, distressed, vintage, edgy, punk rock vibe, dirty, noisy",
484
+ "name": "misc-grunge",
485
+ "thumbnail": "misc-grunge.webp"
486
+ },
487
+ {
488
+ "style_description": "horror-themed, eerie, unsettling, dark, spooky, suspenseful, grim, highly detailed",
489
+ "name": "misc-horror",
490
+ "thumbnail": "misc-horror.webp"
491
+ },
492
+ {
493
+ "style_description": "kawaii style, cute, adorable, brightly colored, cheerful, anime influence, highly detailed",
494
+ "name": "misc-kawaii",
495
+ "thumbnail": "misc-kawaii.webp"
496
+ },
497
+ {
498
+ "style_description": "lovecraftian horror, eldritch, cosmic horror, unknown, mysterious, surreal, highly detailed",
499
+ "name": "misc-lovecraftian",
500
+ "thumbnail": "misc-lovecraftian.webp"
501
+ },
502
+ {
503
+ "style_description": "macabre style, dark, gothic, grim, haunting, highly detailed",
504
+ "name": "misc-macabre",
505
+ "thumbnail": "misc-macabre.webp"
506
+ },
507
+ {
508
+ "style_description": "manga style, vibrant, high-energy, detailed, iconic, Japanese comic style",
509
+ "name": "misc-manga",
510
+ "thumbnail": "misc-manga.webp"
511
+ },
512
+ {
513
+ "style_description": "metropolis-themed, urban, cityscape, skyscrapers, modern, futuristic, highly detailed",
514
+ "name": "misc-metropolis",
515
+ "thumbnail": "misc-metropolis.webp"
516
+ },
517
+ {
518
+ "style_description": "minimalist style, simple, clean, uncluttered, modern, elegant",
519
+ "name": "misc-minimalist",
520
+ "thumbnail": "misc-minimalist.webp"
521
+ },
522
+ {
523
+ "style_description": "monochrome, black and white, contrast, tone, texture, detailed",
524
+ "name": "misc-monochrome",
525
+ "thumbnail": "misc-monochrome.webp"
526
+ },
527
+ {
528
+ "style_description": "nautical-themed, sea, ocean, ships, maritime, beach, marine life, highly detailed",
529
+ "name": "misc-nautical",
530
+ "thumbnail": "misc-nautical.webp"
531
+ },
532
+ {
533
+ "style_description": "space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed",
534
+ "name": "misc-space",
535
+ "thumbnail": "misc-space.webp"
536
+ },
537
+ {
538
+ "style_description": "stained glass style, vibrant, beautiful, translucent, intricate, detailed",
539
+ "name": "misc-stained glass",
540
+ "thumbnail": "misc-stained_glass.webp"
541
+ },
542
+ {
543
+ "style_description": "techwear fashion, futuristic, cyberpunk, urban, tactical, sleek, dark, highly detailed",
544
+ "name": "misc-techwear fashion",
545
+ "thumbnail": "misc-techwear_fashion.webp"
546
+ },
547
+ {
548
+ "style_description": "tribal style, indigenous, ethnic, traditional patterns, bold, natural colors, highly detailed",
549
+ "name": "misc-tribal",
550
+ "thumbnail": "misc-tribal.webp"
551
+ },
552
+ {
553
+ "style_description": "zentangle, intricate, abstract, monochrome, patterns, meditative, highly detailed",
554
+ "name": "misc-zentangle",
555
+ "thumbnail": "misc-zentangle.webp"
556
+ },
557
+ {
558
+ "style_description": "collage style, mixed media, layered, textural, detailed, artistic",
559
+ "name": "papercraft-collage",
560
+ "thumbnail": "papercraft-collage.webp"
561
+ },
562
+ {
563
+ "style_description": "flat papercut style, silhouette, clean cuts, paper, sharp edges, minimalist, color block",
564
+ "name": "papercraft-flat papercut",
565
+ "thumbnail": "papercraft-flatpapercut.webp"
566
+ },
567
+ {
568
+ "style_description": "kirigami representation of, 3D, paper folding, paper cutting, Japanese, intricate, symmetrical, precision, clean lines",
569
+ "name": "papercraft-kirigami",
570
+ "thumbnail": "papercraft-kirigami.webp"
571
+ },
572
+ {
573
+ "style_description": "paper mache representation of, 3D, sculptural, textured, handmade, vibrant, fun",
574
+ "name": "papercraft-paper mache",
575
+ "thumbnail": "papercraft-paper_mache.webp"
576
+ },
577
+ {
578
+ "style_description": "paper quilling art of, intricate, delicate, curling, rolling, shaping, coiling, loops, 3D, dimensional, ornamental",
579
+ "name": "papercraft-paper quilling",
580
+ "thumbnail": "papercraft-paper_quilling.webp"
581
+ },
582
+ {
583
+ "style_description": "papercut collage of, mixed media, textured paper, overlapping, asymmetrical, abstract, vibrant",
584
+ "name": "papercraft-papercut collage",
585
+ "thumbnail": "papercraft-papercut_collage.webp"
586
+ },
587
+ {
588
+ "style_description": "3D papercut shadow box of, layered, dimensional, depth, silhouette, shadow, papercut, handmade, high contrast",
589
+ "name": "papercraft-papercut shadow box",
590
+ "thumbnail": "papercraft-papercut_shadow_box.webp"
591
+ },
592
+ {
593
+ "style_description": "stacked papercut art of, 3D, layered, dimensional, depth, precision cut, stacked layers, papercut, high contrast",
594
+ "name": "papercraft-stacked papercut",
595
+ "thumbnail": "papercraft-stacked_papercut.webp"
596
+ },
597
+ {
598
+ "style_description": "thick layered papercut art of, deep 3D, volumetric, dimensional, depth, thick paper, high stack, heavy texture, tangible layers",
599
+ "name": "papercraft-thick layered papercut",
600
+ "thumbnail": "papercraft-thick_layered_papercut.webp"
601
+ },
602
+ {
603
+ "style_description": "alien-themed, extraterrestrial, cosmic, otherworldly, mysterious, sci-fi, highly detailed",
604
+ "name": "photo-alien",
605
+ "thumbnail": "photo-alien.webp"
606
+ },
607
+ {
608
+ "style_description": "film noir style, monochrome, high contrast, dramatic shadows, 1940s style, mysterious, cinematic",
609
+ "name": "photo-film noir",
610
+ "thumbnail": "photo-film_noir.webp"
611
+ },
612
+ {
613
+ "style_description": "glamorous photo, high fashion, luxurious, extravagant, stylish, sensual, opulent, elegance, stunning beauty, professional, high contrast, detailed",
614
+ "name": "photo-glamour",
615
+ "thumbnail": "photo-glamour.webp"
616
+ },
617
+ {
618
+ "style_description": "HDR photo of, High dynamic range, vivid, rich details, clear shadows and highlights, realistic, intense, enhanced contrast, highly detailed",
619
+ "name": "photo-hdr",
620
+ "thumbnail": "photo-hdr.webp"
621
+ },
622
+ {
623
+ "style_description": "iphone photo, large depth of field, deep depth of field, highly detailed",
624
+ "name": "photo-iphone photographic",
625
+ "thumbnail": "photo-iphone_photographic.webp"
626
+ },
627
+ {
628
+ "style_description": "long exposure photo of, Blurred motion, streaks of light, surreal, dreamy, ghosting effect, highly detailed",
629
+ "name": "photo-long exposure",
630
+ "thumbnail": "photo-long_exposure.webp"
631
+ },
632
+ {
633
+ "style_description": "neon noir, cyberpunk, dark, rainy streets, neon signs, high contrast, low light, vibrant, highly detailed",
634
+ "name": "photo-neon noir",
635
+ "thumbnail": "photo-neon_noir.webp"
636
+ },
637
+ {
638
+ "style_description": "silhouette style, high contrast, minimalistic, black and white, stark, dramatic",
639
+ "name": "photo-silhouette",
640
+ "thumbnail": "photo-silhouette.webp"
641
+ },
642
+ {
643
+ "style_description": "tilt-shift photo of, selective focus, miniature effect, blurred background, highly detailed, vibrant, perspective control",
644
+ "name": "photo-tilt-shift",
645
+ "thumbnail": "photo-tilt-shift.webp"
646
+ },
647
+ {
648
+ "style_description": "UHD, 8K, ultra detailed, a cinematic photograph of, beautiful lighting, great composition",
649
+ "name": "cinematic-diva",
650
+ "thumbnail": "cinematic-diva.webp"
651
+ },
652
+ {
653
+ "style_description": "Abstract Expressionism Art, High contrast, minimalistic, colorful, stark, dramatic, expressionism",
654
+ "name": "Abstract Expressionism",
655
+ "thumbnail": "abstract_expressionism.webp"
656
+ },
657
+ {
658
+ "style_description": "Academia, preppy Ivy League style, stark, dramatic, chic boarding school, academia",
659
+ "name": "Academia",
660
+ "thumbnail": "academia.webp"
661
+ },
662
+ {
663
+ "style_description": "Action Figure, plastic collectable action figure, collectable toy action figure",
664
+ "name": "Action Figure",
665
+ "thumbnail": "action_figure.webp"
666
+ },
667
+ {
668
+ "style_description": "Adorable 3D Character, 3D render, adorable character, 3D art",
669
+ "name": "Adorable 3D Character",
670
+ "thumbnail": "adorable_3d_character.webp"
671
+ },
672
+ {
673
+ "style_description": "Adorable Kawaii, pretty, cute, adorable, kawaii",
674
+ "name": "Adorable Kawaii",
675
+ "thumbnail": "adorable_kawaii.webp"
676
+ },
677
+ {
678
+ "style_description": "Art Deco, sleek, geometric forms, art deco style",
679
+ "name": "Art Deco",
680
+ "thumbnail": "art_deco.webp"
681
+ },
682
+ {
683
+ "style_description": "Art Nouveau, beautiful art, sleek, organic forms, long, sinuous, art nouveau style",
684
+ "name": "Art Nouveau",
685
+ "thumbnail": "art_nouveau.webp"
686
+ },
687
+ {
688
+ "style_description": "Astral Aura, astral, colorful aura, vibrant energy",
689
+ "name": "Astral Aura",
690
+ "thumbnail": "astral_aura.webp"
691
+ },
692
+ {
693
+ "style_description": "Avant-garde, unusual, experimental, avant-garde art",
694
+ "name": "Avant-garde",
695
+ "thumbnail": "avant-garde.webp"
696
+ },
697
+ {
698
+ "style_description": "Baroque, dramatic, exuberant, grandeur, baroque art",
699
+ "name": "Baroque",
700
+ "thumbnail": "baroque.webp"
701
+ },
702
+ {
703
+ "style_description": "Bauhaus-Style Poster, simple geometric shapes, clean lines, primary colors, Bauhaus-Style Poster",
704
+ "name": "Bauhaus-Style Poster",
705
+ "thumbnail": "bauhaus-style_poster.webp"
706
+ },
707
+ {
708
+ "style_description": "Blueprint Schematic Drawing, technical drawing, blueprint, schematic",
709
+ "name": "Blueprint Schematic Drawing",
710
+ "thumbnail": "blueprint_schematic_drawing.webp"
711
+ },
712
+ {
713
+ "style_description": "Caricature, exaggerated, comical, caricature",
714
+ "name": "Caricature",
715
+ "thumbnail": "caricature.webp"
716
+ },
717
+ {
718
+ "style_description": "Cel Shaded Art, 2D, flat color, toon shading, cel shaded style",
719
+ "name": "Cel Shaded Art",
720
+ "thumbnail": "cel_shaded_art.webp"
721
+ },
722
+ {
723
+ "style_description": "Character Design Sheet, character reference sheet, character turn around",
724
+ "name": "Character Design Sheet",
725
+ "thumbnail": "character_design_sheet.webp"
726
+ },
727
+ {
728
+ "style_description": "Classicism Art, inspired by Roman and Greek culture, clarity, harmonious, classicism art",
729
+ "name": "Classicism Art",
730
+ "thumbnail": "classicism_art.webp"
731
+ },
732
+ {
733
+ "style_description": "Color Field Painting, abstract, simple, geometric, color field painting style",
734
+ "name": "Color Field Painting",
735
+ "thumbnail": "color_field_painting.webp"
736
+ },
737
+ {
738
+ "style_description": "Colored Pencil Art, colored pencil strokes, light color, visible paper texture, colored pencil art",
739
+ "name": "Colored Pencil Art",
740
+ "thumbnail": "colored_pencil_art.webp"
741
+ },
742
+ {
743
+ "style_description": "Conceptual Art, concept art",
744
+ "name": "Conceptual Art",
745
+ "thumbnail": "conceptual_art.webp"
746
+ },
747
+ {
748
+ "style_description": "Constructivism Art, minimalistic, geometric forms, constructivism art",
749
+ "name": "Constructivism",
750
+ "thumbnail": "constructivism.webp"
751
+ },
752
+ {
753
+ "style_description": "Cubism Art, flat geometric forms, cubism art",
754
+ "name": "Cubism",
755
+ "thumbnail": "cubism.webp"
756
+ },
757
+ {
758
+ "style_description": "Dadaism Art, satirical, nonsensical, dadaism art",
759
+ "name": "Dadaism",
760
+ "thumbnail": "dadaism.webp"
761
+ },
762
+ {
763
+ "style_description": "Dark Fantasy Art, dark, moody, dark fantasy style",
764
+ "name": "Dark Fantasy",
765
+ "thumbnail": "dark_fantasy.webp"
766
+ },
767
+ {
768
+ "style_description": "Dark Moody Atmosphere, dramatic, mysterious, dark moody atmosphere",
769
+ "name": "Dark Moody Atmosphere",
770
+ "thumbnail": "dark_moody_atmosphere.webp"
771
+ },
772
+ {
773
+ "style_description": "DMT Art Style, bright colors, surreal visuals, swirling patterns, DMT art style",
774
+ "name": "DMT Art Style",
775
+ "thumbnail": "dmt_art_style.webp"
776
+ },
777
+ {
778
+ "style_description": "Doodle Art Style, drawing, freeform, swirling patterns, doodle art style",
779
+ "name": "Doodle Art",
780
+ "thumbnail": "doodle_art.webp"
781
+ },
782
+ {
783
+ "style_description": "Double Exposure Style, double image ghost effect, image combination, double exposure style",
784
+ "name": "Double Exposure",
785
+ "thumbnail": "double_exposure.webp"
786
+ },
787
+ {
788
+ "style_description": "Dripping Paint Splatter Art, dramatic, paint drips, splatters, dripping paint",
789
+ "name": "Dripping Paint Splatter Art",
790
+ "thumbnail": "dripping_paint_splatter_art.webp"
791
+ },
792
+ {
793
+ "style_description": "Expressionism Art Style, movement, contrast, emotional, exaggerated forms, expressionism art style",
794
+ "name": "Expressionism",
795
+ "thumbnail": "expressionism.webp"
796
+ },
797
+ {
798
+ "style_description": "Faded Polaroid Photo, analog, old faded photo, old polaroid",
799
+ "name": "Faded Polaroid Photo",
800
+ "thumbnail": "faded_polaroid_photo.webp"
801
+ },
802
+ {
803
+ "style_description": "Fauvism Art, painterly, bold colors, textured brushwork, fauvism art",
804
+ "name": "Fauvism",
805
+ "thumbnail": "fauvism.webp"
806
+ },
807
+ {
808
+ "style_description": "Flat 2D Art, simple flat color, 2-dimensional, Flat 2D Art Style",
809
+ "name": "Flat 2D Art",
810
+ "thumbnail": "flat_2d_art.webp"
811
+ },
812
+ {
813
+ "style_description": "Fortnite Art Style, 3D cartoon, colorful, Fortnite Art Style",
814
+ "name": "Fortnite Art Style",
815
+ "thumbnail": "fortnite_art_style.webp"
816
+ },
817
+ {
818
+ "style_description": "Futurism Art Style, dynamic, dramatic, Futurism Art Style",
819
+ "name": "Futurism",
820
+ "thumbnail": "futurism.webp"
821
+ },
822
+ {
823
+ "style_description": "Glitchcore Art Style, dynamic, dramatic, distorted, vibrant colors, glitchcore art style",
824
+ "name": "Glitchcore",
825
+ "thumbnail": "glitchcore.webp"
826
+ },
827
+ {
828
+ "style_description": "Glo-fi Art Style, dynamic, dramatic, vibrant colors, glo-fi art style",
829
+ "name": "Glo-fi",
830
+ "thumbnail": "glo-fi.webp"
831
+ },
832
+ {
833
+ "style_description": "Googie Art Style, dynamic, dramatic, 1950's futurism, bold boomerang angles, Googie art style",
834
+ "name": "Googie Art Style",
835
+ "thumbnail": "googie_art_style.webp"
836
+ },
837
+ {
838
+ "style_description": "Graffiti Art Style, dynamic, dramatic, vibrant colors, graffiti art style",
839
+ "name": "Graffiti Art",
840
+ "thumbnail": "graffiti_art.webp"
841
+ },
842
+ {
843
+ "style_description": "Harlem Renaissance Art Style, dynamic, dramatic, 1920s African American culture, Harlem Renaissance art style",
844
+ "name": "Harlem Renaissance Art",
845
+ "thumbnail": "harlem_renaissance_art.webp"
846
+ },
847
+ {
848
+ "style_description": "High Fashion, dynamic, dramatic, haute couture, elegant, ornate clothing, High Fashion",
849
+ "name": "High Fashion",
850
+ "thumbnail": "high_fashion.webp"
851
+ },
852
+ {
853
+ "style_description": "Idyllic, peaceful, happy, pleasant, harmonious, picturesque, charming",
854
+ "name": "Idyllic",
855
+ "thumbnail": "idyllic.webp"
856
+ },
857
+ {
858
+ "style_description": "Impressionism, painterly, small brushstrokes, visible brushstrokes, impressionistic style",
859
+ "name": "Impressionism",
860
+ "thumbnail": "impressionism.webp"
861
+ },
862
+ {
863
+ "style_description": "Infographic Drawing, diagram, infographic",
864
+ "name": "Infographic Drawing",
865
+ "thumbnail": "infographic_drawing.webp"
866
+ },
867
+ {
868
+ "style_description": "Ink Dripping Drawing, ink drawing, dripping ink",
869
+ "name": "Ink Dripping Drawing",
870
+ "thumbnail": "ink_dripping_drawing.webp"
871
+ },
872
+ {
873
+ "style_description": "Japanese Ink Drawing, ink drawing, inkwash, Japanese Ink Drawing",
874
+ "name": "Japanese Ink Drawing",
875
+ "thumbnail": "japanese_ink_drawing.webp"
876
+ },
877
+ {
878
+ "style_description": "Knolling Photography, flat lay photography, object arrangement, knolling photography",
879
+ "name": "Knolling Photography",
880
+ "thumbnail": "knolling_photography.webp"
881
+ },
882
+ {
883
+ "style_description": "Light Cheery Atmosphere, happy, joyful, cheerful, carefree, gleeful, lighthearted, pleasant atmosphere",
884
+ "name": "Light Cheery Atmosphere",
885
+ "thumbnail": "light_cheery_atmosphere.webp"
886
+ },
887
+ {
888
+ "style_description": "Logo Design, dynamic graphic art, vector art, minimalist, professional logo design",
889
+ "name": "Logo Design",
890
+ "thumbnail": "logo_design.webp"
891
+ },
892
+ {
893
+ "style_description": "Luxurious Elegance, extravagant, ornate, designer, opulent, picturesque, lavish",
894
+ "name": "Luxurious Elegance",
895
+ "thumbnail": "luxurious_elegance.webp"
896
+ },
897
+ {
898
+ "style_description": "Macro Photography, close-up, macro 100mm, macro photography",
899
+ "name": "Macro Photography",
900
+ "thumbnail": "macro_photography.webp"
901
+ },
902
+ {
903
+ "style_description": "Mandala art style, complex, circular design, mandala",
904
+ "name": "Mandala Art",
905
+ "thumbnail": "mandala_art.webp"
906
+ },
907
+ {
908
+ "style_description": "Marker Drawing, bold marker lines, visible paper texture, marker drawing",
909
+ "name": "Marker Drawing",
910
+ "thumbnail": "marker_drawing.webp"
911
+ },
912
+ {
913
+ "style_description": "Medievalism, inspired by The Middle Ages, medieval art, elaborate patterns and decoration, Medievalism",
914
+ "name": "Medievalism",
915
+ "thumbnail": "medievalism.webp"
916
+ },
917
+ {
918
+ "style_description": "Minimalism, abstract, simple geometric shapes, hard edges, sleek contours, Minimalism",
919
+ "name": "Minimalism",
920
+ "thumbnail": "minimalism.webp"
921
+ },
922
+ {
923
+ "style_description": "Neo-Baroque, ornate and elaborate, dynamic, Neo-Baroque",
924
+ "name": "Neo-Baroque",
925
+ "thumbnail": "neo-baroque.webp"
926
+ },
927
+ {
928
+ "style_description": "Neo-Byzantine, grand decorative religious style, Orthodox Christian inspired, Neo-Byzantine",
929
+ "name": "Neo-Byzantine",
930
+ "thumbnail": "neo-byzantine.webp"
931
+ },
932
+ {
933
+ "style_description": "Neo-Futurism, high-tech, curves, spirals, flowing lines, idealistic future, Neo-Futurism",
934
+ "name": "Neo-Futurism",
935
+ "thumbnail": "neo-futurism.webp"
936
+ },
937
+ {
938
+ "style_description": "Neo-Impressionism, tiny dabs of color, Pointillism, painterly, Neo-Impressionism",
939
+ "name": "Neo-Impressionism",
940
+ "thumbnail": "neo-impressionism.webp"
941
+ },
942
+ {
943
+ "style_description": "Neo-Rococo, curved forms, naturalistic ornamentation, elaborate, decorative, gaudy, Neo-Rococo",
944
+ "name": "Neo-Rococo",
945
+ "thumbnail": "neo-rococo.webp"
946
+ },
947
+ {
948
+ "style_description": "Neoclassicism, ancient Rome and Greece inspired, idealized, sober colors, Neoclassicism",
949
+ "name": "Neoclassicism",
950
+ "thumbnail": "neoclassicism.webp"
951
+ },
952
+ {
953
+ "style_description": "Op Art, optical illusion, abstract, geometric pattern, impression of movement, Op Art",
954
+ "name": "Op Art",
955
+ "thumbnail": "op_art.webp"
956
+ },
957
+ {
958
+ "style_description": "Ornate and Intricate, decorative, highly detailed, elaborate, ornate, intricate",
959
+ "name": "Ornate and Intricate",
960
+ "thumbnail": "ornate_and_intricate.webp"
961
+ },
962
+ {
963
+ "style_description": "Pencil Sketch Drawing, black and white drawing, graphite drawing",
964
+ "name": "Pencil Sketch Drawing",
965
+ "thumbnail": "pencil_sketch_drawing.webp"
966
+ },
967
+ {
968
+ "style_description": "Pop Art, vivid colors, flat color, 2D, strong lines, Pop Art",
969
+ "name": "Pop Art 2",
970
+ "thumbnail": "pop_art_2.webp"
971
+ },
972
+ {
973
+ "style_description": "Rococo, flamboyant, pastel colors, curved lines, elaborate detail, Rococo",
974
+ "name": "Rococo",
975
+ "thumbnail": "rococo.webp"
976
+ },
977
+ {
978
+ "style_description": "Silhouette Art, high contrast, well defined, Silhouette Art",
979
+ "name": "Silhouette Art",
980
+ "thumbnail": "silhouette_art.webp"
981
+ },
982
+ {
983
+ "style_description": "Simple Vector Art, 2D flat, simple shapes, minimalistic, professional graphic, flat color, high contrast, Simple Vector Art",
984
+ "name": "Simple Vector Art",
985
+ "thumbnail": "simple_vector_art.webp"
986
+ },
987
+ {
988
+ "style_description": "Sketchup, CAD, professional design, Sketchup",
989
+ "name": "Sketchup",
990
+ "thumbnail": "sketchup.webp"
991
+ },
992
+ {
993
+ "style_description": "Steampunk, retrofuturistic science fantasy, steam-powered tech, vintage industry, gears, neo-victorian, steampunk",
994
+ "name": "Steampunk 2",
995
+ "thumbnail": "steampunk_2.webp"
996
+ },
997
+ {
998
+ "style_description": "Surrealism, expressive, dramatic, organic lines and forms, dreamlike and mysterious, Surrealism",
999
+ "name": "Surrealism",
1000
+ "thumbnail": "surrealism.webp"
1001
+ },
1002
+ {
1003
+ "style_description": "Suprematism, abstract, limited color palette, geometric forms, Suprematism",
1004
+ "name": "Suprematism",
1005
+ "thumbnail": "suprematism.webp"
1006
+ },
1007
+ {
1008
+ "style_description": "Terragen, beautiful massive landscape, epic scenery, Terragen",
1009
+ "name": "Terragen",
1010
+ "thumbnail": "terragen.webp"
1011
+ },
1012
+ {
1013
+ "style_description": "Tranquil Relaxing Atmosphere, calming style, soothing colors, peaceful, idyllic, Tranquil Relaxing Atmosphere",
1014
+ "name": "Tranquil Relaxing Atmosphere",
1015
+ "thumbnail": "tranquil_relaxing_atmosphere.webp"
1016
+ },
1017
+ {
1018
+ "style_description": "Vector Art Stickers, professional vector design, sticker designs, Sticker Sheet",
1019
+ "name": "Sticker Designs",
1020
+ "thumbnail": "sticker_designs.webp"
1021
+ },
1022
+ {
1023
+ "style_description": "Vibrant Rim Light, bright rim light, high contrast, bold edge light",
1024
+ "name": "Vibrant Rim Light",
1025
+ "thumbnail": "vibrant_rim_light.webp"
1026
+ },
1027
+ {
1028
+ "style_description": "Volumetric Lighting, light depth, dramatic atmospheric lighting, Volumetric Lighting",
1029
+ "name": "Volumetric Lighting",
1030
+ "thumbnail": "volumetric_lighting.webp"
1031
+ },
1032
+ {
1033
+ "style_description": "Watercolor style painting, visible paper texture, colorwash, watercolor",
1034
+ "name": "Watercolor 2",
1035
+ "thumbnail": "watercolor_2.webp"
1036
+ },
1037
+ {
1038
+ "style_description": "Whimsical and Playful, imaginative, fantastical, bright colors, stylized, happy, Whimsical and Playful",
1039
+ "name": "Whimsical and Playful",
1040
+ "thumbnail": "whimsical_and_playful.webp"
1041
+ },
1042
+ {
1043
+ "style_description": "Chromolithograph, Vibrant colors, intricate details, rich color saturation, meticulous registration, multi-layered printing, decorative elements, historical charm, artistic reproductions, commercial posters, nostalgic, ornate compositions.",
1044
+ "name": "MK Chromolithography",
1045
+ "thumbnail": "mk_chromolithography.webp"
1046
+ },
1047
+ {
1048
+ "style_description": "Cross processing print, Experimental color shifts, unconventional tonalities, vibrant and surreal hues, heightened contrasts, unpredictable results, artistic unpredictability, retro and vintage feel, dynamic color interplay, abstract and dreamlike.",
1049
+ "name": "MK Cross Processing Print",
1050
+ "thumbnail": "mk_cross_processing_print.webp"
1051
+ },
1052
+ {
1053
+ "style_description": "Dufaycolor photograph, Vintage color palette, distinctive color rendering, soft and dreamy atmosphere, historical charm, unique color process, grainy texture, evocative mood, nostalgic aesthetic, hand-tinted appearance, artistic patina.",
1054
+ "name": "MK Dufaycolor Photograph",
1055
+ "thumbnail": "mk_dufaycolor_photograph.webp"
1056
+ },
1057
+ {
1058
+ "style_description": "Herbarium drawing. Botanical accuracy, old botanical book illustration, detailed illustrations, pressed plants, delicate and precise linework, scientific documentation, meticulous presentation, educational purpose, organic compositions, timeless aesthetic, naturalistic beauty.",
1059
+ "name": "MK Herbarium",
1060
+ "thumbnail": "mk_herbarium.webp"
1061
+ },
1062
+ {
1063
+ "style_description": "punk collage style, mixed media, papercut, textured paper, overlapping, ripped posters, safety pins, chaotic layers, graffiti-style elements, anarchy symbols, vintage photos, cut-and-paste aesthetic, bold typography, distorted images, political messages, urban decay, distressed textures, newspaper clippings, spray paint, rebellious icons, DIY spirit, vivid colors, punk band logos, edgy and raw compositions",
1064
+ "name": "MK Punk Collage",
1065
+ "thumbnail": "mk_punk_collage.webp"
1066
+ },
1067
+ {
1068
+ "style_description": "mosaic style, fragmented, assembled, colorful, highly detailed",
1069
+ "name": "MK mosaic",
1070
+ "thumbnail": "mk_mosaic.webp"
1071
+ },
1072
+ {
1073
+ "style_description": "Oil painting by Van Gogh, Expressive, impasto, swirling brushwork, vibrant, brush strokes, Brushstroke-heavy, Textured, Impasto, Colorful, Dynamic, Bold, Distinctive, Vibrant, Whirling, Expressive, Dramatic, Swirling, Layered, Intense, Contrastive, Atmospheric, Luminous, Textural, Evocative, Spiraled, Van Gogh style",
1074
+ "name": "MK Van Gogh",
1075
+ "thumbnail": "mk_van_gogh.webp"
1076
+ },
1077
+ {
1078
+ "style_description": "centered black and white high contrast line drawing, coloring book style, monochrome, blank white background",
1079
+ "name": "MK Coloring Book",
1080
+ "thumbnail": "mk_coloring_book.webp"
1081
+ },
1082
+ {
1083
+ "style_description": "Oil painting by John Singer Sargent, Elegant, refined, masterful technique,realistic portrayal, subtle play of light, captivating expression, rich details, harmonious colors, skillful composition, brush strokes, chiaroscuro.",
1084
+ "name": "MK Singer Sargent",
1085
+ "thumbnail": "mk_singer_sargent.webp"
1086
+ },
1087
+ {
1088
+ "style_description": "Oil painting by Jackson Pollock, Abstract expressionism, drip painting, chaotic composition, energetic, spontaneous, unconventional technique, dynamic, bold, distinctive, vibrant, intense, expressive, energetic, layered, non-representational, gestural.",
1089
+ "name": "MK Pollock",
1090
+ "thumbnail": "mk_pollock.webp"
1091
+ },
1092
+ {
1093
+ "style_description": "Artwork by Jean-Michel Basquiat, Neo-expressionism, street art influence, graffiti-inspired, raw, energetic, bold colors, dynamic composition, chaotic, layered, textural, expressive, spontaneous, distinctive, symbolic,energetic brushstrokes.",
1094
+ "name": "MK Basquiat",
1095
+ "thumbnail": "mk_basquiat.webp"
1096
+ },
1097
+ {
1098
+ "style_description": "Artwork in the style of Andy Warhol, Pop art, vibrant colors, bold compositions, repetition of iconic imagery, celebrity culture, commercial aesthetics, mass production influence, stylized simplicity, cultural commentary, graphical elements, distinctive portraits.",
1099
+ "name": "MK Andy Warhol",
1100
+ "thumbnail": "mk_andy_warhol.webp"
1101
+ },
1102
+ {
1103
+ "style_description": "Halftone print of, Dot matrix pattern, grayscale tones, vintage aesthetic, newspaper print vibe, stylized dots, visual texture, black and white contrasts, retro appearance, artistic pointillism,pop culture, (Roy Lichtenstein style:1.5).",
1104
+ "name": "MK Halftone print",
1105
+ "thumbnail": "mk_halftone_print.webp"
1106
+ },
1107
+ {
1108
+ "style_description": "Gond painting, Intricate patterns, vibrant colors, detailed motifs, nature-inspired themes, tribal folklore, fine lines, intricate detailing, storytelling compositions, mystical and folkloric, cultural richness.",
1109
+ "name": "MK Gond Painting",
1110
+ "thumbnail": "mk_gond_painting.webp"
1111
+ },
1112
+ {
1113
+ "style_description": "Albumen print, Sepia tones, fine details, subtle tonal gradations, delicate highlights, vintage aesthetic, soft and muted atmosphere, historical charm, rich textures, meticulous craftsmanship, classic photographic technique, vignetting.",
1114
+ "name": "MK Albumen Print",
1115
+ "thumbnail": "mk_albumen_print.webp"
1116
+ },
1117
+ {
1118
+ "style_description": "Aquatint print, Soft tonal gradations, atmospheric effects, velvety textures, rich contrasts, fine details, etching process, delicate lines, nuanced shading, expressive and moody atmosphere, artistic depth.",
1119
+ "name": "MK Aquatint Print",
1120
+ "thumbnail": "mk_aquatint_print.webp"
1121
+ },
1122
+ {
1123
+ "style_description": "Anthotype print, Monochrome dye, soft and muted colors, organic textures, ephemeral and delicate appearance, low details, watercolor canvas, low contrast, overexposed, silhouette, textured paper.",
1124
+ "name": "MK Anthotype Print",
1125
+ "thumbnail": "mk_anthotype_print.webp"
1126
+ },
1127
+ {
1128
+ "style_description": "A sculpture made of ivory, made of, Sculptures, Inuit art style, intricate carvings, natural materials, storytelling motifs, arctic wildlife themes, symbolic representations, cultural traditions, earthy tones, harmonious compositions, spiritual and mythological elements.",
1129
+ "name": "MK Inuit Carving",
1130
+ "thumbnail": "mk_inuit_carving.webp"
1131
+ },
1132
+ {
1133
+ "style_description": "Bromoil print, Painterly effects, sepia tones, textured surfaces, rich contrasts, expressive brushwork, tonal variations, vintage aesthetic, atmospheric mood, handmade quality, artistic experimentation, darkroom craftsmanship, vignetting.",
1134
+ "name": "MK Bromoil Print",
1135
+ "thumbnail": "mk_bromoil_print.webp"
1136
+ },
1137
+ {
1138
+ "style_description": "Calotype print, Soft focus, subtle tonal range, paper negative process, fine details, vintage aesthetic, artistic experimentation, atmospheric mood, early photographic charm, handmade quality, vignetting.",
1139
+ "name": "MK Calotype Print",
1140
+ "thumbnail": "mk_calotype_print.webp"
1141
+ },
1142
+ {
1143
+ "style_description": "Color sketchnote, Hand-drawn elements, vibrant colors, visual hierarchy, playful illustrations, varied typography, graphic icons, organic and dynamic layout, personalized touches, creative expression, engaging storytelling.",
1144
+ "name": "MK Color Sketchnote",
1145
+ "thumbnail": "mk_color_sketchnote.webp"
1146
+ },
1147
+ {
1148
+ "style_description": "A sculpture made of blue pattern porcelain of, Classic design, blue and white color scheme, intricate detailing, floral motifs, onion-shaped elements, historical charm, rococo, white ware, cobalt blue, underglaze pattern, fine craftsmanship, traditional elegance, delicate patterns, vintage aesthetic, Meissen, Blue Onion pattern, Cibulak.",
1149
+ "name": "MK Cibulak Porcelain",
1150
+ "thumbnail": "mk_cibulak_porcelain.webp"
1151
+ },
1152
+ {
1153
+ "style_description": "Alcohol ink art, Fluid and vibrant colors, unpredictable patterns, organic textures, translucent layers, abstract compositions, ethereal and dreamy effects, free-flowing movement, expressive brushstrokes, contemporary aesthetic, wet textured paper.",
1154
+ "name": "MK Alcohol Ink Art",
1155
+ "thumbnail": "mk_alcohol_ink_art.webp"
1156
+ },
1157
+ {
1158
+ "style_description": "One line art, Continuous and unbroken black line, minimalistic, simplicity, economical use of space, flowing and dynamic, symbolic representations, contemporary aesthetic, evocative and abstract, white background.",
1159
+ "name": "MK One Line Art",
1160
+ "thumbnail": "mk_one_line_art.webp"
1161
+ },
1162
+ {
1163
+ "style_description": "Blacklight paint, Fluorescent pigments, vibrant and surreal colors, ethereal glow, otherworldly effects, dynamic and psychedelic compositions, neon aesthetics, transformative in ultraviolet light, contemporary and experimental.",
1164
+ "name": "MK Blacklight Paint",
1165
+ "thumbnail": "mk_blacklight_paint.webp"
1166
+ },
1167
+ {
1168
+ "style_description": "A sculpture made of Carnival glass, Iridescent surfaces, vibrant colors, intricate patterns, opalescent hues, reflective and prismatic effects, Art Nouveau and Art Deco influences, vintage charm, intricate detailing, lustrous and luminous appearance, Carnival Glass style.",
1169
+ "name": "MK Carnival Glass",
1170
+ "thumbnail": "mk_carnival_glass.webp"
1171
+ },
1172
+ {
1173
+ "style_description": "Cyanotype print, Prussian blue tones, distinctive coloration, high contrast, blueprint aesthetics, atmospheric mood, sun-exposed paper, silhouette effects, delicate details, historical charm, handmade and experimental quality.",
1174
+ "name": "MK Cyanotype Print",
1175
+ "thumbnail": "mk_cyanotype_print.webp"
1176
+ },
1177
+ {
1178
+ "style_description": "Cross-stitching, Intricate patterns, embroidery thread, sewing, fine details, precise stitches, textile artistry, symmetrical designs, varied color palette, traditional and contemporary motifs, handmade and crafted, canvas, nostalgic charm.",
1179
+ "name": "MK Cross-Stitching",
1180
+ "thumbnail": "mk_cross-stitching.webp"
1181
+ },
1182
+ {
1183
+ "style_description": "Encaustic paint, Textured surfaces, translucent layers, luminous quality, wax medium, rich color saturation, fluid and organic shapes, contemporary and historical influences, mixed media elements, atmospheric depth.",
1184
+ "name": "MK Encaustic Paint",
1185
+ "thumbnail": "mk_encaustic_paint.webp"
1186
+ },
1187
+ {
1188
+ "style_description": "Embroidery, Intricate stitching, embroidery thread, fine details, varied thread textures, textile artistry, embellished surfaces, diverse color palette, traditional and contemporary motifs, handmade and crafted, tactile and ornate.",
1189
+ "name": "MK Embroidery",
1190
+ "thumbnail": "mk_embroidery.webp"
1191
+ },
1192
+ {
1193
+ "style_description": "Gyotaku, Fish impressions, realistic details, ink rubbings, textured surfaces, traditional Japanese art form, nature-inspired compositions, artistic representation of marine life, black and white contrasts, cultural significance.",
1194
+ "name": "MK Gyotaku",
1195
+ "thumbnail": "mk_gyotaku.webp"
1196
+ },
1197
+ {
1198
+ "style_description": "Luminogram, Photogram technique, ethereal and abstract effects, light and shadow interplay, luminous quality, experimental process, direct light exposure, unique and unpredictable results, artistic experimentation.",
1199
+ "name": "MK Luminogram",
1200
+ "thumbnail": "mk_luminogram.webp"
1201
+ },
1202
+ {
1203
+ "style_description": "Lite Brite art, Luminous and colorful designs, pixelated compositions, retro aesthetic, glowing effects, creative patterns, interactive and playful, nostalgic charm, vibrant and dynamic arrangements.",
1204
+ "name": "MK Lite Brite Art",
1205
+ "thumbnail": "mk_lite_brite_art.webp"
1206
+ },
1207
+ {
1208
+ "style_description": "Mokume-gane, Wood-grain patterns, mixed metal layers, intricate and organic designs, traditional Japanese metalwork, harmonious color combinations, artisanal craftsmanship, unique and layered textures, cultural and historical significance.",
1209
+ "name": "MK Mokume-gane",
1210
+ "thumbnail": "mk_mokume-gane.webp"
1211
+ },
1212
+ {
1213
+ "style_description": "a sculpture made of pebbles, Pebble art style, natural materials, textured surfaces, balanced compositions, organic forms, harmonious arrangements, tactile and 3D effects, beach-inspired aesthetic, creative storytelling, artisanal craftsmanship.",
1214
+ "name": "Pebble Art",
1215
+ "thumbnail": "pebble_art.webp"
1216
+ },
1217
+ {
1218
+ "style_description": "Palekh art, Miniature paintings, intricate details, vivid colors, folkloric themes, lacquer finish, storytelling compositions, symbolic elements, Russian folklore influence, cultural and historical significance.",
1219
+ "name": "MK Palekh",
1220
+ "thumbnail": "mk_palekh.webp"
1221
+ },
1222
+ {
1223
+ "style_description": "Suminagashi, Floating ink patterns, marbled effects, delicate and ethereal designs, water-based ink, fluid and unpredictable compositions, meditative process, monochromatic or subtle color palette, Japanese artistic tradition.",
1224
+ "name": "MK Suminagashi",
1225
+ "thumbnail": "mk_suminagashi.webp"
1226
+ },
1227
+ {
1228
+ "style_description": "A Scrimshaw engraving of, Intricate engravings on a spermwhale's teeth, marine motifs, detailed scenes, nautical themes, black and white contrasts, historical craftsmanship, artisanal carving, storytelling compositions, maritime heritage.",
1229
+ "name": "MK Scrimshaw",
1230
+ "thumbnail": "mk_scrimshaw.webp"
1231
+ },
1232
+ {
1233
+ "style_description": "Shibori, Textured fabric, intricate patterns, resist-dyeing technique, indigo or vibrant colors, organic and flowing designs, Japanese textile art, cultural tradition, tactile and visual interest.",
1234
+ "name": "MK Shibori",
1235
+ "thumbnail": "mk_shibori.webp"
1236
+ },
1237
+ {
1238
+ "style_description": "A sculpture made of Vitreous enamel, Smooth and glossy surfaces, vibrant colors, glass-like finish, durable and resilient, intricate detailing, traditional and contemporary applications, artistic craftsmanship, jewelry and decorative objects, Vitreous enamel, colored glass.",
1239
+ "name": "MK Vitreous Enamel",
1240
+ "thumbnail": "mk_vitreous_enamel.webp"
1241
+ },
1242
+ {
1243
+ "style_description": "Ukiyo-e, Woodblock prints, vibrant colors, intricate details, depictions of landscapes, kabuki actors, beautiful women, cultural scenes, traditional Japanese art, artistic craftsmanship, historical significance.",
1244
+ "name": "MK Ukiyo-e",
1245
+ "thumbnail": "mk_ukiyo-e.webp"
1246
+ },
1247
+ {
1248
+ "style_description": "vintage airline poster, classic aviation fonts, pastel colors, elegant aircraft illustrations, scenic destinations, distressed textures, retro travel allure",
1249
+ "name": "MK vintage-airline-poster",
1250
+ "thumbnail": "mk_vintage-airline-poster.webp"
1251
+ },
1252
+ {
1253
+ "style_description": "vintage travel poster, retro fonts, muted colors, scenic illustrations, iconic landmarks, distressed textures, nostalgic vibes",
1254
+ "name": "MK vintage-travel-poster",
1255
+ "thumbnail": "mk_vintage-travel-poster.webp"
1256
+ },
1257
+ {
1258
+ "style_description": "Bauhaus-inspired, minimalism, geometric precision, primary colors, sans-serif typography, asymmetry, functional design",
1259
+ "name": "MK bauhaus-style",
1260
+ "thumbnail": "mk_bauhaus-style.webp"
1261
+ },
1262
+ {
1263
+ "style_description": "Afrofuturism illustration, vibrant colors, futuristic elements, cultural symbolism, cosmic imagery, dynamic patterns, empowering narratives",
1264
+ "name": "MK afrofuturism",
1265
+ "thumbnail": "mk_afrofuturism.webp"
1266
+ },
1267
+ {
1268
+ "style_description": "Atompunk illustration, retro-futuristic, atomic age aesthetics, sleek lines, metallic textures, futuristic technology, optimism, energy",
1269
+ "name": "MK atompunk",
1270
+ "thumbnail": "mk_atompunk.webp"
1271
+ },
1272
+ {
1273
+ "style_description": "Constructivism, geometric abstraction, bold colors, industrial aesthetics, dynamic compositions, utilitarian design, revolutionary spirit",
1274
+ "name": "MK constructivism",
1275
+ "thumbnail": "mk_constructivism.webp"
1276
+ },
1277
+ {
1278
+ "style_description": "Chicano art, bold colors, cultural symbolism, muralism, lowrider aesthetics, barrio life, political messages, social activism, Mexico",
1279
+ "name": "MK chicano-art",
1280
+ "thumbnail": "mk_chicano-art.webp"
1281
+ },
1282
+ {
1283
+ "style_description": "De Stijl Art, neoplasticism, primary colors, geometric abstraction, horizontal and vertical lines, simplicity, harmony, utopian ideals",
1284
+ "name": "MK de-stijl",
1285
+ "thumbnail": "mk_de-stijl.webp"
1286
+ },
1287
+ {
1288
+ "style_description": "Dayak art sculpture of, intricate patterns, nature-inspired motifs, vibrant colors, traditional craftsmanship, cultural symbolism, storytelling",
1289
+ "name": "MK dayak-art",
1290
+ "thumbnail": "mk_dayak-art.webp"
1291
+ },
1292
+ {
1293
+ "style_description": "Fayum portrait, encaustic painting, realistic facial features, warm earth tones, serene expressions, ancient Egyptian influences",
1294
+ "name": "MK fayum-portrait",
1295
+ "thumbnail": "mk_fayum-portrait.webp"
1296
+ },
1297
+ {
1298
+ "style_description": "Illuminated manuscript, intricate calligraphy, rich colors, detailed illustrations, gold leaf accents, ornate borders, religious, historical, medieval",
1299
+ "name": "MK illuminated-manuscript",
1300
+ "thumbnail": "mk_illuminated-manuscript.webp"
1301
+ },
1302
+ {
1303
+ "style_description": "Kalighat painting, bold lines, vibrant colors, narrative storytelling, cultural motifs, flat compositions, expressive characters",
1304
+ "name": "MK kalighat-painting",
1305
+ "thumbnail": "mk_kalighat-painting.webp"
1306
+ },
1307
+ {
1308
+ "style_description": "Madhubani painting, intricate patterns, vibrant colors, nature-inspired motifs, cultural storytelling, symmetry, folk art aesthetics",
1309
+ "name": "MK madhubani-painting",
1310
+ "thumbnail": "mk_madhubani-painting.webp"
1311
+ },
1312
+ {
1313
+ "style_description": "Pictorialism illustration, soft focus, atmospheric effects, artistic interpretation, tonality, muted colors, evocative storytelling",
1314
+ "name": "MK pictorialism",
1315
+ "thumbnail": "mk_pictorialism.webp"
1316
+ },
1317
+ {
1318
+ "style_description": "Pichwai painting, intricate detailing, vibrant colors, religious themes, nature motifs, devotional storytelling, gold leaf accents",
1319
+ "name": "MK pichwai-painting",
1320
+ "thumbnail": "mk_pichwai-painting.webp"
1321
+ },
1322
+ {
1323
+ "style_description": "Patachitra painting, bold outlines, vibrant colors, intricate detailing, mythological themes, storytelling, traditional craftsmanship",
1324
+ "name": "MK patachitra-painting",
1325
+ "thumbnail": "mk_patachitra-painting.webp"
1326
+ },
1327
+ {
1328
+ "style_description": "Samoan art-inspired wooden sculpture, traditional motifs, natural elements, bold colors, cultural symbolism, storytelling, craftsmanship",
1329
+ "name": "MK samoan-art-inspired",
1330
+ "thumbnail": "mk_samoan-art-inspired.webp"
1331
+ },
1332
+ {
1333
+ "style_description": "Tlingit art, formline design, natural elements, animal motifs, bold colors, cultural storytelling, traditional craftsmanship, Alaska traditional art, (totem:1.5)",
1334
+ "name": "MK tlingit-art",
1335
+ "thumbnail": "mk_tlingit-art.webp"
1336
+ },
1337
+ {
1338
+ "style_description": "Painting by Adnate, realistic portraits, street art, large-scale murals, subdued color palette, social narratives",
1339
+ "name": "MK adnate-style",
1340
+ "thumbnail": "mk_adnate-style.webp"
1341
+ },
1342
+ {
1343
+ "style_description": "Painting by Ron English, pop-surrealism, cultural subversion, iconic mash-ups, vibrant and bold colors, satirical commentary",
1344
+ "name": "MK ron-english-style",
1345
+ "thumbnail": "mk_ron-english-style.webp"
1346
+ },
1347
+ {
1348
+ "style_description": "Painting by Shepard Fairey, street art, political activism, iconic stencils, bold typography, high contrast, red, black, and white color palette",
1349
+ "name": "MK shepard-fairey-style",
1350
+ "thumbnail": "mk_shepard-fairey-style.webp"
1351
+ }
1352
+ ]
models/hunyuan-foley/config_xl.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_config:
2
+ model_name: HunyuanVideo-Foley-XL
3
+ model_type: 1d
4
+ model_precision: bf16
5
+ model_kwargs:
6
+ depth_triple_blocks: 12
7
+ depth_single_blocks: 24
8
+ hidden_size: 1408
9
+ num_heads: 11
10
+ mlp_ratio: 4
11
+ mlp_act_type: "gelu_tanh"
12
+ qkv_bias: True
13
+ qk_norm: True
14
+ qk_norm_type: "rms"
15
+ attn_mode: "torch"
16
+ embedder_type: "default"
17
+ interleaved_audio_visual_rope: True
18
+ enable_learnable_empty_visual_feat: True
19
+ sync_modulation: False
20
+ add_sync_feat_to_audio: True
21
+ cross_attention: True
22
+ use_attention_mask: False
23
+ condition_projection: "linear"
24
+ sync_feat_dim: 768 # syncformer 768 dim
25
+ condition_dim: 768 # clap 768 text condition dim (clip-text)
26
+ clip_dim: 768 # siglip2 visual dim
27
+ audio_vae_latent_dim: 128
28
+ audio_frame_rate: 50
29
+ patch_size: 1
30
+ rope_dim_list: null
31
+ rope_theta: 10000
32
+ text_length: 77
33
+ clip_length: 64
34
+ sync_length: 192
35
+ depth_triple_ssl_encoder: null
36
+ depth_single_ssl_encoder: 8
37
+ use_repa_with_audiossl: True
38
+
39
+ diffusion_config:
40
+ denoise_type: "flow"
41
+ flow_path_type: "linear"
42
+ flow_predict_type: "velocity"
43
+ flow_reverse: True
44
+ flow_solver: "euler"
45
+ sample_flow_shift: 1.0
46
+ sample_use_flux_shift: False
47
+ flux_base_shift: 0.5
48
+ flux_max_shift: 1.15
models/kiwi-edit/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ processor/tokenizer.json filter=lfs diff=lfs merge=lfs -text
models/kiwi-edit/README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: diffusers
3
+ pipeline_tag: image-to-video
4
+ ---
5
+
6
+ # Kiwi-Edit: Versatile Video Editing via Instruction and Reference Guidance
7
+
8
+ Kiwi-Edit is a versatile video editing framework built on an MLLM encoder and a video Diffusion Transformer (DiT). It supports both instruction-based video editing and reference-guided editing (using a reference image and instruction).
9
+
10
+ - **Paper:** [Kiwi-Edit: Versatile Video Editing via Instruction and Reference Guidance](https://huggingface.co/papers/2603.02175)
11
+ - **Project Page:** [https://showlab.github.io/Kiwi-Edit/](https://showlab.github.io/Kiwi-Edit/)
12
+ - **Repository:** [https://github.com/showlab/Kiwi-Edit](https://github.com/showlab/Kiwi-Edit)
13
+
14
+ ## Model Description
15
+
16
+ Kiwi-Edit introduces a unified editing architecture that synergizes learnable queries and latent visual features for reference semantic guidance. It addresses the challenge of precise visual control in instruction-based editing by allowing users to provide a reference image to guide the transformation. The framework achieves significant performance improvements in instruction following and reference fidelity through a scalable data generation pipeline and a multi-stage training curriculum.
17
+
18
+ ## Usage
19
+
20
+ This model is compatible with the `diffusers` library. To run inference, follow the installation instructions in the [official repository](https://github.com/showlab/Kiwi-Edit).
21
+
22
+ ### Quick Test with Diffusers
23
+
24
+ You can run a quick test on a demo video using the following command provided in the repository:
25
+
26
+ ```bash
27
+ python diffusers_demo.py \
28
+ --video_path ./demo_data/video/source/0005e4ad9f49814db1d3f2296b911abf.mp4 \
29
+ --prompt "Remove the monkey." \
30
+ --save_path output.mp4 \
31
+ --model_path linyq/kiwi-edit-5b-instruct-only-diffusers
32
+ ```
33
+
34
+ ## Citation
35
+
36
+ If you find this work useful, please cite:
37
+
38
+ ```bibtex
39
+ @misc{kiwiedit,
40
+ title={Kiwi-Edit: Versatile Video Editing via Instruction and Reference Guidance},
41
+ author={Yiqi Lin and Guoqiang Liang and Ziyun Zeng and Zechen Bai and Yanzhe Chen and Mike Zheng Shou},
42
+ year={2026},
43
+ eprint={2603.02175},
44
+ archivePrefix={arXiv},
45
+ primaryClass={cs.CV},
46
+ url={https://arxiv.org/abs/2603.02175},
47
+ }
48
+ ```
models/kiwi-edit/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from pipeline_kiwi_edit import KiwiEditPipeline
2
+ from mllm_encoder import MLLMEncoder
3
+ from conditional_embedder import ConditionalEmbedder
4
+ from wan_video_vae import VAE
models/kiwi-edit/conditional_embedder.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from diffusers import ModelMixin, ConfigMixin
4
+ from diffusers.configuration_utils import register_to_config
5
+
6
+
7
class ConditionalEmbedder(ModelMixin, ConfigMixin):
    """
    Patchifies VAE-encoded conditions (source video or reference image)
    into the DiT hidden dimension space via a single Conv3d layer whose
    stride equals its kernel size.
    """

    @register_to_config
    def __init__(
        self,
        in_dim: int = 48,
        dim: int = 3072,
        patch_size: list = [1, 2, 2],
        zero_init: bool = True,
        ref_pad_first: bool = False,
    ):
        super().__init__()
        # Stride == kernel turns the convolution into a (t, h, w) patchifier.
        patch = tuple(patch_size)
        self.patch_embedding = nn.Conv3d(
            in_dim, dim, kernel_size=patch, stride=patch
        )
        self.ref_pad_first = ref_pad_first
        if zero_init:
            # Zero init makes the conditioning branch contribute nothing at
            # the start of training (the pipeline adds its output residually).
            for param in (self.patch_embedding.weight, self.patch_embedding.bias):
                nn.init.zeros_(param)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # [B, in_dim, T, H, W] -> [B, dim, T/pt, H/ph, W/pw]
        return self.patch_embedding(x)
models/kiwi-edit/mllm_encoder.py ADDED
The diff for this file is too large to render. See raw diff
 
models/kiwi-edit/model_index.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": [
3
+ "pipeline_kiwi_edit",
4
+ "KiwiEditPipeline"
5
+ ],
6
+ "_diffusers_version": "0.32.0",
7
+ "processor": [
8
+ "transformers",
9
+ "AutoProcessor"
10
+ ],
11
+ "transformer": [
12
+ "diffusers",
13
+ "WanTransformer3DModel"
14
+ ],
15
+ "vae": [
16
+ "wan_video_vae",
17
+ "VAE"
18
+ ],
19
+ "scheduler": [
20
+ "diffusers",
21
+ "FlowMatchEulerDiscreteScheduler"
22
+ ],
23
+ "mllm_encoder": [
24
+ "mllm_encoder",
25
+ "MLLMEncoder"
26
+ ],
27
+ "source_embedder": [
28
+ "conditional_embedder",
29
+ "ConditionalEmbedder"
30
+ ],
31
+ "ref_embedder": [
32
+ "conditional_embedder",
33
+ "ConditionalEmbedder"
34
+ ]
35
+ }
models/kiwi-edit/pipeline_kiwi_edit.py ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import numpy as np
4
+ from typing import Optional, List, Union, Callable, Tuple
5
+ from PIL import Image, ImageOps
6
+ from einops import rearrange
7
+ from tqdm import tqdm
8
+ from diffusers import DiffusionPipeline
9
+
10
+
11
def sinusoidal_embedding_1d(dim, position):
    """Build a 1D sinusoidal positional embedding (used for timesteps).

    The first ``dim // 2`` channels hold cosines and the remaining half
    sines, over log-spaced frequencies ``10000**(-k / (dim // 2))``.
    Computation happens in float64, the result is cast back to
    ``position``'s dtype.
    """
    half = dim // 2
    # Log-spaced inverse frequencies, evaluated in float64 for precision.
    exponents = torch.arange(half, dtype=torch.float64, device=position.device) / half
    inv_freq = torch.pow(10000, -exponents)
    angles = torch.outer(position.type(torch.float64), inv_freq)
    emb = torch.cat([angles.cos(), angles.sin()], dim=1)
    return emb.to(position.dtype)
24
+
25
+
26
+ def _build_rope_3d(rope_module, f, h, w, device):
27
+ """
28
+ Build 3D RoPE (cos, sin) for a given (f, h, w) grid using the
29
+ WanRotaryPosEmbed module's precomputed buffers.
30
+
31
+ Returns:
32
+ (freqs_cos, freqs_sin) each of shape [1, f*h*w, 1, head_dim]
33
+ """
34
+ split_sizes = [rope_module.t_dim, rope_module.h_dim, rope_module.w_dim]
35
+ cos_parts = rope_module.freqs_cos.split(split_sizes, dim=1)
36
+ sin_parts = rope_module.freqs_sin.split(split_sizes, dim=1)
37
+
38
+ cos_f = cos_parts[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1)
39
+ cos_h = cos_parts[1][:h].view(1, h, 1, -1).expand(f, h, w, -1)
40
+ cos_w = cos_parts[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
41
+
42
+ sin_f = sin_parts[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1)
43
+ sin_h = sin_parts[1][:h].view(1, h, 1, -1).expand(f, h, w, -1)
44
+ sin_w = sin_parts[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
45
+
46
+ freqs_cos = torch.cat([cos_f, cos_h, cos_w], dim=-1).reshape(1, f * h * w, 1, -1).to(device)
47
+ freqs_sin = torch.cat([sin_f, sin_h, sin_w], dim=-1).reshape(1, f * h * w, 1, -1).to(device)
48
+ return freqs_cos, freqs_sin
49
+
50
+
51
class KiwiEditPipeline(DiffusionPipeline):
    """
    Pipeline for reference-guided video and image editing using KiwiEdit.

    This pipeline uses a Qwen2.5-VL multimodal LLM encoder for understanding
    editing instructions with source visual context, a WanTransformer3DModel
    for diffusion, and AutoencoderKLWan for VAE encoding/decoding.
    NOTE(review): model_index.json registers the custom ``wan_video_vae.VAE``
    class for the ``vae`` slot — confirm which VAE class ships here.

    Components are moved between CPU and GPU manually during ``__call__``
    so the full model set never has to be resident at once.

    Args:
        transformer: WanTransformer3DModel - DiT backbone for denoising.
        vae: AutoencoderKLWan - 3D causal VAE.
        scheduler: FlowMatchEulerDiscreteScheduler or compatible scheduler.
        mllm_encoder: MLLMEncoder - Qwen2.5-VL MLLM with learnable queries.
        processor: AutoProcessor - Qwen2.5-VL processor/tokenizer bundle.
        source_embedder: ConditionalEmbedder - VAE source conditioning.
        ref_embedder: ConditionalEmbedder - VAE reference conditioning.
    """
68
+
69
    # NOTE: model_cpu_offload_seq is NOT used -- manual offload in __call__
    # handles the out-of-order VAE calls and the interleaved denoising loop.
    model_cpu_offload_seq = "mllm_encoder->source_embedder->ref_embedder->transformer->vae"

    @property
    def _execution_device(self):
        """Override: always CUDA — manual offload handles component placement."""
        # NOTE(review): hard-codes CUDA, so this pipeline cannot run on a
        # CPU-only host as written — confirm that is intended.
        return torch.device("cuda")
77
+
78
    def _offload_to(self, components, device):
        """Move named components to device. components: list of attr names.

        Moving onto an accelerator also casts the component to bfloat16;
        moving to "cpu" keeps the current dtype and frees CUDA cache so the
        next component fits in VRAM. Missing/None attributes are skipped.
        """
        import gc
        for name in components:
            comp = getattr(self, name, None)
            if comp is not None:
                if str(device) != "cpu":
                    # On-device components run in bfloat16 to halve memory.
                    comp.to(device=device, dtype=torch.bfloat16)
                else:
                    comp.to(device)
        if str(device) == "cpu":
            # Reclaim the freed weights' VRAM immediately.
            gc.collect()
            torch.cuda.empty_cache()
91
+
92
+ def __init__(
93
+ self,
94
+ transformer,
95
+ vae,
96
+ scheduler,
97
+ mllm_encoder,
98
+ source_embedder,
99
+ ref_embedder,
100
+ processor=None,
101
+ ):
102
+ super().__init__()
103
+ if isinstance(processor, (list, tuple)):
104
+ # Diffusers may pass the raw model_index spec; let MLLMEncoder resolve it later.
105
+ processor = None
106
+ self.register_modules(
107
+ transformer=transformer,
108
+ vae=vae,
109
+ scheduler=scheduler,
110
+ mllm_encoder=mllm_encoder,
111
+ processor=processor,
112
+ source_embedder=source_embedder,
113
+ ref_embedder=ref_embedder,
114
+ )
115
+ if processor is not None:
116
+ self.mllm_encoder.processor = processor
117
+
118
+ # ------------------------------------------------------------------ #
119
+ # Helper utilities #
120
+ # ------------------------------------------------------------------ #
121
+
122
+ @staticmethod
123
+ def _check_resize(height, width, num_frames, h_div=16, w_div=16, t_div=4, t_rem=1):
124
+ """Round height/width/num_frames to valid values."""
125
+ if height % h_div != 0:
126
+ height = (height + h_div - 1) // h_div * h_div
127
+ if width % w_div != 0:
128
+ width = (width + w_div - 1) // w_div * w_div
129
+ if num_frames % t_div != t_rem:
130
+ num_frames = (num_frames + t_div - 1) // t_div * t_div + t_rem
131
+ return height, width, num_frames
132
+
133
+ @staticmethod
134
+ def _preprocess_image(image: Image.Image, dtype, device):
135
+ """Convert PIL Image to tensor in [-1, 1]."""
136
+ arr = np.array(image, dtype=np.float32)
137
+ tensor = torch.from_numpy(arr).to(dtype=dtype, device=device)
138
+ tensor = tensor / 127.5 - 1.0 # [0, 255] -> [-1, 1]
139
+ tensor = tensor.permute(2, 0, 1) # H W C -> C H W
140
+ return tensor
141
+
142
+ def _preprocess_video(self, frames: List[Image.Image], dtype, device):
143
+ """Convert list of PIL Images to tensor [1, C, T, H, W] in [-1, 1]."""
144
+ tensors = [self._preprocess_image(f, dtype, device) for f in frames]
145
+ video = torch.stack(tensors, dim=1) # C T H W
146
+ return video.unsqueeze(0) # 1 C T H W
147
+
148
+ @staticmethod
149
+ def _vae_output_to_video(vae_output):
150
+ """Convert VAE output tensor to list of PIL Images."""
151
+ # vae_output shape: [B, C, T, H, W] or [T, H, W, C]
152
+ if vae_output.dim() == 5:
153
+ vae_output = vae_output.squeeze(0).permute(1, 2, 3, 0) # T H W C
154
+ frames = []
155
+ for t in range(vae_output.shape[0]):
156
+ frame = ((vae_output[t] + 1.0) * 127.5).clamp(0, 255)
157
+ frame = frame.to(device="cpu", dtype=torch.uint8).numpy()
158
+ frames.append(Image.fromarray(frame))
159
+ return frames
160
+
161
+ # ------------------------------------------------------------------ #
162
+ # Custom Flow Match Scheduler #
163
+ # ------------------------------------------------------------------ #
164
+
165
+ def _setup_scheduler(self, num_inference_steps, denoising_strength=1.0, shift=5.0):
166
+ """
167
+ Set up flow-match sigmas and timesteps matching the original diffsynth
168
+ FlowMatchScheduler with extra_one_step=True and shift.
169
+ """
170
+ sigma_min = 0.003 / 1.002
171
+ sigma_max = 1.0
172
+ sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
173
+ # extra_one_step: generate N+1 points, drop last
174
+ sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
175
+ # Apply shift
176
+ sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
177
+ timesteps = sigmas * 1000 # num_train_timesteps = 1000
178
+ return sigmas, timesteps
179
+
180
+ def _scheduler_step(self, model_output, sigmas, step_index, sample):
181
+ """Euler step for flow matching."""
182
+ sigma = sigmas[step_index]
183
+ if step_index + 1 >= len(sigmas):
184
+ sigma_next = 0.0
185
+ else:
186
+ sigma_next = sigmas[step_index + 1]
187
+ return sample + model_output * (sigma_next - sigma)
188
+
189
+ def _scheduler_add_noise(self, original_samples, noise, sigmas, step_index):
190
+ """Add noise at given timestep for img2img / video2video."""
191
+ sigma = sigmas[step_index]
192
+ return (1 - sigma) * original_samples + sigma * noise
193
+
194
+ def _scheduler_get_sigma(self, timestep, sigmas, timesteps):
195
+ """Get sigma for a given timestep."""
196
+ timestep_id = torch.argmin((timesteps - timestep).abs())
197
+ return sigmas[timestep_id]
198
+
199
+ # ------------------------------------------------------------------ #
200
+ # Transformer forward helpers #
201
+ # ------------------------------------------------------------------ #
202
+
203
    def _model_forward(
        self,
        latents,
        timestep,
        context,
        vae_source_input=None,
        vae_ref_image=None,
        sigmas=None,
        timesteps_schedule=None,
    ):
        """
        Custom DiT forward pass that handles source/ref conditioning.
        Mirrors model_fn_wan_video from the original diffsynth pipeline.

        Args:
            latents: Noisy latents [B, C, F, H, W].
            timestep: Current diffusion timestep (1-element tensor).
            context: MLLM-encoded prompt embeddings, already in DiT dim.
            vae_source_input: Optional VAE latents of the source video;
                patch-embedded and added to the latent tokens, scaled by
                the current sigma.
            vae_ref_image: Optional list of VAE latents of reference images;
                appended to the token sequence for attention, then removed
                before unpatchify.
            sigmas: Full sigma schedule (required when vae_source_input is set).
            timesteps_schedule: Timesteps matching ``sigmas``.

        Returns:
            Prediction tensor with the same shape as ``latents``.
        """
        device = latents.device
        dtype = latents.dtype
        t = self.transformer

        # --- Timestep embedding ---
        timestep_emb = sinusoidal_embedding_1d(
            t.config.freq_dim, timestep
        ).to(dtype)
        time_emb = t.condition_embedder.time_embedder(timestep_emb)
        # diffusers time_proj = Linear only (SiLU is applied separately)
        t_mod = t.condition_embedder.time_proj(F.silu(time_emb)).unflatten(
            1, (6, t.config.num_attention_heads * t.config.attention_head_dim)
        )

        # --- Text/context embedding ---
        # NOTE: Do NOT apply text_embedder here. The MLLM encoder's connector
        # already projects to dit_dim. text_embedder is for raw text encoder
        # output (text_dim → dim), which doesn't apply to MLLM output.

        # --- Patchify latents ---
        x = latents
        if vae_source_input is not None:
            vae_source_cond = self.source_embedder(vae_source_input)
            x = t.patch_embedding(x)
            # Get sigma for this timestep: the source condition is scaled by
            # sigma, so its contribution shrinks as denoising completes.
            sigma = self._scheduler_get_sigma(timestep, sigmas, timesteps_schedule)
            x = x + vae_source_cond * sigma
        else:
            x = t.patch_embedding(x)

        # Token grid after patchify; kept for the final unpatchify.
        f, h, w = x.shape[2:]
        x = rearrange(x, "b c f h w -> b (f h w) c").contiguous()

        # --- 3D RoPE frequencies (real-valued cos/sin format) ---
        rotary_emb = _build_rope_3d(t.rope, f, h, w, device)

        # --- Reference image conditioning ---
        vae_ref_input_length = 0
        if vae_ref_image is not None:
            if len(vae_ref_image) > 1:
                vae_ref = torch.cat(vae_ref_image, dim=2)  # concat along temporal
            else:
                vae_ref = vae_ref_image[0]

            vae_ref = self.ref_embedder(vae_ref)
            ref_f, ref_h, ref_w = vae_ref.shape[2:]
            vae_ref = rearrange(vae_ref, "b c f h w -> b (f h w) c").contiguous()

            # Recompute RoPE for extended sequence (main + ref tokens)
            total_f = f + ref_f
            rotary_emb = _build_rope_3d(t.rope, total_f, h, w, device)

            vae_ref_input_length = vae_ref.shape[1]

            # Ref tokens go before or after the video tokens per the
            # embedder's ref_pad_first config.
            if self.ref_embedder.config.ref_pad_first:
                x = torch.cat([vae_ref, x], dim=1)
            else:
                x = torch.cat([x, vae_ref], dim=1)

        # --- Transformer blocks ---
        for block in t.blocks:
            x = block(x, context, t_mod, rotary_emb)

        # --- Output head ---
        # Match diffusers' FP32 norm + modulation + projection
        table = t.scale_shift_table
        shift, scale = (
            table.to(device=device) + time_emb.unsqueeze(1)
        ).chunk(2, dim=1)
        shift = shift.to(device=x.device)
        scale = scale.to(device=x.device)
        x = (t.norm_out(x.float()) * (1 + scale) + shift).type_as(x)
        x = t.proj_out(x)

        # --- Remove ref tokens from output ---
        if vae_ref_image is not None and vae_ref_input_length > 0:
            if self.ref_embedder.config.ref_pad_first:
                x = x[:, vae_ref_input_length:, :]
            else:
                x = x[:, :-vae_ref_input_length, :]

        # --- Unpatchify ---
        # Uses the pre-ref (f, h, w) grid, which matches the remaining tokens.
        patch_size = t.config.patch_size
        x = rearrange(
            x,
            "b (f h w) (x y z c) -> b c (f x) (h y) (w z)",
            f=f, h=h, w=w,
            x=patch_size[0], y=patch_size[1], z=patch_size[2],
        )
        return x
307
+
308
+ # ------------------------------------------------------------------ #
309
+ # Main __call__ #
310
+ # ------------------------------------------------------------------ #
311
+
312
    @torch.no_grad()
    def __call__(
        self,
        prompt: str,
        source_video: Optional[List[Image.Image]] = None,
        source_input: Optional[List[Image.Image]] = None,
        ref_image: Optional[List[Image.Image]] = None,
        negative_prompt: Optional[str] = "",
        input_video: Optional[List[Image.Image]] = None,
        height: int = 480,
        width: int = 832,
        num_frames: int = 81,
        num_inference_steps: int = 50,
        guidance_scale: float = 1.0,
        sigma_shift: float = 5.0,
        denoising_strength: float = 1.0,
        seed: Optional[int] = None,
        tiled: bool = True,
        tile_size: Tuple[int, int] = (30, 52),
        tile_stride: Tuple[int, int] = (15, 26),
        output_type: str = "pil",
        progress_bar: Callable = tqdm,
    ) -> List[Image.Image]:
        """
        Run KiwiEdit inference.

        Args:
            prompt: Editing instruction text.
            source_video: Source video/image frames for MLLM context (also used as
                source_input if source_input is not provided).
            source_input: Source frames for VAE conditioning. If None but source_video
                is provided, source_video is used.
            ref_image: Optional reference image(s) for guided editing.
            negative_prompt: Negative prompt for CFG.
                NOTE(review): currently unused — the CFG branch below is
                commented out, so this argument has no effect.
            input_video: Optional input video for video-to-video (adds noise then denoises).
            height: Output height in pixels.
            width: Output width in pixels.
            num_frames: Number of output frames (1 for image editing).
            num_inference_steps: Number of denoising steps.
            guidance_scale: Classifier-free guidance scale.
                NOTE(review): currently unused for the same reason.
            sigma_shift: Flow matching shift parameter.
            denoising_strength: How much noise to add (1.0 = full noise).
            seed: Random seed for reproducibility.
            tiled: Whether to use tiled VAE encoding/decoding.
            tile_size: VAE tile size.
            tile_stride: VAE tile stride.
                NOTE(review): tiled/tile_size/tile_stride are never passed to
                the VAE calls in this body — confirm whether tiling is wired up.
            output_type: "pil" for PIL Images, "latent" for raw latents.
            progress_bar: Progress bar callable (e.g., tqdm).

        Returns:
            List of PIL Images (video frames), or the latent tensor when
            output_type == "latent".
        """
        device = self._execution_device
        dtype = torch.bfloat16
        # --- 1. Shape check ---
        # VAE spatial factor is 16, transformer patch spatial is 2,
        # so pixel dims must be multiples of 32.
        height, width, num_frames = self._check_resize(
            height, width, num_frames, h_div=32, w_div=32
        )

        # --- 2. Determine VAE parameters ---
        z_dim = self.vae.config.z_dim
        # Compute upsampling factor from VAE config
        dim_mult = self.vae.config.get("dim_mult", [1, 2, 4, 4])
        # "temperal_downsample" spelling matches the upstream config key.
        temporal_downsample = self.vae.config.get("temperal_downsample", [False, True, True])
        # Wan VideoVAE spatial factor is 2^(len(dim_mult)) due to extra
        # downsampling in the encoder beyond the level transitions.
        spatial_factor = 2 ** len(dim_mult)  # 16 for 4 levels
        temporal_factor = 2 ** sum(temporal_downsample)  # 4 for [F, T, T]

        # --- 3. MLLM encoding (move mllm_encoder to CUDA, ~6.5 GB) ---
        self._offload_to(["mllm_encoder"], device)
        context = None
        src_video_for_mllm = source_video
        if src_video_for_mllm is not None:
            self.mllm_encoder._ensure_qwen_loaded()
            if ref_image is not None:
                # Ref mode always uses the video path (even for a single frame)
                context = self.mllm_encoder(
                    prompt, src_video=src_video_for_mllm, ref_image=ref_image
                )
            elif len(src_video_for_mllm) == 1:
                context = self.mllm_encoder(
                    prompt, src_image=src_video_for_mllm
                )
            else:
                context = self.mllm_encoder(
                    prompt, src_video=src_video_for_mllm
                )
        # For negative prompt: use zero context
        # NOTE(review): context_nega is never populated; it is only consumed
        # by the commented-out CFG branch below.
        context_nega = None
        # Move context to CPU while we do VAE encoding (will move back for denoising)
        if context is not None:
            context = context.cpu()
        self._offload_to(["mllm_encoder"], "cpu")

        # --- 4. Setup scheduler ---
        sigmas, timesteps = self._setup_scheduler(
            num_inference_steps, denoising_strength, sigma_shift
        )
        sigmas = sigmas.to(device)
        timesteps = timesteps.to(device)

        # --- 5. Initialize noise ---
        latent_length = (num_frames - 1) // temporal_factor + 1
        latent_h = height // spatial_factor
        latent_w = width // spatial_factor
        shape = (1, z_dim, latent_length, latent_h, latent_w)

        # Noise is drawn on CPU so a fixed seed reproduces across GPUs.
        generator = None if seed is None else torch.Generator("cpu").manual_seed(seed)
        noise = torch.randn(shape, generator=generator, device="cpu", dtype=torch.float32)
        noise = noise.to(dtype=dtype, device=device)

        # --- 6. Encode source input (move VAE to CUDA, ~0.4 GB) ---
        self._offload_to(["vae"], device)
        vae_source_input = None
        # Fall back to source_video if source_input not provided
        src_for_vae = source_input if source_input is not None else source_video
        if src_for_vae is not None:
            src_frames = [src_for_vae[i] for i in range(min(num_frames, len(src_for_vae)))]
            # Resize source frames to match the (possibly adjusted) target dimensions
            src_frames = [f.resize((width, height), Image.LANCZOS) for f in src_frames]
            src_tensor = self._preprocess_video(src_frames, dtype=torch.float32, device=device)
            vae_source_input = self.vae.encode(src_tensor).latent_dist.sample()
            vae_source_input = vae_source_input.to(dtype=dtype)

        # --- 7. Encode reference images ---
        vae_ref_image = None
        if ref_image is not None:
            vae_ref_image = []
            for item in ref_image:
                target_size = (width, height)
                # Letterbox the reference onto a white canvas to keep aspect.
                item = ImageOps.pad(item, target_size, color="white", centering=(0.5, 0.5))
                ref_tensor = self._preprocess_video([item], dtype=torch.float32, device=device)
                ref_latent = self.vae.encode(ref_tensor).latent_dist.sample()
                vae_ref_image.append(ref_latent.to(dtype=dtype))

        # --- 8. Handle input_video (video-to-video) ---
        if input_video is not None:
            input_tensor = self._preprocess_video(input_video, dtype=torch.float32, device=device)
            input_latents = self.vae.encode(input_tensor).latent_dist.sample()
            input_latents = input_latents.to(dtype=dtype)
            latents = self._scheduler_add_noise(input_latents, noise, sigmas, 0)
        else:
            latents = noise

        # --- Offload VAE, load denoising components (~10.5 GB) ---
        self._offload_to(["vae"], "cpu")
        self._offload_to(["source_embedder", "ref_embedder", "transformer"], device)
        # Move context back to CUDA for denoising
        if context is not None:
            context = context.to(device=device, dtype=dtype)

        # --- 9. Denoising loop ---
        for step_idx, timestep_val in enumerate(progress_bar(timesteps)):
            timestep = timestep_val.unsqueeze(0).to(dtype=dtype, device=device)

            # Positive prediction
            noise_pred = self._model_forward(
                latents=latents,
                timestep=timestep,
                context=context,
                vae_source_input=vae_source_input,
                vae_ref_image=vae_ref_image,
                sigmas=sigmas,
                timesteps_schedule=timesteps,
            )

            # CFG — intentionally disabled; guidance_scale currently has no
            # effect. Kept for reference:
            # if guidance_scale != 1.0:
            #     noise_pred_nega = self._model_forward(
            #         latents=latents,
            #         timestep=timestep,
            #         context=context_nega,
            #         vae_source_input=vae_source_input,
            #         vae_ref_image=vae_ref_image,
            #         sigmas=sigmas,
            #         timesteps_schedule=timesteps,
            #     )
            #     noise_pred = noise_pred_nega + guidance_scale * (
            #         noise_pred_posi - noise_pred_nega
            #     )
            # else:
            #     noise_pred = noise_pred_posi

            # Scheduler step
            latents = self._scheduler_step(noise_pred, sigmas, step_idx, latents)

        # --- 10. Decode (offload denoising components, load VAE) ---
        self._offload_to(["source_embedder", "ref_embedder", "transformer"], "cpu")
        self._offload_to(["vae"], device)

        if output_type == "latent":
            return latents

        video = self.vae.decode(latents).sample
        video = self._vae_output_to_video(video)
        return video
models/kiwi-edit/wan_video_vae.py ADDED
@@ -0,0 +1,1486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional
3
+
4
+ from einops import rearrange, repeat
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from tqdm import tqdm
10
+ from diffusers import ModelMixin, ConfigMixin
11
+ from diffusers.configuration_utils import register_to_config
12
+
13
+ CACHE_T = 2
14
+
15
+
16
def check_is_instance(model, module_class):
    """Return True when `model`, or the object wrapped under `model.module`
    (e.g. by a parallel-training wrapper), is an instance of `module_class`."""
    candidates = [model]
    if hasattr(model, "module"):
        candidates.append(model.module)
    return any(isinstance(candidate, module_class) for candidate in candidates)
22
+
23
+
24
def block_causal_mask(x, block_size):
    """Build a block-causal attention mask for `x` of shape [B, N, S, *].

    Each query position may attend to every position in its own block and in
    all earlier blocks. S must be divisible by `block_size`.
    """
    b, n, s, _, device = *x.size(), x.device
    assert s % block_size == 0

    # Block index of each sequence position: [0, 0, ..., 1, 1, ...].
    block_ids = torch.arange(s, device=device) // block_size
    # Allowed when query block >= key block.
    mask = block_ids.unsqueeze(1) >= block_ids.unsqueeze(0)
    return mask.view(1, 1, s, s).expand(b, n, s, s).contiguous()
36
+
37
+
38
class CausalConv3d(nn.Conv3d):
    """3D convolution that is causal along the temporal axis.

    Spatial padding stays symmetric, but all temporal padding is applied on
    the left (past) side so outputs never depend on future frames.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        pad_t, pad_h, pad_w = self.padding
        # F.pad ordering: (w_left, w_right, h_left, h_right, t_left, t_right);
        # temporal padding is doubled on the left, zero on the right.
        self._padding = (pad_w, pad_w, pad_h, pad_h, 2 * pad_t, 0)
        # Disable the built-in symmetric padding; forward() pads manually.
        self.padding = (0, 0, 0)

    def forward(self, x, cache_x=None):
        """Apply the causal conv; `cache_x` supplies already-computed past
        frames so streaming chunks match a full-sequence pass."""
        pad = list(self._padding)
        if cache_x is not None and self._padding[4] > 0:
            # Prepend cached past frames instead of zero padding them.
            x = torch.cat([cache_x.to(x.device), x], dim=2)
            pad[4] -= cache_x.shape[2]
        return super().forward(F.pad(x, pad))
58
+
59
+
60
class RMS_norm(nn.Module):
    """RMS normalization with a learnable per-channel gain and optional bias.

    Normalizes along the channel axis (dim 1 when `channel_first`, otherwise
    the last dim) and rescales by sqrt(dim) times a learned gamma.
    """

    def __init__(self, dim, channel_first=True, images=True, bias=False):
        super().__init__()
        # Channel-first video params need one more trailing singleton than
        # image params so gamma/bias broadcast over all remaining axes.
        if channel_first:
            shape = (dim, 1, 1) if images else (dim, 1, 1, 1)
        else:
            shape = (dim,)

        self.channel_first = channel_first
        self.scale = dim**0.5
        self.gamma = nn.Parameter(torch.ones(shape))
        # Bias stays a plain 0.0 (not a Parameter) when disabled.
        self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.

    def forward(self, x):
        norm_dim = 1 if self.channel_first else -1
        normalized = F.normalize(x, dim=norm_dim)
        return normalized * self.scale * self.gamma + self.bias
76
+
77
+
78
class Upsample(nn.Upsample):
    """nn.Upsample wrapper that fixes bfloat16 support for nearest-neighbor
    interpolation by computing in float32 and casting back."""

    def forward(self, x):
        upsampled = super().forward(x.float())
        return upsampled.type_as(x)
85
+
86
+
87
class Resample(nn.Module):
    """Spatial/temporal resampling block for the causal video VAE.

    ``mode`` selects the layer layout:
      * 'none'         : identity.
      * 'upsample2d'   : 2x nearest-exact spatial upsample + 3x3 conv that
                         halves the channel count.
      * 'upsample3d'   : as 'upsample2d', plus a causal temporal conv
                         (``time_conv``) that doubles channels so forward()
                         can interleave them into 2x as many frames.
      * 'downsample2d' : asymmetric zero-pad + 3x3 conv with stride 2.
      * 'downsample3d' : as 'downsample2d', plus a stride-2 causal
                         temporal conv.
    """

    def __init__(self, dim, mode):
        assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
                        'downsample3d')
        super().__init__()
        self.dim = dim
        self.mode = mode

        # layers
        if mode == 'upsample2d':
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                nn.Conv2d(dim, dim // 2, 3, padding=1))
        elif mode == 'upsample3d':
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                nn.Conv2d(dim, dim // 2, 3, padding=1))
            self.time_conv = CausalConv3d(dim,
                                          dim * 2, (3, 1, 1),
                                          padding=(1, 0, 0))

        elif mode == 'downsample2d':
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)),
                nn.Conv2d(dim, dim, 3, stride=(2, 2)))
        elif mode == 'downsample3d':
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)),
                nn.Conv2d(dim, dim, 3, stride=(2, 2)))
            self.time_conv = CausalConv3d(dim,
                                          dim, (3, 1, 1),
                                          stride=(2, 1, 1),
                                          padding=(0, 0, 0))

        else:
            self.resample = nn.Identity()

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        """Resample x (B, C, T, H, W), optionally streaming via feat_cache.

        feat_cache/feat_idx implement chunked causal inference: each
        temporal conv reads frames cached from the previous chunk and
        stores this chunk's trailing frames for the next call.
        """
        b, c, t, h, w = x.size()
        if self.mode == 'upsample3d':
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    # First chunk: record the 'Rep' sentinel so the next
                    # chunk knows there is no real history yet.
                    feat_cache[idx] = 'Rep'
                    feat_idx[0] += 1
                else:

                    cache_x = x[:, :, -CACHE_T:, :, :].clone()
                    if cache_x.shape[2] < 2 and feat_cache[
                            idx] is not None and feat_cache[idx] != 'Rep':
                        # cache last frame of last two chunk
                        cache_x = torch.cat([
                            feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                                cache_x.device), cache_x
                        ],
                                            dim=2)
                    if cache_x.shape[2] < 2 and feat_cache[
                            idx] is not None and feat_cache[idx] == 'Rep':
                        # No real history available: pad the cache with zeros.
                        cache_x = torch.cat([
                            torch.zeros_like(cache_x).to(cache_x.device),
                            cache_x
                        ],
                                            dim=2)
                    if feat_cache[idx] == 'Rep':
                        x = self.time_conv(x)
                    else:
                        x = self.time_conv(x, feat_cache[idx])
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1

                    # Split the doubled channels into two frame sets and
                    # interleave them along time: T -> 2T.
                    x = x.reshape(b, 2, c, t, h, w)
                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
                                    3)
                    x = x.reshape(b, c, t * 2, h, w)
        t = x.shape[2]
        # Apply the 2D resampling path frame by frame.
        x = rearrange(x, 'b c t h w -> (b t) c h w')
        x = self.resample(x)
        x = rearrange(x, '(b t) c h w -> b c t h w', t=t)

        if self.mode == 'downsample3d':
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    feat_cache[idx] = x.clone()
                    feat_idx[0] += 1
                else:
                    cache_x = x[:, :, -1:, :, :].clone()
                    # Prepend the last cached frame for the stride-2 conv.
                    x = self.time_conv(
                        torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1
        return x

    def init_weight(self, conv):
        # Identity-style init for a temporal conv: each input channel is
        # passed straight through at the middle temporal tap.
        conv_weight = conv.weight
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        one_matrix = torch.eye(c1, c2)
        init_matrix = one_matrix
        nn.init.zeros_(conv_weight)
        conv_weight.data[:, :, 1, 0, 0] = init_matrix
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)

    def init_weight2(self, conv):
        # Identity init for a channel-doubling temporal conv: both output
        # halves copy the input at the last temporal tap.
        conv_weight = conv.weight.data
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        init_matrix = torch.eye(c1 // 2, c2)
        conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
        conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)
201
+
202
+
203
+
204
def patchify(x, patch_size):
    """Fold patch_size x patch_size spatial patches into the channel axis.

    Accepts image tensors (B, C, H, W) or video tensors (B, C, F, H, W);
    a patch_size of 1 is a no-op.

    Raises:
        ValueError: if x is neither 4-D nor 5-D.
    """
    if patch_size == 1:
        return x
    if x.dim() == 4:
        return rearrange(x, "b c (h q) (w r) -> b (c r q) h w",
                         q=patch_size, r=patch_size)
    if x.dim() == 5:
        return rearrange(x, "b c f (h q) (w r) -> b (c r q) f h w",
                         q=patch_size, r=patch_size)
    raise ValueError(f"Invalid input shape: {x.shape}")
217
+
218
+
219
def unpatchify(x, patch_size):
    """Inverse of patchify: unfold channel-packed patches back to the grid.

    Accepts image tensors (B, C*p*p, H, W) or video tensors
    (B, C*p*p, F, H, W); a patch_size of 1 is a no-op.

    Raises:
        ValueError: if x is neither 4-D nor 5-D.  (Previously such inputs
        were silently returned unchanged, unlike patchify which raises.)
    """
    if patch_size == 1:
        return x
    if x.dim() == 4:
        x = rearrange(x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
    elif x.dim() == 5:
        x = rearrange(x,
                      "b (c r q) f h w -> b c f (h q) (w r)",
                      q=patch_size,
                      r=patch_size)
    else:
        # Keep behavior consistent with patchify() for unsupported ranks.
        raise ValueError(f"Invalid input shape: {x.shape}")
    return x
230
+
231
+
232
class Resample38(Resample):
    """Resample variant for the Wan 2.2 (38-channel) VAE.

    Differs from Resample in that the upsampling convs keep the channel
    count (dim -> dim) instead of halving it.  forward() and the
    weight-init helpers are inherited from Resample unchanged.
    """

    def __init__(self, dim, mode):
        assert mode in (
            "none",
            "upsample2d",
            "upsample3d",
            "downsample2d",
            "downsample3d",
        )
        # Intentionally bypass Resample.__init__ (it would build convs with
        # the wrong channel shapes); initialize nn.Module directly.
        super(Resample, self).__init__()
        self.dim = dim
        self.mode = mode

        # layers
        if mode == "upsample2d":
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
                nn.Conv2d(dim, dim, 3, padding=1),
            )
        elif mode == "upsample3d":
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
                nn.Conv2d(dim, dim, 3, padding=1),
            )
            self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
        elif mode == "downsample2d":
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
            )
        elif mode == "downsample3d":
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
            )
            self.time_conv = CausalConv3d(
                dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)
            )
        else:
            self.resample = nn.Identity()
271
+
272
class ResidualBlock(nn.Module):
    """Pre-norm residual block of two causal 3D convolutions.

    RMS_norm -> SiLU -> CausalConv3d -> RMS_norm -> SiLU -> Dropout ->
    CausalConv3d, with a 1x1x1 CausalConv3d shortcut when the channel
    count changes (identity otherwise).
    """

    def __init__(self, in_dim, out_dim, dropout=0.0):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim

        # layers
        self.residual = nn.Sequential(
            RMS_norm(in_dim, images=False), nn.SiLU(),
            CausalConv3d(in_dim, out_dim, 3, padding=1),
            RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
            CausalConv3d(out_dim, out_dim, 3, padding=1))
        self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
            if in_dim != out_dim else nn.Identity()

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        """Run the block; feat_cache/feat_idx enable chunked causal inference."""
        h = self.shortcut(x)
        for layer in self.residual:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                # Keep trailing CACHE_T frames as history for the next chunk.
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat([
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                            cache_x.device), cache_x
                    ],
                                        dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x + h
307
+
308
+
309
class AttentionBlock(nn.Module):
    """
    Causal self-attention with a single head.

    Attention is applied per frame: the (B, C, T, H, W) input is folded to
    (B*T, C, H, W) and each frame's H*W positions attend to one another.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

        # layers
        self.norm = RMS_norm(dim)
        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
        self.proj = nn.Conv2d(dim, dim, 1)

        # zero out the last layer params so the block starts as an identity
        # on top of the residual connection
        nn.init.zeros_(self.proj.weight)

    def forward(self, x):
        identity = x
        b, c, t, h, w = x.size()
        x = rearrange(x, 'b c t h w -> (b t) c h w')
        x = self.norm(x)
        # compute query, key, value: each (B*T, 1 head, H*W tokens, C)
        q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3, -1).permute(
            0, 1, 3, 2).contiguous().chunk(3, dim=-1)

        # apply attention
        x = F.scaled_dot_product_attention(
            q,
            k,
            v,
            #attn_mask=block_causal_mask(q, block_size=h * w)
        )
        x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)

        # output
        x = self.proj(x)
        x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
        return x + identity
348
+
349
+
350
class AvgDown3D(nn.Module):
    """Average-pool downsampling with the pooled cells folded into channels.

    Splits the (T, H, W) grid into factor_t x factor_s x factor_s cells,
    moves each cell offset into the channel axis, then averages groups of
    expanded channels down to out_channels.  The time axis is zero-padded
    at the front so T becomes divisible by factor_t.
    """

    def __init__(self, in_channels, out_channels, factor_t, factor_s=1):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.factor_t = factor_t
        self.factor_s = factor_s
        self.factor = factor_t * factor_s * factor_s

        # Expanded channels must divide evenly into the output channels.
        assert in_channels * self.factor % out_channels == 0
        self.group_size = in_channels * self.factor // out_channels

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Zero-pad the front of the time axis up to a multiple of factor_t.
        pad_t = (-x.shape[2]) % self.factor_t
        x = F.pad(x, (0, 0, 0, 0, pad_t, 0))
        batch, chans, frames, height, width = x.shape
        # Expose one axis per pooling factor.
        x = x.view(
            batch, chans,
            frames // self.factor_t, self.factor_t,
            height // self.factor_s, self.factor_s,
            width // self.factor_s, self.factor_s,
        )
        # Bring the factor axes next to the channel axis, then merge them.
        x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
        x = x.view(
            batch, chans * self.factor,
            frames // self.factor_t,
            height // self.factor_s,
            width // self.factor_s,
        )
        # Average channel groups down to out_channels.
        x = x.view(
            batch, self.out_channels, self.group_size,
            frames // self.factor_t,
            height // self.factor_s,
            width // self.factor_s,
        )
        return x.mean(dim=2)
401
+
402
+
403
class DupUp3D(nn.Module):
    """Duplication-based upsampling (approximate inverse of AvgDown3D).

    Each input channel is repeated, then the extra channels are unfolded
    into factor_t x factor_s x factor_s blocks of the output grid.  For
    the first chunk of a causal video stream, the leading factor_t - 1
    duplicated frames are dropped.
    """

    def __init__(self, in_channels: int, out_channels: int, factor_t, factor_s=1):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        self.factor_t = factor_t
        self.factor_s = factor_s
        self.factor = factor_t * factor_s * factor_s

        # Input channels must expand evenly into out_channels * factor.
        assert out_channels * self.factor % in_channels == 0
        self.repeats = out_channels * self.factor // in_channels

    def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
        batch = x.size(0)
        frames, height, width = x.size(2), x.size(3), x.size(4)
        x = x.repeat_interleave(self.repeats, dim=1)
        # Split channels into (out_channels, t, s, s) offsets.
        x = x.view(
            batch, self.out_channels,
            self.factor_t, self.factor_s, self.factor_s,
            frames, height, width,
        )
        # Interleave each offset with its grid axis.
        x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
        x = x.view(
            batch, self.out_channels,
            frames * self.factor_t,
            height * self.factor_s,
            width * self.factor_s,
        )
        if first_chunk:
            # Drop the duplicated lead-in frames of a causal stream.
            x = x[:, :, self.factor_t - 1:, :, :]
        return x
445
+
446
+
447
class Down_ResidualBlock(nn.Module):
    """Downsampling stage: residual blocks (+ optional learned downsample)
    on the main path, parameter-free AvgDown3D on the shortcut path."""

    def __init__(self, in_dim, out_dim, dropout, mult,
                 temperal_downsample=False, down_flag=False):
        super().__init__()

        # Shortcut that matches the main path's output resolution.
        self.avg_shortcut = AvgDown3D(
            in_dim,
            out_dim,
            factor_t=2 if temperal_downsample else 1,
            factor_s=2 if down_flag else 1,
        )

        # Main path: `mult` residual blocks, then an optional downsample.
        stages = []
        dim = in_dim
        for _ in range(mult):
            stages.append(ResidualBlock(dim, out_dim, dropout))
            dim = out_dim
        if down_flag:
            resample_mode = "downsample3d" if temperal_downsample else "downsample2d"
            stages.append(Resample38(out_dim, mode=resample_mode))
        self.downsamples = nn.Sequential(*stages)

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        shortcut_input = x.clone()
        for stage in self.downsamples:
            x = stage(x, feat_cache, feat_idx)
        return x + self.avg_shortcut(shortcut_input)
480
+
481
+
482
class Up_ResidualBlock(nn.Module):
    """Upsampling stage: residual blocks (+ optional learned upsample) on
    the main path, parameter-free DupUp3D on the shortcut path (only when
    upsampling)."""

    def __init__(self, in_dim, out_dim, dropout, mult,
                 temperal_upsample=False, up_flag=False):
        super().__init__()

        # Shortcut is only needed when the resolution changes.
        if up_flag:
            self.avg_shortcut = DupUp3D(
                in_dim,
                out_dim,
                factor_t=2 if temperal_upsample else 1,
                factor_s=2 if up_flag else 1,
            )
        else:
            self.avg_shortcut = None

        # Main path: `mult` residual blocks, then an optional upsample.
        stages = []
        dim = in_dim
        for _ in range(mult):
            stages.append(ResidualBlock(dim, out_dim, dropout))
            dim = out_dim
        if up_flag:
            resample_mode = "upsample3d" if temperal_upsample else "upsample2d"
            stages.append(Resample38(out_dim, mode=resample_mode))
        self.upsamples = nn.Sequential(*stages)

    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
        out = x.clone()
        for stage in self.upsamples:
            out = stage(out, feat_cache, feat_idx)
        if self.avg_shortcut is None:
            return out
        return out + self.avg_shortcut(x, first_chunk)
520
+
521
+
522
class Encoder3d(nn.Module):
    """Causal 3D encoder (Wan 2.1 VAE): video -> latent features.

    Per resolution level: num_res_blocks ResidualBlocks (with optional
    attention at the scales listed in attn_scales), then a Resample that
    halves the spatial size (and the temporal size where
    temperal_downsample[i] is True).  A middle attention stage and a
    projection head produce z_dim output channels.
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[True, True, False],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample

        # channel widths per level
        dims = [dim * u for u in [1] + dim_mult]
        scale = 1.0

        # init block (RGB input)
        self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)

        # downsample blocks
        downsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            # residual (+attention) blocks
            for _ in range(num_res_blocks):
                downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
                if scale in attn_scales:
                    downsamples.append(AttentionBlock(out_dim))
                in_dim = out_dim

            # downsample block (skipped at the deepest level)
            if i != len(dim_mult) - 1:
                mode = 'downsample3d' if temperal_downsample[
                    i] else 'downsample2d'
                downsamples.append(Resample(out_dim, mode=mode))
                scale /= 2.0
        self.downsamples = nn.Sequential(*downsamples)

        # middle blocks
        self.middle = nn.Sequential(ResidualBlock(out_dim, out_dim, dropout),
                                    AttentionBlock(out_dim),
                                    ResidualBlock(out_dim, out_dim, dropout))

        # output blocks
        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
                                  CausalConv3d(out_dim, z_dim, 3, padding=1))

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        """Encode x (B, 3, T, H, W); feat_cache/feat_idx enable chunked
        causal inference across temporal chunks."""
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat([
                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                        cache_x.device), cache_x
                ],
                                    dim=2)
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        ## downsamples
        for layer in self.downsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## middle
        for layer in self.middle:
            if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## head
        for layer in self.head:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat([
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                            cache_x.device), cache_x
                    ],
                                        dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x
623
+
624
+
625
class Encoder3d_38(nn.Module):
    """Causal 3D encoder for the Wan 2.2 VAE (patchified 12-channel input).

    Uses Down_ResidualBlock stages (residual main path + AvgDown3D
    shortcut) instead of plain ResidualBlock/Resample pairs.
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample

        # channel widths per level
        dims = [dim * u for u in [1] + dim_mult]
        scale = 1.0

        # init block; 12 input channels = 3 RGB x 2x2 spatial patchify
        self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)

        # downsample blocks
        downsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            t_down_flag = (
                temperal_downsample[i] if i < len(temperal_downsample) else False
            )
            downsamples.append(
                Down_ResidualBlock(
                    in_dim=in_dim,
                    out_dim=out_dim,
                    dropout=dropout,
                    mult=num_res_blocks,
                    temperal_downsample=t_down_flag,
                    down_flag=i != len(dim_mult) - 1,
                )
            )
            scale /= 2.0
        self.downsamples = nn.Sequential(*downsamples)

        # middle blocks
        self.middle = nn.Sequential(
            ResidualBlock(out_dim, out_dim, dropout),
            AttentionBlock(out_dim),
            ResidualBlock(out_dim, out_dim, dropout),
        )

        # # output blocks
        self.head = nn.Sequential(
            RMS_norm(out_dim, images=False),
            nn.SiLU(),
            CausalConv3d(out_dim, z_dim, 3, padding=1),
        )


    def forward(self, x, feat_cache=None, feat_idx=[0]):
        """Encode x (B, 12, T, H, W); feat_cache/feat_idx enable chunked
        causal inference."""

        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # Re-use the last cached frame when this chunk is short.
                cache_x = torch.cat(
                    [
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
                        cache_x,
                    ],
                    dim=2,
                )
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        ## downsamples
        for layer in self.downsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## middle
        # NOTE(review): uses plain isinstance here while sibling encoders
        # use check_is_instance — presumably equivalent; confirm.
        for layer in self.middle:
            if isinstance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## head
        for layer in self.head:
            if isinstance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    cache_x = torch.cat(
                        [
                            feat_cache[idx][:, :, -1, :, :]
                            .unsqueeze(2)
                            .to(cache_x.device),
                            cache_x,
                        ],
                        dim=2,
                    )
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)

        return x
739
+
740
+
741
class Decoder3d(nn.Module):
    """Causal 3D decoder (Wan 2.1 VAE): latent -> video.

    Mirrors Encoder3d: a middle attention stage, then per-level residual
    blocks with spatial (and optionally temporal) upsampling, and a head
    projecting to 3 RGB channels.
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_upsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_upsample = temperal_upsample

        # channel widths (reversed relative to the encoder)
        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
        scale = 1.0 / 2**(len(dim_mult) - 2)

        # init block
        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)

        # middle blocks
        self.middle = nn.Sequential(ResidualBlock(dims[0], dims[0], dropout),
                                    AttentionBlock(dims[0]),
                                    ResidualBlock(dims[0], dims[0], dropout))

        # upsample blocks
        upsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            # residual (+attention) blocks
            # Resample halves the channels on upsampling, so levels after
            # the first start from half the nominal width.
            if i == 1 or i == 2 or i == 3:
                in_dim = in_dim // 2
            for _ in range(num_res_blocks + 1):
                upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
                if scale in attn_scales:
                    upsamples.append(AttentionBlock(out_dim))
                in_dim = out_dim

            # upsample block (skipped at the last level)
            if i != len(dim_mult) - 1:
                mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
                upsamples.append(Resample(out_dim, mode=mode))
                scale *= 2.0
        self.upsamples = nn.Sequential(*upsamples)

        # output blocks
        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
                                  CausalConv3d(out_dim, 3, 3, padding=1))

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        """Decode x (B, z_dim, T, H, W); feat_cache/feat_idx enable chunked
        causal inference."""
        ## conv1
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat([
                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                        cache_x.device), cache_x
                ],
                                    dim=2)
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        ## middle
        for layer in self.middle:
            if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## upsamples
        for layer in self.upsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## head
        for layer in self.head:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat([
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                            cache_x.device), cache_x
                    ],
                                        dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x
844
+
845
+
846
+
847
class Decoder3d_38(nn.Module):
    """Causal 3D decoder for the Wan 2.2 VAE (patchified 12-channel output).

    Uses Up_ResidualBlock stages (residual main path + DupUp3D shortcut);
    the 12 head channels are unpatchified to RGB by the caller.
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_upsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_upsample = temperal_upsample

        # channel widths (reversed relative to the encoder)
        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
        scale = 1.0 / 2 ** (len(dim_mult) - 2)
        # init block
        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)

        # middle blocks
        self.middle = nn.Sequential(ResidualBlock(dims[0], dims[0], dropout),
                                    AttentionBlock(dims[0]),
                                    ResidualBlock(dims[0], dims[0], dropout))

        # upsample blocks
        upsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            t_up_flag = temperal_upsample[i] if i < len(temperal_upsample) else False
            upsamples.append(
                Up_ResidualBlock(in_dim=in_dim,
                                 out_dim=out_dim,
                                 dropout=dropout,
                                 mult=num_res_blocks + 1,
                                 temperal_upsample=t_up_flag,
                                 up_flag=i != len(dim_mult) - 1))
        self.upsamples = nn.Sequential(*upsamples)

        # output blocks; 12 channels = 3 RGB x 2x2 spatial patches
        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
                                  CausalConv3d(out_dim, 12, 3, padding=1))


    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
        """Decode x (B, z_dim, T, H, W); first_chunk trims the duplicated
        lead-in frames produced by DupUp3D shortcuts."""
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # Re-use the last cached frame when this chunk is short.
                cache_x = torch.cat(
                    [
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
                        cache_x,
                    ],
                    dim=2,
                )
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        ## middle
        for layer in self.middle:
            if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## upsamples
        for layer in self.upsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx, first_chunk)
            else:
                x = layer(x)

        ## head
        for layer in self.head:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    cache_x = torch.cat(
                        [
                            feat_cache[idx][:, :, -1, :, :]
                            .unsqueeze(2)
                            .to(cache_x.device),
                            cache_x,
                        ],
                        dim=2,
                    )
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x
946
+
947
+
948
def count_conv3d(model):
    """Count CausalConv3d modules in `model` (used to size feature caches)."""
    return sum(1 for m in model.modules() if isinstance(m, CausalConv3d))
954
+
955
+
956
class VideoVAE_(nn.Module):
    """Causal video VAE (Wan 2.1): Encoder3d/Decoder3d with latent
    normalization and chunked, cached encode/decode.

    Temporal compression is 4x: frame 0 is encoded alone, then every
    group of 4 frames maps to one latent frame.
    """

    def __init__(self,
                 dim=96,
                 z_dim=16,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample
        # Decoder upsamples in the reverse order of the encoder.
        self.temperal_upsample = temperal_downsample[::-1]

        # modules; encoder outputs 2*z_dim channels (mu and log_var)
        self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
                                 attn_scales, self.temperal_downsample, dropout)
        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
        self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
                                 attn_scales, self.temperal_upsample, dropout)

    def forward(self, x):
        # NOTE(review): self.encode requires a `scale` argument and returns
        # only `mu`, so this call/unpacking would fail at runtime. Looks
        # like leftover plain-VAE training code — confirm before using.
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        x_recon = self.decode(z)
        return x_recon, mu, log_var

    def encode(self, x, scale):
        """Encode video x chunk by chunk and return normalized latent mu.

        Args:
            x: video tensor (B, 3, T, H, W).
            scale: [shift, multiplier] pair (tensors or scalars) applied as
                mu = (mu - shift) * multiplier.
        """
        self.clear_cache()
        ## cache
        t = x.shape[2]
        # Frame 0 alone, then chunks of 4 frames (4x temporal compression).
        iter_ = 1 + (t - 1) // 4

        for i in range(iter_):
            self._enc_conv_idx = [0]
            if i == 0:
                out = self.encoder(x[:, :, :1, :, :],
                                   feat_cache=self._enc_feat_map,
                                   feat_idx=self._enc_conv_idx)
            else:
                out_ = self.encoder(x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
                                    feat_cache=self._enc_feat_map,
                                    feat_idx=self._enc_conv_idx)
                out = torch.cat([out, out_], 2)
        mu, log_var = self.conv1(out).chunk(2, dim=1)
        if isinstance(scale[0], torch.Tensor):
            # Per-channel normalization statistics.
            scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
            mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
                1, self.z_dim, 1, 1, 1)
        else:
            scale = scale.to(dtype=mu.dtype, device=mu.device)
            mu = (mu - scale[0]) * scale[1]
        return mu

    def decode(self, z, scale):
        """Decode latents z frame by frame after undoing normalization.

        Args:
            z: latent tensor (B, z_dim, T, H, W).
            scale: [shift, multiplier] pair; inverted as z/mult + shift.
        """
        self.clear_cache()
        # z: [b,c,t,h,w]
        if isinstance(scale[0], torch.Tensor):
            scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
            z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
                1, self.z_dim, 1, 1, 1)
        else:
            scale = scale.to(dtype=z.dtype, device=z.device)
            z = z / scale[1] + scale[0]
        iter_ = z.shape[2]
        x = self.conv2(z)
        # Decode one latent frame per iteration, streaming via the cache.
        for i in range(iter_):
            self._conv_idx = [0]
            if i == 0:
                out = self.decoder(x[:, :, i:i + 1, :, :],
                                   feat_cache=self._feat_map,
                                   feat_idx=self._conv_idx)
            else:
                out_ = self.decoder(x[:, :, i:i + 1, :, :],
                                    feat_cache=self._feat_map,
                                    feat_idx=self._conv_idx)
                out = torch.cat([out, out_], 2)  # may add tensor offload
        return out

    def reparameterize(self, mu, log_var):
        # Standard VAE reparameterization trick.
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return eps * std + mu

    def sample(self, imgs, deterministic=False):
        # NOTE(review): self.encode requires a `scale` argument and returns
        # a single tensor, so this unpacking would fail at runtime — same
        # leftover as forward(); confirm before using.
        mu, log_var = self.encode(imgs)
        if deterministic:
            return mu
        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
        return mu + std * torch.randn_like(std)

    def clear_cache(self):
        """Reset the per-conv feature caches used for chunked inference."""
        self._conv_num = count_conv3d(self.decoder)
        self._conv_idx = [0]
        self._feat_map = [None] * self._conv_num
        # cache encode
        self._enc_conv_num = count_conv3d(self.encoder)
        self._enc_conv_idx = [0]
        self._enc_feat_map = [None] * self._enc_conv_num
1061
+
1062
+
1063
+ class WanVideoVAE(nn.Module):
1064
+
1065
    def __init__(self, z_dim=16):
        """Wrapper around VideoVAE_ adding latent normalization and tiling.

        Args:
            z_dim: number of latent channels (16 for Wan 2.1).
        """
        super().__init__()

        # Per-channel latent statistics; latents are normalized as
        # (mu - mean) * (1 / std) by VideoVAE_.encode.
        mean = [
            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
        ]
        std = [
            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
        ]
        self.mean = torch.tensor(mean)
        self.std = torch.tensor(std)
        # scale = [shift, multiplier] as consumed by VideoVAE_.encode/decode.
        self.scale = [self.mean, 1.0 / self.std]

        # init model (frozen, eval mode)
        self.model = VideoVAE_(z_dim=z_dim).eval().requires_grad_(False)
        # one latent pixel corresponds to an 8x8 image patch
        self.upsampling_factor = 8
        self.z_dim = z_dim
1084
+
1085
+
1086
+ def build_1d_mask(self, length, left_bound, right_bound, border_width):
1087
+ x = torch.ones((length,))
1088
+ if not left_bound:
1089
+ x[:border_width] = (torch.arange(border_width) + 1) / border_width
1090
+ if not right_bound:
1091
+ x[-border_width:] = torch.flip((torch.arange(border_width) + 1) / border_width, dims=(0,))
1092
+ return x
1093
+
1094
+
1095
+ def build_mask(self, data, is_bound, border_width):
1096
+ _, _, _, H, W = data.shape
1097
+ h = self.build_1d_mask(H, is_bound[0], is_bound[1], border_width[0])
1098
+ w = self.build_1d_mask(W, is_bound[2], is_bound[3], border_width[1])
1099
+
1100
+ h = repeat(h, "H -> H W", H=H, W=W)
1101
+ w = repeat(w, "W -> H W", H=H, W=W)
1102
+
1103
+ mask = torch.stack([h, w]).min(dim=0).values
1104
+ mask = rearrange(mask, "H W -> 1 1 1 H W")
1105
+ return mask
1106
+
1107
+
1108
    def tiled_decode(self, hidden_states, device, tile_size, tile_stride):
        """Decode latents tile by tile, blending overlaps with ramp masks.

        Args:
            hidden_states: latents (B, z_dim, T, H, W).
            device: computation device for the VAE; accumulation stays on CPU.
            tile_size: (h, w) tile extent in latent pixels.
            tile_stride: (h, w) stride between tiles in latent pixels.

        Returns:
            Decoded video (1, 3, T*4-3, H*8, W*8), clamped to [-1, 1].
        """
        _, _, T, H, W = hidden_states.shape
        size_h, size_w = tile_size
        stride_h, stride_w = tile_stride

        # Split tasks: one (h, h_, w, w_) window per tile; a window is
        # skipped when the previous one already covered the border.
        tasks = []
        for h in range(0, H, stride_h):
            if (h-stride_h >= 0 and h-stride_h+size_h >= H): continue
            for w in range(0, W, stride_w):
                if (w-stride_w >= 0 and w-stride_w+size_w >= W): continue
                h_, w_ = h + size_h, w + size_w
                tasks.append((h, h_, w, w_))

        # Accumulate on CPU to bound GPU memory.
        data_device = "cpu"
        computation_device = device

        # Causal 4x temporal upsample: the first latent frame decodes to 1
        # frame, every later one to 4 -> T*4 - 3 output frames.
        out_T = T * 4 - 3
        weight = torch.zeros((1, 1, out_T, H * self.upsampling_factor, W * self.upsampling_factor), dtype=hidden_states.dtype, device=data_device)
        values = torch.zeros((1, 3, out_T, H * self.upsampling_factor, W * self.upsampling_factor), dtype=hidden_states.dtype, device=data_device)

        for h, h_, w, w_ in tqdm(tasks, desc="VAE decoding"):
            hidden_states_batch = hidden_states[:, :, :, h:h_, w:w_].to(computation_device)
            hidden_states_batch = self.model.decode(hidden_states_batch, self.scale).to(data_device)

            # Ramp mask so overlapping tiles cross-fade instead of seaming.
            mask = self.build_mask(
                hidden_states_batch,
                is_bound=(h==0, h_>=H, w==0, w_>=W),
                border_width=((size_h - stride_h) * self.upsampling_factor, (size_w - stride_w) * self.upsampling_factor)
            ).to(dtype=hidden_states.dtype, device=data_device)

            target_h = h * self.upsampling_factor
            target_w = w * self.upsampling_factor
            values[
                :,
                :,
                :,
                target_h:target_h + hidden_states_batch.shape[3],
                target_w:target_w + hidden_states_batch.shape[4],
            ] += hidden_states_batch * mask
            weight[
                :,
                :,
                :,
                target_h: target_h + hidden_states_batch.shape[3],
                target_w: target_w + hidden_states_batch.shape[4],
            ] += mask
        # Normalize by total blend weight, then clamp to valid pixel range.
        values = values / weight
        values = values.clamp_(-1, 1)
        return values
1158
+
1159
+
1160
    def tiled_encode(self, video, device, tile_size, tile_stride):
        """Encode a video tile by tile, blending overlaps with ramp masks.

        Args:
            video: video tensor (B, 3, T, H, W) in pixel space.
            device: computation device for the VAE; accumulation stays on CPU.
            tile_size: (h, w) tile extent in pixels.
            tile_stride: (h, w) stride between tiles in pixels.

        Returns:
            Latents (1, z_dim, (T+3)//4, H/8, W/8).
        """
        _, _, T, H, W = video.shape
        size_h, size_w = tile_size
        stride_h, stride_w = tile_stride

        # Split tasks: one (h, h_, w, w_) window per tile; a window is
        # skipped when the previous one already covered the border.
        tasks = []
        for h in range(0, H, stride_h):
            if (h-stride_h >= 0 and h-stride_h+size_h >= H): continue
            for w in range(0, W, stride_w):
                if (w-stride_w >= 0 and w-stride_w+size_w >= W): continue
                h_, w_ = h + size_h, w + size_w
                tasks.append((h, h_, w, w_))

        # Accumulate on CPU to bound GPU memory.
        data_device = "cpu"
        computation_device = device

        # Causal 4x temporal compression: frame 0 plus groups of 4.
        out_T = (T + 3) // 4
        weight = torch.zeros((1, 1, out_T, H // self.upsampling_factor, W // self.upsampling_factor), dtype=video.dtype, device=data_device)
        values = torch.zeros((1, self.z_dim, out_T, H // self.upsampling_factor, W // self.upsampling_factor), dtype=video.dtype, device=data_device)

        for h, h_, w, w_ in tqdm(tasks, desc="VAE encoding"):
            hidden_states_batch = video[:, :, :, h:h_, w:w_].to(computation_device)
            hidden_states_batch = self.model.encode(hidden_states_batch, self.scale).to(data_device)

            # Ramp mask so overlapping tiles cross-fade instead of seaming.
            mask = self.build_mask(
                hidden_states_batch,
                is_bound=(h==0, h_>=H, w==0, w_>=W),
                border_width=((size_h - stride_h) // self.upsampling_factor, (size_w - stride_w) // self.upsampling_factor)
            ).to(dtype=video.dtype, device=data_device)

            target_h = h // self.upsampling_factor
            target_w = w // self.upsampling_factor
            values[
                :,
                :,
                :,
                target_h:target_h + hidden_states_batch.shape[3],
                target_w:target_w + hidden_states_batch.shape[4],
            ] += hidden_states_batch * mask
            weight[
                :,
                :,
                :,
                target_h: target_h + hidden_states_batch.shape[3],
                target_w: target_w + hidden_states_batch.shape[4],
            ] += mask
        # Normalize by total blend weight.
        values = values / weight
        return values
1209
+
1210
+
1211
+ def single_encode(self, video, device):
1212
+ video = video.to(device)
1213
+ x = self.model.encode(video, self.scale)
1214
+ return x
1215
+
1216
+
1217
+ def single_decode(self, hidden_state, device):
1218
+ hidden_state = hidden_state.to(device)
1219
+ video = self.model.decode(hidden_state, self.scale)
1220
+ return video.clamp_(-1, 1)
1221
+
1222
+
1223
+ def encode(self, videos, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
1224
+ videos = [video.to("cpu") for video in videos]
1225
+ hidden_states = []
1226
+ for video in videos:
1227
+ video = video.unsqueeze(0)
1228
+ if tiled:
1229
+ tile_size = (tile_size[0] * self.upsampling_factor, tile_size[1] * self.upsampling_factor)
1230
+ tile_stride = (tile_stride[0] * self.upsampling_factor, tile_stride[1] * self.upsampling_factor)
1231
+ hidden_state = self.tiled_encode(video, device, tile_size, tile_stride)
1232
+ else:
1233
+ hidden_state = self.single_encode(video, device)
1234
+ hidden_state = hidden_state.squeeze(0)
1235
+ hidden_states.append(hidden_state)
1236
+ hidden_states = torch.stack(hidden_states)
1237
+ return hidden_states
1238
+
1239
+
1240
+ def decode(self, hidden_states, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
1241
+ hidden_states = [hidden_state.to("cpu") for hidden_state in hidden_states]
1242
+ videos = []
1243
+ for hidden_state in hidden_states:
1244
+ hidden_state = hidden_state.unsqueeze(0)
1245
+ if tiled:
1246
+ video = self.tiled_decode(hidden_state, device, tile_size, tile_stride)
1247
+ else:
1248
+ video = self.single_decode(hidden_state, device)
1249
+ video = video.squeeze(0)
1250
+ videos.append(video)
1251
+ videos = torch.stack(videos)
1252
+ return videos
1253
+
1254
+
1255
+ @staticmethod
1256
+ def state_dict_converter():
1257
+ return WanVideoVAEStateDictConverter()
1258
+
1259
+
1260
+ class WanVideoVAEStateDictConverter:
1261
+
1262
+ def __init__(self):
1263
+ pass
1264
+
1265
+ def from_civitai(self, state_dict):
1266
+ state_dict_ = {}
1267
+ if 'model_state' in state_dict:
1268
+ state_dict = state_dict['model_state']
1269
+ for name in state_dict:
1270
+ state_dict_['model.' + name] = state_dict[name]
1271
+ return state_dict_
1272
+
1273
+
1274
+ class VideoVAE38_(VideoVAE_):
1275
+
1276
+ def __init__(self,
1277
+ dim=160,
1278
+ z_dim=48,
1279
+ dec_dim=256,
1280
+ dim_mult=[1, 2, 4, 4],
1281
+ num_res_blocks=2,
1282
+ attn_scales=[],
1283
+ temperal_downsample=[False, True, True],
1284
+ dropout=0.0):
1285
+ super(VideoVAE_, self).__init__()
1286
+ self.dim = dim
1287
+ self.z_dim = z_dim
1288
+ self.dim_mult = dim_mult
1289
+ self.num_res_blocks = num_res_blocks
1290
+ self.attn_scales = attn_scales
1291
+ self.temperal_downsample = temperal_downsample
1292
+ self.temperal_upsample = temperal_downsample[::-1]
1293
+
1294
+ # modules
1295
+ self.encoder = Encoder3d_38(dim, z_dim * 2, dim_mult, num_res_blocks,
1296
+ attn_scales, self.temperal_downsample, dropout)
1297
+ self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
1298
+ self.conv2 = CausalConv3d(z_dim, z_dim, 1)
1299
+ self.decoder = Decoder3d_38(dec_dim, z_dim, dim_mult, num_res_blocks,
1300
+ attn_scales, self.temperal_upsample, dropout)
1301
+
1302
+
1303
+ def encode(self, x, scale):
1304
+ self.clear_cache()
1305
+ x = patchify(x, patch_size=2)
1306
+ t = x.shape[2]
1307
+ iter_ = 1 + (t - 1) // 4
1308
+ for i in range(iter_):
1309
+ self._enc_conv_idx = [0]
1310
+ if i == 0:
1311
+ out = self.encoder(x[:, :, :1, :, :],
1312
+ feat_cache=self._enc_feat_map,
1313
+ feat_idx=self._enc_conv_idx)
1314
+ else:
1315
+ out_ = self.encoder(x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
1316
+ feat_cache=self._enc_feat_map,
1317
+ feat_idx=self._enc_conv_idx)
1318
+ out = torch.cat([out, out_], 2)
1319
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
1320
+ if isinstance(scale[0], torch.Tensor):
1321
+ scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
1322
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
1323
+ 1, self.z_dim, 1, 1, 1)
1324
+ else:
1325
+ scale = scale.to(dtype=mu.dtype, device=mu.device)
1326
+ mu = (mu - scale[0]) * scale[1]
1327
+ self.clear_cache()
1328
+ return mu
1329
+
1330
+
1331
+ def decode(self, z, scale):
1332
+ self.clear_cache()
1333
+ if isinstance(scale[0], torch.Tensor):
1334
+ scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
1335
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
1336
+ 1, self.z_dim, 1, 1, 1)
1337
+ else:
1338
+ scale = scale.to(dtype=z.dtype, device=z.device)
1339
+ z = z / scale[1] + scale[0]
1340
+ iter_ = z.shape[2]
1341
+ x = self.conv2(z)
1342
+ for i in range(iter_):
1343
+ self._conv_idx = [0]
1344
+ if i == 0:
1345
+ out = self.decoder(x[:, :, i:i + 1, :, :],
1346
+ feat_cache=self._feat_map,
1347
+ feat_idx=self._conv_idx,
1348
+ first_chunk=True)
1349
+ else:
1350
+ out_ = self.decoder(x[:, :, i:i + 1, :, :],
1351
+ feat_cache=self._feat_map,
1352
+ feat_idx=self._conv_idx)
1353
+ out = torch.cat([out, out_], 2)
1354
+ out = unpatchify(out, patch_size=2)
1355
+ self.clear_cache()
1356
+ return out
1357
+
1358
+
1359
+ class WanVideoVAE38(WanVideoVAE):
1360
+
1361
+ def __init__(self, z_dim=48, dim=160):
1362
+ super(WanVideoVAE, self).__init__()
1363
+
1364
+ mean = [
1365
+ -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
1366
+ -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
1367
+ -0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502,
1368
+ -0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.1230,
1369
+ -0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.0520, 0.3748,
1370
+ 0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667
1371
+ ]
1372
+ std = [
1373
+ 0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013,
1374
+ 0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978,
1375
+ 0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659,
1376
+ 0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093,
1377
+ 0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887,
1378
+ 0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744
1379
+ ]
1380
+ self.mean = torch.tensor(mean)
1381
+ self.std = torch.tensor(std)
1382
+ self.scale = [self.mean, 1.0 / self.std]
1383
+
1384
+ # init model
1385
+ self.model = VideoVAE38_(z_dim=z_dim, dim=dim).eval().requires_grad_(False)
1386
+ self.upsampling_factor = 16
1387
+ self.z_dim = z_dim
1388
+
1389
+
1390
+ # ─────────────────────────────────────────────────────────────────────────────
1391
+ # Diffusers-compatible wrapper (formerly kiwi_vae.py)
1392
+ # ─────────────────────────────────��───────────────────────────────────────────
1393
+
1394
+ @dataclass
1395
+ class LatentDist:
1396
+ mu: torch.Tensor
1397
+
1398
+ def sample(self):
1399
+ return self.mu
1400
+
1401
+
1402
+ @dataclass
1403
+ class EncoderOutput:
1404
+ latent_dist: LatentDist
1405
+
1406
+
1407
+ @dataclass
1408
+ class DecoderOutput:
1409
+ sample: torch.Tensor
1410
+
1411
+
1412
+ class VAE(VideoVAE_, ModelMixin, ConfigMixin):
1413
+ """
1414
+ Diffusers-compatible VAE wrapper around the original Wan VideoVAE.
1415
+ Loads weights directly from diffusion_pytorch_model.safetensors.
1416
+ """
1417
+
1418
+ @register_to_config
1419
+ def __init__(
1420
+ self,
1421
+ z_dim: int = 48,
1422
+ dim: int = 160,
1423
+ dim_mult: List[int] = [1, 2, 4, 4],
1424
+ num_res_blocks: int = 2,
1425
+ attn_scales: List[float] = [],
1426
+ temperal_downsample: List[bool] = [False, True, True],
1427
+ dropout: float = 0.0,
1428
+ vae_pth: str = "wan_vae.pth",
1429
+ latents_mean: Optional[List[float]] = None,
1430
+ latents_std: Optional[List[float]] = None,
1431
+ ):
1432
+ # Build the actual VAE backbone so diffusers can load weights without mismatch.
1433
+ if z_dim == 48:
1434
+ VideoVAE38_.__init__(
1435
+ self,
1436
+ dim=dim,
1437
+ z_dim=z_dim,
1438
+ dim_mult=dim_mult,
1439
+ num_res_blocks=num_res_blocks,
1440
+ attn_scales=attn_scales,
1441
+ temperal_downsample=temperal_downsample,
1442
+ dropout=dropout,
1443
+ )
1444
+ self._use_38 = True
1445
+ self.upsampling_factor = 16
1446
+ else:
1447
+ VideoVAE_.__init__(
1448
+ self,
1449
+ dim=dim,
1450
+ z_dim=z_dim,
1451
+ dim_mult=dim_mult,
1452
+ num_res_blocks=num_res_blocks,
1453
+ attn_scales=attn_scales,
1454
+ temperal_downsample=temperal_downsample,
1455
+ dropout=dropout,
1456
+ )
1457
+ self._use_38 = False
1458
+ self.upsampling_factor = 8
1459
+
1460
+ # Keep for config compatibility; weights are loaded by diffusers.
1461
+ self._vae_pth = vae_pth
1462
+ self.z_dim = z_dim
1463
+
1464
+ # Build latent normalization scale: [mean, 1/std]
1465
+ if latents_mean is not None and latents_std is not None:
1466
+ mean = torch.tensor(latents_mean)
1467
+ std = torch.tensor(latents_std)
1468
+ self._scale = [mean, 1.0 / std]
1469
+ else:
1470
+ self._scale = [torch.zeros(z_dim), torch.ones(z_dim)]
1471
+
1472
+ def encode(self, x):
1473
+ x = x.to(dtype=next(self.parameters()).dtype)
1474
+ if self._use_38:
1475
+ mu = VideoVAE38_.encode(self, x, self._scale)
1476
+ else:
1477
+ mu = VideoVAE_.encode(self, x, self._scale)
1478
+ return EncoderOutput(latent_dist=LatentDist(mu=mu))
1479
+
1480
+ def decode(self, z):
1481
+ z = z.to(dtype=next(self.parameters()).dtype)
1482
+ if self._use_38:
1483
+ out = VideoVAE38_.decode(self, z, self._scale)
1484
+ else:
1485
+ out = VideoVAE_.decode(self, z, self._scale)
1486
+ return DecoderOutput(sample=out)
models/rife/.DS_Store ADDED
Binary file (6.15 kB). View file
 
models/rife/._.DS_Store ADDED
Binary file (212 Bytes). View file
 
models/rife/._IFNet_HDv3.cpython-311.pyc ADDED
Binary file (212 Bytes). View file
 
models/rife/._IFNet_HDv3.py ADDED
Binary file (212 Bytes). View file
 
models/rife/._RIFE_HDv3.cpython-311.pyc ADDED
Binary file (212 Bytes). View file
 
models/rife/._RIFE_HDv3.py ADDED
Binary file (576 Bytes). View file
 
models/rife/._RIFEv4.26_0921 ADDED
Binary file (212 Bytes). View file
 
models/rife/.___pycache__ ADDED
Binary file (212 Bytes). View file
 
models/rife/._flownet.pkl ADDED
Binary file (312 Bytes). View file
 
models/rife/._refine.py ADDED
Binary file (212 Bytes). View file
 
models/rife/IFNet_HDv3.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from model.warplayer import warp
5
+ # from train_log.refine import *
6
+
7
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
8
+
9
+ def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
10
+ return nn.Sequential(
11
+ nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
12
+ padding=padding, dilation=dilation, bias=True),
13
+ nn.LeakyReLU(0.2, True)
14
+ )
15
+
16
+ def conv_bn(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
17
+ return nn.Sequential(
18
+ nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
19
+ padding=padding, dilation=dilation, bias=False),
20
+ nn.BatchNorm2d(out_planes),
21
+ nn.LeakyReLU(0.2, True)
22
+ )
23
+
24
+ class Head(nn.Module):
25
+ def __init__(self):
26
+ super(Head, self).__init__()
27
+ self.cnn0 = nn.Conv2d(3, 16, 3, 2, 1)
28
+ self.cnn1 = nn.Conv2d(16, 16, 3, 1, 1)
29
+ self.cnn2 = nn.Conv2d(16, 16, 3, 1, 1)
30
+ self.cnn3 = nn.ConvTranspose2d(16, 4, 4, 2, 1)
31
+ self.relu = nn.LeakyReLU(0.2, True)
32
+
33
+ def forward(self, x, feat=False):
34
+ x0 = self.cnn0(x)
35
+ x = self.relu(x0)
36
+ x1 = self.cnn1(x)
37
+ x = self.relu(x1)
38
+ x2 = self.cnn2(x)
39
+ x = self.relu(x2)
40
+ x3 = self.cnn3(x)
41
+ if feat:
42
+ return [x0, x1, x2, x3]
43
+ return x3
44
+
45
+ class ResConv(nn.Module):
46
+ def __init__(self, c, dilation=1):
47
+ super(ResConv, self).__init__()
48
+ self.conv = nn.Conv2d(c, c, 3, 1, dilation, dilation=dilation, groups=1\
49
+ )
50
+ self.beta = nn.Parameter(torch.ones((1, c, 1, 1)), requires_grad=True)
51
+ self.relu = nn.LeakyReLU(0.2, True)
52
+
53
+ def forward(self, x):
54
+ return self.relu(self.conv(x) * self.beta + x)
55
+
56
+ class IFBlock(nn.Module):
57
+ def __init__(self, in_planes, c=64):
58
+ super(IFBlock, self).__init__()
59
+ self.conv0 = nn.Sequential(
60
+ conv(in_planes, c//2, 3, 2, 1),
61
+ conv(c//2, c, 3, 2, 1),
62
+ )
63
+ self.convblock = nn.Sequential(
64
+ ResConv(c),
65
+ ResConv(c),
66
+ ResConv(c),
67
+ ResConv(c),
68
+ ResConv(c),
69
+ ResConv(c),
70
+ ResConv(c),
71
+ ResConv(c),
72
+ )
73
+ self.lastconv = nn.Sequential(
74
+ nn.ConvTranspose2d(c, 4*13, 4, 2, 1),
75
+ nn.PixelShuffle(2)
76
+ )
77
+
78
+ def forward(self, x, flow=None, scale=1):
79
+ x = F.interpolate(x, scale_factor= 1. / scale, mode="bilinear", align_corners=False)
80
+ if flow is not None:
81
+ flow = F.interpolate(flow, scale_factor= 1. / scale, mode="bilinear", align_corners=False) * 1. / scale
82
+ x = torch.cat((x, flow), 1)
83
+ feat = self.conv0(x)
84
+ feat = self.convblock(feat)
85
+ tmp = self.lastconv(feat)
86
+ tmp = F.interpolate(tmp, scale_factor=scale, mode="bilinear", align_corners=False)
87
+ flow = tmp[:, :4] * scale
88
+ mask = tmp[:, 4:5]
89
+ feat = tmp[:, 5:]
90
+ return flow, mask, feat
91
+
92
+ class IFNet(nn.Module):
93
+ def __init__(self):
94
+ super(IFNet, self).__init__()
95
+ self.block0 = IFBlock(7+8, c=192)
96
+ self.block1 = IFBlock(8+4+8+8, c=128)
97
+ self.block2 = IFBlock(8+4+8+8, c=96)
98
+ self.block3 = IFBlock(8+4+8+8, c=64)
99
+ self.block4 = IFBlock(8+4+8+8, c=32)
100
+ self.encode = Head()
101
+
102
+ # not used during inference
103
+ '''
104
+ self.teacher = IFBlock(8+4+8+3+8, c=64)
105
+ self.caltime = nn.Sequential(
106
+ nn.Conv2d(16+9, 8, 3, 2, 1),
107
+ nn.LeakyReLU(0.2, True),
108
+ nn.Conv2d(32, 64, 3, 2, 1),
109
+ nn.LeakyReLU(0.2, True),
110
+ nn.Conv2d(64, 64, 3, 1, 1),
111
+ nn.LeakyReLU(0.2, True),
112
+ nn.Conv2d(64, 64, 3, 1, 1),
113
+ nn.LeakyReLU(0.2, True),
114
+ nn.Conv2d(64, 1, 3, 1, 1),
115
+ nn.Sigmoid()
116
+ )
117
+ '''
118
+
119
+ def forward(self, x, timestep=0.5, scale_list=[8, 4, 2, 1], training=False, fastmode=True, ensemble=False):
120
+ if training == False:
121
+ channel = x.shape[1] // 2
122
+ img0 = x[:, :channel]
123
+ img1 = x[:, channel:]
124
+ if not torch.is_tensor(timestep):
125
+ timestep = (x[:, :1].clone() * 0 + 1) * timestep
126
+ else:
127
+ timestep = timestep.repeat(1, 1, img0.shape[2], img0.shape[3])
128
+ f0 = self.encode(img0[:, :3])
129
+ f1 = self.encode(img1[:, :3])
130
+ flow_list = []
131
+ merged = []
132
+ mask_list = []
133
+ warped_img0 = img0
134
+ warped_img1 = img1
135
+ flow = None
136
+ mask = None
137
+ loss_cons = 0
138
+ block = [self.block0, self.block1, self.block2, self.block3, self.block4]
139
+ for i in range(5):
140
+ if flow is None:
141
+ flow, mask, feat = block[i](torch.cat((img0[:, :3], img1[:, :3], f0, f1, timestep), 1), None, scale=scale_list[i])
142
+ if ensemble:
143
+ print("warning: ensemble is not supported since RIFEv4.21")
144
+ else:
145
+ wf0 = warp(f0, flow[:, :2])
146
+ wf1 = warp(f1, flow[:, 2:4])
147
+ fd, m0, feat = block[i](torch.cat((warped_img0[:, :3], warped_img1[:, :3], wf0, wf1, timestep, mask, feat), 1), flow, scale=scale_list[i])
148
+ if ensemble:
149
+ print("warning: ensemble is not supported since RIFEv4.21")
150
+ else:
151
+ mask = m0
152
+ flow = flow + fd
153
+ mask_list.append(mask)
154
+ flow_list.append(flow)
155
+ warped_img0 = warp(img0, flow[:, :2])
156
+ warped_img1 = warp(img1, flow[:, 2:4])
157
+ merged.append((warped_img0, warped_img1))
158
+ mask = torch.sigmoid(mask)
159
+ merged[4] = (warped_img0 * mask + warped_img1 * (1 - mask))
160
+ if not fastmode:
161
+ print('contextnet is removed')
162
+ '''
163
+ c0 = self.contextnet(img0, flow[:, :2])
164
+ c1 = self.contextnet(img1, flow[:, 2:4])
165
+ tmp = self.unet(img0, img1, warped_img0, warped_img1, mask, flow, c0, c1)
166
+ res = tmp[:, :3] * 2 - 1
167
+ merged[4] = torch.clamp(merged[4] + res, 0, 1)
168
+ '''
169
+ return flow_list, mask_list[4], merged
models/rife/RIFE_HDv3.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from torch.optim import AdamW
5
+ import torch.optim as optim
6
+ import itertools
7
+ from model.warplayer import warp
8
+ from torch.nn.parallel import DistributedDataParallel as DDP
9
+ from IFNet_HDv3 import *
10
+ import torch.nn.functional as F
11
+ from model.loss import *
12
+
13
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
+
15
+ class Model:
16
+ def __init__(self, local_rank=-1):
17
+ self.flownet = IFNet()
18
+ self.device()
19
+ self.optimG = AdamW(self.flownet.parameters(), lr=1e-6, weight_decay=1e-4)
20
+ self.epe = EPE()
21
+ self.version = 4.25
22
+ # self.vgg = VGGPerceptualLoss().to(device)
23
+ self.sobel = SOBEL()
24
+ if local_rank != -1:
25
+ self.flownet = DDP(self.flownet, device_ids=[local_rank], output_device=local_rank)
26
+
27
+ def train(self):
28
+ self.flownet.train()
29
+
30
+ def eval(self):
31
+ self.flownet.eval()
32
+
33
+ def device(self):
34
+ self.flownet.to(device)
35
+
36
+ def load_model(self, path, rank=0):
37
+ def convert(param):
38
+ if rank == -1:
39
+ return {
40
+ k.replace("module.", ""): v
41
+ for k, v in param.items()
42
+ if "module." in k
43
+ }
44
+ else:
45
+ return param
46
+ if rank <= 0:
47
+ if torch.cuda.is_available():
48
+ self.flownet.load_state_dict(convert(torch.load('{}/flownet.pkl'.format(path))), False)
49
+ else:
50
+ self.flownet.load_state_dict(convert(torch.load('{}/flownet.pkl'.format(path), map_location ='cpu')), False)
51
+
52
+ def save_model(self, path, rank=0):
53
+ if rank == 0:
54
+ torch.save(self.flownet.state_dict(),'{}/flownet.pkl'.format(path))
55
+
56
+ def inference(self, img0, img1, timestep=0.5, scale=1.0):
57
+ imgs = torch.cat((img0, img1), 1)
58
+ scale_list = [16/scale, 8/scale, 4/scale, 2/scale, 1/scale]
59
+ flow, mask, merged = self.flownet(imgs, timestep, scale_list)
60
+ return merged[-1]
61
+
62
+ def update(self, imgs, gt, learning_rate=0, mul=1, training=True, flow_gt=None):
63
+ for param_group in self.optimG.param_groups:
64
+ param_group['lr'] = learning_rate
65
+ img0 = imgs[:, :3]
66
+ img1 = imgs[:, 3:]
67
+ if training:
68
+ self.train()
69
+ else:
70
+ self.eval()
71
+ scale = [16, 8, 4, 2, 1]
72
+ flow, mask, merged = self.flownet(torch.cat((imgs, gt), 1), scale=scale, training=training)
73
+ loss_l1 = (merged[-1] - gt).abs().mean()
74
+ loss_smooth = self.sobel(flow[-1], flow[-1]*0).mean()
75
+ # loss_vgg = self.vgg(merged[-1], gt)
76
+ if training:
77
+ self.optimG.zero_grad()
78
+ loss_G = loss_l1 + loss_cons + loss_smooth * 0.1
79
+ loss_G.backward()
80
+ self.optimG.step()
81
+ else:
82
+ flow_teacher = flow[2]
83
+ return merged[-1], {
84
+ 'mask': mask,
85
+ 'flow': flow[-1][:, :2],
86
+ 'loss_l1': loss_l1,
87
+ 'loss_cons': loss_cons,
88
+ 'loss_smooth': loss_smooth,
89
+ }
models/rife/refine.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from torch.optim import AdamW
5
+ import torch.optim as optim
6
+ import itertools
7
+ from model.warplayer import warp
8
+ from torch.nn.parallel import DistributedDataParallel as DDP
9
+ import torch.nn.functional as F
10
+
11
+ device = torch.device("cuda")
12
+
13
+ def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
14
+ return nn.Sequential(
15
+ nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
16
+ padding=padding, dilation=dilation, bias=True),
17
+ nn.LeakyReLU(0.2, True)
18
+ )
19
+
20
+ def conv_woact(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
21
+ return nn.Sequential(
22
+ nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
23
+ padding=padding, dilation=dilation, bias=True),
24
+ )
25
+
26
+ def deconv(in_planes, out_planes, kernel_size=4, stride=2, padding=1):
27
+ return nn.Sequential(
28
+ torch.nn.ConvTranspose2d(in_channels=in_planes, out_channels=out_planes, kernel_size=4, stride=2, padding=1, bias=True),
29
+ nn.LeakyReLU(0.2, True)
30
+ )
31
+
32
+ class Conv2(nn.Module):
33
+ def __init__(self, in_planes, out_planes, stride=2):
34
+ super(Conv2, self).__init__()
35
+ self.conv1 = conv(in_planes, out_planes, 3, stride, 1)
36
+ self.conv2 = conv(out_planes, out_planes, 3, 1, 1)
37
+
38
+ def forward(self, x):
39
+ x = self.conv1(x)
40
+ x = self.conv2(x)
41
+ return x
42
+
43
+ c = 16
44
+ class Contextnet(nn.Module):
45
+ def __init__(self):
46
+ super(Contextnet, self).__init__()
47
+ self.conv1 = Conv2(3, c)
48
+ self.conv2 = Conv2(c, 2*c)
49
+ self.conv3 = Conv2(2*c, 4*c)
50
+ self.conv4 = Conv2(4*c, 8*c)
51
+
52
+ def forward(self, x, flow):
53
+ x = self.conv1(x)
54
+ flow = F.interpolate(flow, scale_factor=0.5, mode="bilinear", align_corners=False) * 0.5
55
+ f1 = warp(x, flow)
56
+ x = self.conv2(x)
57
+ flow = F.interpolate(flow, scale_factor=0.5, mode="bilinear", align_corners=False) * 0.5
58
+ f2 = warp(x, flow)
59
+ x = self.conv3(x)
60
+ flow = F.interpolate(flow, scale_factor=0.5, mode="bilinear", align_corners=False) * 0.5
61
+ f3 = warp(x, flow)
62
+ x = self.conv4(x)
63
+ flow = F.interpolate(flow, scale_factor=0.5, mode="bilinear", align_corners=False) * 0.5
64
+ f4 = warp(x, flow)
65
+ return [f1, f2, f3, f4]
66
+
67
+ class Unet(nn.Module):
68
+ def __init__(self):
69
+ super(Unet, self).__init__()
70
+ self.down0 = Conv2(17, 2*c)
71
+ self.down1 = Conv2(4*c, 4*c)
72
+ self.down2 = Conv2(8*c, 8*c)
73
+ self.down3 = Conv2(16*c, 16*c)
74
+ self.up0 = deconv(32*c, 8*c)
75
+ self.up1 = deconv(16*c, 4*c)
76
+ self.up2 = deconv(8*c, 2*c)
77
+ self.up3 = deconv(4*c, c)
78
+ self.conv = nn.Conv2d(c, 3, 3, 1, 1)
79
+
80
+ def forward(self, img0, img1, warped_img0, warped_img1, mask, flow, c0, c1):
81
+ s0 = self.down0(torch.cat((img0, img1, warped_img0, warped_img1, mask, flow), 1))
82
+ s1 = self.down1(torch.cat((s0, c0[0], c1[0]), 1))
83
+ s2 = self.down2(torch.cat((s1, c0[1], c1[1]), 1))
84
+ s3 = self.down3(torch.cat((s2, c0[2], c1[2]), 1))
85
+ x = self.up0(torch.cat((s3, c0[3], c1[3]), 1))
86
+ x = self.up1(torch.cat((x, s2), 1))
87
+ x = self.up2(torch.cat((x, s1), 1))
88
+ x = self.up3(torch.cat((x, s0), 1))
89
+ x = self.conv(x)
90
+ return torch.sigmoid(x)
models/seedvr2/.validation_cache.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seedvr2_ema_3b_fp8_e4m3fn.safetensors": {
3
+ "size": 3391544696,
4
+ "mtime": 1772206219.677575,
5
+ "hash": "3bf1e43ebedd570e7e7a0b1b60d6a02e105978f505c8128a241cde99a8240cff"
6
+ },
7
+ "ema_vae_fp16.safetensors": {
8
+ "size": 501324814,
9
+ "mtime": 1772206245.5699334,
10
+ "hash": "20678548f420d98d26f11442d3528f8b8c94e57ee046ef93dbb7633da8612ca1"
11
+ }
12
+ }
models/seedvr2/config.json ADDED
File without changes
models/voice-presets/bruce.wav ADDED
Binary file (14.3 kB). View file
 
models/voice-presets/christian.wav ADDED
Binary file (80.4 kB). View file
 
models/voice-presets/hal.wav ADDED
Binary file (50.9 kB). View file
 
models/voice-presets/heath.wav ADDED
Binary file (43.7 kB). View file
 
models/voice-presets/ian.wav ADDED
Binary file (63.4 kB). View file
 
models/voice-presets/johnny.wav ADDED
Binary file (17.4 kB). View file
 
models/voice-presets/patrick.wav ADDED
Binary file (16.3 kB). View file
 
models/voice-presets/robert.wav ADDED
Binary file (37.2 kB). View file
 
models/voice-presets/russel.wav ADDED
Binary file (31.2 kB). View file
 
models/voice-presets/sean.wav ADDED
Binary file (73.9 kB). View file
 
models/voice-presets/sigourney.wav ADDED
Binary file (15.4 kB). View file
 
models/z-image-ControlNet-Union/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "ZImageControlNetModel",
3
+ "_diffusers_version": "0.36.0.dev0",
4
+ "add_control_noise_refiner": "control_noise_refiner",
5
+ "all_f_patch_size": [
6
+ 1
7
+ ],
8
+ "all_patch_size": [
9
+ 2
10
+ ],
11
+ "control_in_dim": 33,
12
+ "control_layers_places": [
13
+ 0,
14
+ 2,
15
+ 4,
16
+ 6,
17
+ 8,
18
+ 10,
19
+ 12,
20
+ 14,
21
+ 16,
22
+ 18,
23
+ 20,
24
+ 22,
25
+ 24,
26
+ 26,
27
+ 28
28
+ ],
29
+ "control_refiner_layers_places": [
30
+ 0,
31
+ 1
32
+ ],
33
+ "dim": 3840,
34
+ "n_heads": 30,
35
+ "n_kv_heads": 30,
36
+ "n_refiner_layers": 2,
37
+ "norm_eps": 1e-05,
38
+ "qk_norm": true
39
+ }