Add files using upload-large-folder tool
Browse files- models/ace-step/.gitattributes +38 -0
- models/ace-step/README.md +99 -0
- models/ace-step/acestep-5Hz-lm-1.7B/added_tokens.json +0 -0
- models/ace-step/acestep-5Hz-lm-1.7B/chat_template.jinja +89 -0
- models/ace-step/acestep-5Hz-lm-1.7B/config.json +61 -0
- models/ace-step/acestep-5Hz-lm-1.7B/merges.txt +0 -0
- models/ace-step/acestep-5Hz-lm-1.7B/special_tokens_map.json +0 -0
- models/ace-step/acestep-5Hz-lm-1.7B/vocab.json +0 -0
- models/ace-step/acestep-5Hz-lm-4B/added_tokens.json +0 -0
- models/ace-step/acestep-5Hz-lm-4B/config.json +69 -0
- models/ace-step/acestep-5Hz-lm-4B/merges.txt +0 -0
- models/ace-step/acestep-5Hz-lm-4B/model.safetensors.index.json +405 -0
- models/ace-step/config.json +82 -0
- models/dettaglio-restyle/styles.json +1352 -0
- models/hunyuan-foley/config_xl.yaml +48 -0
- models/kiwi-edit/.gitattributes +36 -0
- models/kiwi-edit/README.md +48 -0
- models/kiwi-edit/__init__.py +4 -0
- models/kiwi-edit/conditional_embedder.py +33 -0
- models/kiwi-edit/mllm_encoder.py +0 -0
- models/kiwi-edit/model_index.json +35 -0
- models/kiwi-edit/pipeline_kiwi_edit.py +510 -0
- models/kiwi-edit/wan_video_vae.py +1486 -0
- models/rife/.DS_Store +0 -0
- models/rife/._.DS_Store +0 -0
- models/rife/._IFNet_HDv3.cpython-311.pyc +0 -0
- models/rife/._IFNet_HDv3.py +0 -0
- models/rife/._RIFE_HDv3.cpython-311.pyc +0 -0
- models/rife/._RIFE_HDv3.py +0 -0
- models/rife/._RIFEv4.26_0921 +0 -0
- models/rife/.___pycache__ +0 -0
- models/rife/._flownet.pkl +0 -0
- models/rife/._refine.py +0 -0
- models/rife/IFNet_HDv3.py +169 -0
- models/rife/RIFE_HDv3.py +89 -0
- models/rife/refine.py +90 -0
- models/seedvr2/.validation_cache.json +12 -0
- models/seedvr2/config.json +0 -0
- models/voice-presets/bruce.wav +0 -0
- models/voice-presets/christian.wav +0 -0
- models/voice-presets/hal.wav +0 -0
- models/voice-presets/heath.wav +0 -0
- models/voice-presets/ian.wav +0 -0
- models/voice-presets/johnny.wav +0 -0
- models/voice-presets/patrick.wav +0 -0
- models/voice-presets/robert.wav +0 -0
- models/voice-presets/russel.wav +0 -0
- models/voice-presets/sean.wav +0 -0
- models/voice-presets/sigourney.wav +0 -0
- models/z-image-ControlNet-Union/config.json +39 -0
models/ace-step/.gitattributes
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
acestep-5Hz-lm-1.7B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
acestep-5Hz-lm-1.7B/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
Qwen3-Embedding-0.6B/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
models/ace-step/README.md
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: transformers
|
| 3 |
+
license: mit
|
| 4 |
+
pipeline_tag: text-to-audio
|
| 5 |
+
tags:
|
| 6 |
+
- audio
|
| 7 |
+
- music
|
| 8 |
+
- text2music
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
<h1 align="center">ACE-Step 1.5</h1>
|
| 12 |
+
<h1 align="center">Pushing the Boundaries of Open-Source Music Generation</h1>
|
| 13 |
+
<p align="center">
|
| 14 |
+
<a href="https://ace-step.github.io/ace-step-v1.5.github.io/">Project</a> |
|
| 15 |
+
<a href="https://huggingface.co/collections/ACE-Step/ace-step-15">Hugging Face</a> |
|
| 16 |
+
<a href="https://modelscope.cn/models/ACE-Step/Ace-Step1.5">ModelScope</a> |
|
| 17 |
+
<a href="https://huggingface.co/spaces/ACE-Step/Ace-Step-v1.5">Space Demo</a> |
|
| 18 |
+
<a href="https://discord.gg/PeWDxrkdj7">Discord</a>
|
| 19 |
+
<a href="https://arxiv.org/abs/2602.00744">Tech Report</a>
|
| 20 |
+
</p>
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+

|
| 24 |
+
|
| 25 |
+
## Model Details
|
| 26 |
+
|
| 27 |
+
🚀 **ACE-Step v1.5** is a highly efficient open-source music foundation model designed to bring commercial-grade music generation to consumer hardware.
|
| 28 |
+
|
| 29 |
+
### Key Features
|
| 30 |
+
|
| 31 |
+
* **💰 Commercial-Ready:** Unlike many models trained on ambiguous datasets, ACE-Step v1.5 is designed for creators. You can strictly use the generated music for **commercial purposes**.
|
| 32 |
+
* **📚 Safe & Robust Training Data:** The model is trained on a massive, legally compliant dataset consisting of:
|
| 33 |
+
* **Licensed Data:** Professionally licensed music tracks.
|
| 34 |
+
* **Royalty-Free / No-Copyright Data:** A vast collection of public domain and royalty-free music.
|
| 35 |
+
* **Synthetic Data:** High-quality audio generated via advanced MIDI-to-Audio conversion.
|
| 36 |
+
* **⚡ Extreme Speed:** Generates a full song in under 2 seconds on an A100 and under 10 seconds on an RTX 3090.
|
| 37 |
+
* **🖥️ Consumer Hardware Friendly:** Runs locally with less than 4GB of VRAM.
|
| 38 |
+
|
| 39 |
+
### Technical Capabilities
|
| 40 |
+
|
| 41 |
+
🌉 At its core lies a novel hybrid architecture where the Language Model (LM) functions as an omni-capable planner: it transforms simple user queries into comprehensive song blueprints—scaling from short loops to 10-minute compositions—while synthesizing metadata, lyrics, and captions via Chain-of-Thought to guide the Diffusion Transformer (DiT). ⚡ Uniquely, this alignment is achieved through intrinsic reinforcement learning relying solely on the model's internal mechanisms, thereby eliminating the biases inherent in external reward models or human preferences. 🎚️
|
| 42 |
+
|
| 43 |
+
🔮 Beyond standard synthesis, ACE-Step v1.5 unifies precise stylistic control with versatile editing capabilities—such as cover generation, repainting, and vocal-to-BGM conversion—while maintaining strict adherence to prompts across 50+ languages. This paves the way for powerful tools that seamlessly integrate into the creative workflows of music artists, producers, and content creators. 🎸
|
| 44 |
+
|
| 45 |
+
- **Developed by:** [ACE-STEP]
|
| 46 |
+
- **Model type:** [Text2Music]
|
| 47 |
+
- **Language(s):** [50+ languages]
|
| 48 |
+
- **License:** [MIT]
|
| 49 |
+
|
| 50 |
+
## Evaluation
|
| 51 |
+
|
| 52 |
+

|
| 53 |
+
|
| 54 |
+
## 🏗️ Architecture
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+

|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
## 🦁 Model Zoo
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+

|
| 64 |
+
|
| 65 |
+
### DiT Models
|
| 66 |
+
|
| 67 |
+
| DiT Model | Pre-Training | SFT | RL | CFG | Step | Refer audio | Text2Music | Cover | Repaint | Extract | Lego | Complete | Quality | Diversity | Fine-Tunability | Hugging Face |
|
| 68 |
+
|-----------|:------------:|:---:|:--:|:---:|:----:|:-----------:|:----------:|:-----:|:-------:|:-------:|:----:|:--------:|:-------:|:---------:|:---------------:|--------------|
|
| 69 |
+
| `acestep-v15-base` | ✅ | ❌ | ❌ | ✅ | 50 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | High | Easy | [Link](https://huggingface.co/ACE-Step/acestep-v15-base) |
|
| 70 |
+
| `acestep-v15-sft` | ✅ | ✅ | ❌ | ✅ | 50 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | High | Medium | Easy | [Link](https://huggingface.co/ACE-Step/acestep-v15-sft) |
|
| 71 |
+
| `acestep-v15-turbo` | ✅ | ✅ | ❌ | ❌ | 8 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | Very High | Medium | Medium | [Link](https://huggingface.co/ACE-Step/Ace-Step1.5) |
|
| 72 |
+
| `acestep-v15-turbo-rl` | ✅ | ✅ | ✅ | ❌ | 8 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | Very High | Medium | Medium | To be released |
|
| 73 |
+
|
| 74 |
+
### LM Models
|
| 75 |
+
|
| 76 |
+
| LM Model | Pretrain from | Pre-Training | SFT | RL | CoT metas | Query rewrite | Audio Understanding | Composition Capability | Copy Melody | Hugging Face |
|
| 77 |
+
|----------|---------------|:------------:|:---:|:--:|:---------:|:-------------:|:-------------------:|:----------------------:|:-----------:|--------------|
|
| 78 |
+
| `acestep-5Hz-lm-0.6B` | Qwen3-0.6B | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | Medium | Weak | ✅ |
|
| 79 |
+
| `acestep-5Hz-lm-1.7B` | Qwen3-1.7B | ✅ | ✅ | ✅ | ✅ | ✅ | Medium | Medium | Medium | ✅ |
|
| 80 |
+
| `acestep-5Hz-lm-4B` | Qwen3-4B | ✅ | ✅ | ✅ | ✅ | ✅ | Strong | Strong | Strong | ✅ |
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
## 🙏 Acknowledgements
|
| 84 |
+
|
| 85 |
+
This project is co-led by ACE Studio and StepFun.
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
## 📖 Citation
|
| 89 |
+
|
| 90 |
+
If you find this project useful for your research, please consider citing:
|
| 91 |
+
|
| 92 |
+
```BibTeX
|
| 93 |
+
@misc{gong2026acestep,
|
| 94 |
+
title={ACE-Step 1.5: Pushing the Boundaries of Open-Source Music Generation},
|
| 95 |
+
author={Junmin Gong, Yulin Song, Wenxiao Zhao, Sen Wang, Shengyuan Xu, Jing Guo},
|
| 96 |
+
howpublished={\url{https://github.com/ace-step/ACE-Step-1.5}},
|
| 97 |
+
year={2026},
|
| 98 |
+
note={GitHub repository}
|
| 99 |
+
}
|
models/ace-step/acestep-5Hz-lm-1.7B/added_tokens.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/ace-step/acestep-5Hz-lm-1.7B/chat_template.jinja
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if message.content is string %}
|
| 27 |
+
{%- set content = message.content %}
|
| 28 |
+
{%- else %}
|
| 29 |
+
{%- set content = '' %}
|
| 30 |
+
{%- endif %}
|
| 31 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 32 |
+
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
| 33 |
+
{%- elif message.role == "assistant" %}
|
| 34 |
+
{%- set reasoning_content = '' %}
|
| 35 |
+
{%- if message.reasoning_content is string %}
|
| 36 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 37 |
+
{%- else %}
|
| 38 |
+
{%- if '</think>' in content %}
|
| 39 |
+
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 40 |
+
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
| 41 |
+
{%- endif %}
|
| 42 |
+
{%- endif %}
|
| 43 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 44 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 45 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 46 |
+
{%- else %}
|
| 47 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 48 |
+
{%- endif %}
|
| 49 |
+
{%- else %}
|
| 50 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 51 |
+
{%- endif %}
|
| 52 |
+
{%- if message.tool_calls %}
|
| 53 |
+
{%- for tool_call in message.tool_calls %}
|
| 54 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 55 |
+
{{- '\n' }}
|
| 56 |
+
{%- endif %}
|
| 57 |
+
{%- if tool_call.function %}
|
| 58 |
+
{%- set tool_call = tool_call.function %}
|
| 59 |
+
{%- endif %}
|
| 60 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 61 |
+
{{- tool_call.name }}
|
| 62 |
+
{{- '", "arguments": ' }}
|
| 63 |
+
{%- if tool_call.arguments is string %}
|
| 64 |
+
{{- tool_call.arguments }}
|
| 65 |
+
{%- else %}
|
| 66 |
+
{{- tool_call.arguments | tojson }}
|
| 67 |
+
{%- endif %}
|
| 68 |
+
{{- '}\n</tool_call>' }}
|
| 69 |
+
{%- endfor %}
|
| 70 |
+
{%- endif %}
|
| 71 |
+
{{- '<|im_end|>\n' }}
|
| 72 |
+
{%- elif message.role == "tool" %}
|
| 73 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 74 |
+
{{- '<|im_start|>user' }}
|
| 75 |
+
{%- endif %}
|
| 76 |
+
{{- '\n<tool_response>\n' }}
|
| 77 |
+
{{- content }}
|
| 78 |
+
{{- '\n</tool_response>' }}
|
| 79 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 80 |
+
{{- '<|im_end|>\n' }}
|
| 81 |
+
{%- endif %}
|
| 82 |
+
{%- endif %}
|
| 83 |
+
{%- endfor %}
|
| 84 |
+
{%- if add_generation_prompt %}
|
| 85 |
+
{{- '<|im_start|>assistant\n' }}
|
| 86 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 87 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 88 |
+
{%- endif %}
|
| 89 |
+
{%- endif %}
|
models/ace-step/acestep-5Hz-lm-1.7B/config.json
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3Model"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 151643,
|
| 8 |
+
"dtype": "bfloat16",
|
| 9 |
+
"eos_token_id": 151645,
|
| 10 |
+
"head_dim": 128,
|
| 11 |
+
"hidden_act": "silu",
|
| 12 |
+
"hidden_size": 2048,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 6144,
|
| 15 |
+
"layer_types": [
|
| 16 |
+
"full_attention",
|
| 17 |
+
"full_attention",
|
| 18 |
+
"full_attention",
|
| 19 |
+
"full_attention",
|
| 20 |
+
"full_attention",
|
| 21 |
+
"full_attention",
|
| 22 |
+
"full_attention",
|
| 23 |
+
"full_attention",
|
| 24 |
+
"full_attention",
|
| 25 |
+
"full_attention",
|
| 26 |
+
"full_attention",
|
| 27 |
+
"full_attention",
|
| 28 |
+
"full_attention",
|
| 29 |
+
"full_attention",
|
| 30 |
+
"full_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"full_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"full_attention",
|
| 41 |
+
"full_attention",
|
| 42 |
+
"full_attention",
|
| 43 |
+
"full_attention"
|
| 44 |
+
],
|
| 45 |
+
"max_position_embeddings": 40960,
|
| 46 |
+
"max_window_layers": 28,
|
| 47 |
+
"model_type": "qwen3",
|
| 48 |
+
"num_attention_heads": 16,
|
| 49 |
+
"num_hidden_layers": 28,
|
| 50 |
+
"num_key_value_heads": 8,
|
| 51 |
+
"pad_token_id": 151643,
|
| 52 |
+
"rms_norm_eps": 1e-06,
|
| 53 |
+
"rope_scaling": null,
|
| 54 |
+
"rope_theta": 1000000,
|
| 55 |
+
"sliding_window": null,
|
| 56 |
+
"tie_word_embeddings": true,
|
| 57 |
+
"transformers_version": "4.57.0.dev0",
|
| 58 |
+
"use_cache": true,
|
| 59 |
+
"use_sliding_window": false,
|
| 60 |
+
"vocab_size": 217204
|
| 61 |
+
}
|
models/ace-step/acestep-5Hz-lm-1.7B/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/ace-step/acestep-5Hz-lm-1.7B/special_tokens_map.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/ace-step/acestep-5Hz-lm-1.7B/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/ace-step/acestep-5Hz-lm-4B/added_tokens.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/ace-step/acestep-5Hz-lm-4B/config.json
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen3ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 151643,
|
| 8 |
+
"dtype": "bfloat16",
|
| 9 |
+
"eos_token_id": 151645,
|
| 10 |
+
"head_dim": 128,
|
| 11 |
+
"hidden_act": "silu",
|
| 12 |
+
"hidden_size": 2560,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 9728,
|
| 15 |
+
"layer_types": [
|
| 16 |
+
"full_attention",
|
| 17 |
+
"full_attention",
|
| 18 |
+
"full_attention",
|
| 19 |
+
"full_attention",
|
| 20 |
+
"full_attention",
|
| 21 |
+
"full_attention",
|
| 22 |
+
"full_attention",
|
| 23 |
+
"full_attention",
|
| 24 |
+
"full_attention",
|
| 25 |
+
"full_attention",
|
| 26 |
+
"full_attention",
|
| 27 |
+
"full_attention",
|
| 28 |
+
"full_attention",
|
| 29 |
+
"full_attention",
|
| 30 |
+
"full_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"full_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"full_attention",
|
| 41 |
+
"full_attention",
|
| 42 |
+
"full_attention",
|
| 43 |
+
"full_attention",
|
| 44 |
+
"full_attention",
|
| 45 |
+
"full_attention",
|
| 46 |
+
"full_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"full_attention",
|
| 49 |
+
"full_attention",
|
| 50 |
+
"full_attention",
|
| 51 |
+
"full_attention"
|
| 52 |
+
],
|
| 53 |
+
"max_position_embeddings": 40960,
|
| 54 |
+
"max_window_layers": 36,
|
| 55 |
+
"model_type": "qwen3",
|
| 56 |
+
"num_attention_heads": 32,
|
| 57 |
+
"num_hidden_layers": 36,
|
| 58 |
+
"num_key_value_heads": 8,
|
| 59 |
+
"pad_token_id": 151643,
|
| 60 |
+
"rms_norm_eps": 1e-06,
|
| 61 |
+
"rope_scaling": null,
|
| 62 |
+
"rope_theta": 1000000,
|
| 63 |
+
"sliding_window": null,
|
| 64 |
+
"tie_word_embeddings": true,
|
| 65 |
+
"transformers_version": "4.57.1",
|
| 66 |
+
"use_cache": true,
|
| 67 |
+
"use_sliding_window": false,
|
| 68 |
+
"vocab_size": 217204
|
| 69 |
+
}
|
models/ace-step/acestep-5Hz-lm-4B/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/ace-step/acestep-5Hz-lm-4B/model.safetensors.index.json
ADDED
|
@@ -0,0 +1,405 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_size": 8379108352
|
| 4 |
+
},
|
| 5 |
+
"weight_map": {
|
| 6 |
+
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
| 7 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 8 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 9 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 10 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 11 |
+
"model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 12 |
+
"model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 13 |
+
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 14 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 15 |
+
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 16 |
+
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 17 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 18 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 19 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 20 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 21 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 22 |
+
"model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 23 |
+
"model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 24 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 25 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 26 |
+
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 27 |
+
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 28 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 29 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 30 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 31 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 32 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 33 |
+
"model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 34 |
+
"model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 35 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 36 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 37 |
+
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 38 |
+
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 39 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 40 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 41 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 42 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 43 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 44 |
+
"model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 45 |
+
"model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 46 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 47 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 48 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 49 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 50 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 51 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 52 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 53 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 54 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 55 |
+
"model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 56 |
+
"model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 57 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 58 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 59 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 60 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 61 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 62 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 63 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 64 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 65 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 66 |
+
"model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 67 |
+
"model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 68 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 69 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 70 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 71 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 72 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 73 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 74 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 75 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 76 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 77 |
+
"model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 78 |
+
"model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 79 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 80 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 81 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 82 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 83 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 84 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 85 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 86 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 87 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 88 |
+
"model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 89 |
+
"model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 90 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 91 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 92 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 93 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 94 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 95 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 96 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 97 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 98 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 99 |
+
"model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 100 |
+
"model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 101 |
+
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 102 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 103 |
+
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 104 |
+
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 105 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 106 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 107 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 108 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 109 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 110 |
+
"model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 111 |
+
"model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 112 |
+
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 113 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 114 |
+
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 115 |
+
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 116 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 117 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 118 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 119 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 120 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 121 |
+
"model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 122 |
+
"model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 123 |
+
"model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 124 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 125 |
+
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 126 |
+
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 127 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 128 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 129 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 130 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 131 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 132 |
+
"model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 133 |
+
"model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 134 |
+
"model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 135 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 136 |
+
"model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 137 |
+
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 138 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 139 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 140 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 141 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 142 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 143 |
+
"model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 144 |
+
"model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 145 |
+
"model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 146 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 147 |
+
"model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 148 |
+
"model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 149 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 150 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 151 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 152 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 153 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 154 |
+
"model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 155 |
+
"model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 156 |
+
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 157 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 158 |
+
"model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 159 |
+
"model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 160 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 161 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 162 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 163 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 164 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 165 |
+
"model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 166 |
+
"model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 167 |
+
"model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 168 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 169 |
+
"model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 170 |
+
"model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 171 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 172 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 173 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 174 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 175 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 176 |
+
"model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 177 |
+
"model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 178 |
+
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 179 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 180 |
+
"model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 181 |
+
"model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 182 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 183 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 184 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 185 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 186 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 187 |
+
"model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 188 |
+
"model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 189 |
+
"model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 190 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 191 |
+
"model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 192 |
+
"model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 193 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 194 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 195 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 196 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 197 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 198 |
+
"model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 199 |
+
"model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 200 |
+
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 201 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 202 |
+
"model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 203 |
+
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 204 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 205 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 206 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 207 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 208 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 209 |
+
"model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 210 |
+
"model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 211 |
+
"model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 212 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 213 |
+
"model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 214 |
+
"model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 215 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 216 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 217 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 218 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 219 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 220 |
+
"model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
|
| 221 |
+
"model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
|
| 222 |
+
"model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 223 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 224 |
+
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 225 |
+
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 226 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 227 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 228 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 229 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 230 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 231 |
+
"model.layers.20.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 232 |
+
"model.layers.20.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 233 |
+
"model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 234 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 235 |
+
"model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 236 |
+
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 237 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 238 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 239 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 240 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 241 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 242 |
+
"model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 243 |
+
"model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 244 |
+
"model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 245 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 246 |
+
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 247 |
+
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 248 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 249 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 250 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 251 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 252 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 253 |
+
"model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 254 |
+
"model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 255 |
+
"model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 256 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 257 |
+
"model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 258 |
+
"model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 259 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 260 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 261 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 262 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 263 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 264 |
+
"model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 265 |
+
"model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 266 |
+
"model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 267 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 268 |
+
"model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 269 |
+
"model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 270 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 271 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 272 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 273 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 274 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 275 |
+
"model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 276 |
+
"model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 277 |
+
"model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 278 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 279 |
+
"model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 280 |
+
"model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 281 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 282 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 283 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 284 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 285 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 286 |
+
"model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 287 |
+
"model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 288 |
+
"model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 289 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 290 |
+
"model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 291 |
+
"model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 292 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 293 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 294 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 295 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 296 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 297 |
+
"model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 298 |
+
"model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 299 |
+
"model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 300 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 301 |
+
"model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 302 |
+
"model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 303 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 304 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 305 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 306 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 307 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 308 |
+
"model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 309 |
+
"model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 310 |
+
"model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 311 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 312 |
+
"model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 313 |
+
"model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 314 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 315 |
+
"model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 316 |
+
"model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 317 |
+
"model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 318 |
+
"model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 319 |
+
"model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 320 |
+
"model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 321 |
+
"model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 322 |
+
"model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 323 |
+
"model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 324 |
+
"model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 325 |
+
"model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 326 |
+
"model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 327 |
+
"model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 328 |
+
"model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 329 |
+
"model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 330 |
+
"model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 331 |
+
"model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 332 |
+
"model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 333 |
+
"model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 334 |
+
"model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 335 |
+
"model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 336 |
+
"model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 337 |
+
"model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 338 |
+
"model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 339 |
+
"model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 340 |
+
"model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 341 |
+
"model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 342 |
+
"model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 343 |
+
"model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 344 |
+
"model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 345 |
+
"model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 346 |
+
"model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 347 |
+
"model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 348 |
+
"model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 349 |
+
"model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 350 |
+
"model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 351 |
+
"model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 352 |
+
"model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 353 |
+
"model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 354 |
+
"model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 355 |
+
"model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 356 |
+
"model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 357 |
+
"model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 358 |
+
"model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 359 |
+
"model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 360 |
+
"model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 361 |
+
"model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 362 |
+
"model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 363 |
+
"model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 364 |
+
"model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 365 |
+
"model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 366 |
+
"model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 367 |
+
"model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 368 |
+
"model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 369 |
+
"model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 370 |
+
"model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 371 |
+
"model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 372 |
+
"model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 373 |
+
"model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 374 |
+
"model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 375 |
+
"model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 376 |
+
"model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 377 |
+
"model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 378 |
+
"model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 379 |
+
"model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 380 |
+
"model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 381 |
+
"model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 382 |
+
"model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 383 |
+
"model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 384 |
+
"model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 385 |
+
"model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 386 |
+
"model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 387 |
+
"model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 388 |
+
"model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 389 |
+
"model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 390 |
+
"model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 391 |
+
"model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 392 |
+
"model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 393 |
+
"model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 394 |
+
"model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 395 |
+
"model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 396 |
+
"model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
|
| 397 |
+
"model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
|
| 398 |
+
"model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 399 |
+
"model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 400 |
+
"model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 401 |
+
"model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 402 |
+
"model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 403 |
+
"model.norm.weight": "model-00002-of-00002.safetensors"
|
| 404 |
+
}
|
| 405 |
+
}
|
models/ace-step/config.json
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"AceStepConditionGenerationModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"audio_acoustic_hidden_dim": 64,
|
| 8 |
+
"auto_map": {
|
| 9 |
+
"AutoConfig": "configuration_acestep_v15.AceStepConfig",
|
| 10 |
+
"AutoModel": "modeling_acestep_v15_turbo.AceStepConditionGenerationModel"
|
| 11 |
+
},
|
| 12 |
+
"data_proportion": 0.5,
|
| 13 |
+
"dtype": "bfloat16",
|
| 14 |
+
"fsq_dim": 2048,
|
| 15 |
+
"fsq_input_levels": [
|
| 16 |
+
8,
|
| 17 |
+
8,
|
| 18 |
+
8,
|
| 19 |
+
5,
|
| 20 |
+
5,
|
| 21 |
+
5
|
| 22 |
+
],
|
| 23 |
+
"fsq_input_num_quantizers": 1,
|
| 24 |
+
"head_dim": 128,
|
| 25 |
+
"hidden_act": "silu",
|
| 26 |
+
"hidden_size": 2048,
|
| 27 |
+
"in_channels": 192,
|
| 28 |
+
"initializer_range": 0.02,
|
| 29 |
+
"intermediate_size": 6144,
|
| 30 |
+
"is_turbo": true,
|
| 31 |
+
"layer_types": [
|
| 32 |
+
"sliding_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"sliding_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"sliding_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"sliding_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"sliding_attention",
|
| 41 |
+
"full_attention",
|
| 42 |
+
"sliding_attention",
|
| 43 |
+
"full_attention",
|
| 44 |
+
"sliding_attention",
|
| 45 |
+
"full_attention",
|
| 46 |
+
"sliding_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"sliding_attention",
|
| 49 |
+
"full_attention",
|
| 50 |
+
"sliding_attention",
|
| 51 |
+
"full_attention",
|
| 52 |
+
"sliding_attention",
|
| 53 |
+
"full_attention",
|
| 54 |
+
"sliding_attention",
|
| 55 |
+
"full_attention"
|
| 56 |
+
],
|
| 57 |
+
"max_position_embeddings": 32768,
|
| 58 |
+
"model_type": "acestep",
|
| 59 |
+
"model_version": "turbo",
|
| 60 |
+
"num_attention_heads": 16,
|
| 61 |
+
"num_attention_pooler_hidden_layers": 2,
|
| 62 |
+
"num_audio_decoder_hidden_layers": 24,
|
| 63 |
+
"num_hidden_layers": 24,
|
| 64 |
+
"num_key_value_heads": 8,
|
| 65 |
+
"num_lyric_encoder_hidden_layers": 8,
|
| 66 |
+
"num_timbre_encoder_hidden_layers": 4,
|
| 67 |
+
"patch_size": 2,
|
| 68 |
+
"pool_window_size": 5,
|
| 69 |
+
"rms_norm_eps": 1e-06,
|
| 70 |
+
"rope_scaling": null,
|
| 71 |
+
"rope_theta": 1000000,
|
| 72 |
+
"sliding_window": 128,
|
| 73 |
+
"text_hidden_dim": 1024,
|
| 74 |
+
"timbre_fix_frame": 750,
|
| 75 |
+
"timbre_hidden_dim": 64,
|
| 76 |
+
"timestep_mu": -0.4,
|
| 77 |
+
"timestep_sigma": 1.0,
|
| 78 |
+
"transformers_version": "4.57.0.dev0",
|
| 79 |
+
"use_cache": true,
|
| 80 |
+
"use_sliding_window": true,
|
| 81 |
+
"vocab_size": 64003
|
| 82 |
+
}
|
models/dettaglio-restyle/styles.json
ADDED
|
@@ -0,0 +1,1352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"style_description": "cinematic still, emotional, harmonious, vignette, 4k epic detailed, shot on kodak, 35mm photo, sharp focus, high budget, cinemascope, moody, epic, gorgeous, film grain, grainy",
|
| 4 |
+
"name": "Fooocus Sharp",
|
| 5 |
+
"thumbnail": "fooocus_sharp.webp"
|
| 6 |
+
},
|
| 7 |
+
{
|
| 8 |
+
"style_description": "(masterpiece), (best quality), (ultra-detailed), illustration, disheveled hair, detailed eyes, perfect composition, moist skin, intricate details, earrings, by wlop",
|
| 9 |
+
"name": "Fooocus Masterpiece",
|
| 10 |
+
"thumbnail": "fooocus_masterpiece.webp"
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"style_description": "photograph, 50mm, cinematic 4k epic detailed 4k epic detailed photograph shot on kodak detailed cinematic hbo dark moody, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage",
|
| 14 |
+
"name": "Fooocus Photograph",
|
| 15 |
+
"thumbnail": "fooocus_photograph.webp"
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"style_description": "cinematic still, emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
|
| 19 |
+
"name": "Fooocus Cinematic",
|
| 20 |
+
"thumbnail": "fooocus_cinematic.webp"
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"style_description": "professional 3d model, octane render, highly detailed, volumetric, dramatic lighting",
|
| 24 |
+
"name": "sai-3d-model",
|
| 25 |
+
"thumbnail": "sai-3d-model.webp"
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"style_description": "analog film photo, faded film, desaturated, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage",
|
| 29 |
+
"name": "sai-analog film",
|
| 30 |
+
"thumbnail": "sai-analog_film.webp"
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"style_description": "anime artwork, anime style, key visual, vibrant, studio anime, highly detailed",
|
| 34 |
+
"name": "sai-anime",
|
| 35 |
+
"thumbnail": "sai-anime.webp"
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"style_description": "cinematic film still, shallow depth of field, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
|
| 39 |
+
"name": "sai-cinematic",
|
| 40 |
+
"thumbnail": "sai-cinematic.webp"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"style_description": "comic, graphic illustration, comic art, graphic novel art, vibrant, highly detailed",
|
| 44 |
+
"name": "sai-comic book",
|
| 45 |
+
"thumbnail": "sai-comic_book.webp"
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"style_description": "play-doh style, sculpture, clay art, centered composition, Claymation",
|
| 49 |
+
"name": "sai-craft clay",
|
| 50 |
+
"thumbnail": "sai-craft_clay.webp"
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"style_description": "concept art, digital artwork, illustrative, painterly, matte painting, highly detailed",
|
| 54 |
+
"name": "sai-digital art",
|
| 55 |
+
"thumbnail": "sai-digital_art.webp"
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"style_description": "breathtaking, award-winning, professional, highly detailed",
|
| 59 |
+
"name": "sai-enhance",
|
| 60 |
+
"thumbnail": "sai-enhance.webp"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"style_description": "ethereal fantasy concept art of , magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
|
| 64 |
+
"name": "sai-fantasy art",
|
| 65 |
+
"thumbnail": "sai-fantasy_art.webp"
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"style_description": "isometric style, vibrant, beautiful, crisp, detailed, ultra detailed, intricate",
|
| 69 |
+
"name": "sai-isometric",
|
| 70 |
+
"thumbnail": "sai-isometric.webp"
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"style_description": "line art drawing, professional, sleek, modern, minimalist, graphic, line art, vector graphics",
|
| 74 |
+
"name": "sai-line art",
|
| 75 |
+
"thumbnail": "sai-line_art.webp"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"style_description": "low-poly style, low-poly game art, polygon mesh, jagged, blocky, wireframe edges, centered composition",
|
| 79 |
+
"name": "sai-lowpoly",
|
| 80 |
+
"thumbnail": "sai-lowpoly.webp"
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"style_description": "neonpunk style, cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
|
| 84 |
+
"name": "sai-neonpunk",
|
| 85 |
+
"thumbnail": "sai-neonpunk.webp"
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"style_description": "origami style, paper art, pleated paper, folded, origami art, pleats, cut and fold, centered composition",
|
| 89 |
+
"name": "sai-origami",
|
| 90 |
+
"thumbnail": "sai-origami.webp"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"style_description": "cinematic photo, 35mm photograph, film, bokeh, professional, 4k, highly detailed",
|
| 94 |
+
"name": "sai-photographic",
|
| 95 |
+
"thumbnail": "sai-photographic.webp"
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"style_description": "pixel-art, low-res, blocky, pixel art style, 8-bit graphics",
|
| 99 |
+
"name": "sai-pixel art",
|
| 100 |
+
"thumbnail": "sai-pixel_art.webp"
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"style_description": "epic cinematic shot of dynamic in motion. main subject of high budget action movie. raw photo, motion blur. best quality, high resolution",
|
| 104 |
+
"name": "mre-cinematic-dynamic",
|
| 105 |
+
"thumbnail": "mre-cinematic-dynamic.webp"
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"style_description": "spontaneous picture of, taken by talented amateur. best quality, high resolution. magical moment, natural look. simple but good looking",
|
| 109 |
+
"name": "mre-spontaneous-picture",
|
| 110 |
+
"thumbnail": "mre-spontaneous-picture.webp"
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"style_description": "powerful artistic vision of, breathtaking masterpiece made by great artist. best quality, high resolution",
|
| 114 |
+
"name": "mre-artistic-vision",
|
| 115 |
+
"thumbnail": "mre-artistic-vision.webp"
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"style_description": "dark and unsettling dream showing, best quality, high resolution. created by genius but depressed mad artist. grim beauty",
|
| 119 |
+
"name": "mre-dark-dream",
|
| 120 |
+
"thumbnail": "mre-dark-dream.webp"
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"style_description": "astonishing gloomy art made mainly of shadows and lighting, forming, masterful usage of lighting, shadows and chiaroscuro. made by black-hearted artist, drawing from darkness. best quality, high resolution",
|
| 124 |
+
"name": "mre-gloomy-art",
|
| 125 |
+
"thumbnail": "mre-gloomy-art.webp"
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"style_description": "picture from really bad dream about terrifying, true horror. bone-chilling vision. mad world that shouldn't exist. best quality, high resolution",
|
| 129 |
+
"name": "mre-bad-dream",
|
| 130 |
+
"thumbnail": "mre-bad-dream.webp"
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"style_description": "uncanny caliginous vision of, created by remarkable underground artist. best quality, high resolution. raw and brutal art, careless but impressive style. inspired by darkness and chaos",
|
| 134 |
+
"name": "mre-underground",
|
| 135 |
+
"thumbnail": "mre-underground.webp"
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"style_description": "surreal painting representing strange vision of, harmonious madness, synergy with chance. unique artstyle, mindbending art, magical surrealism. best quality, high resolution",
|
| 139 |
+
"name": "mre-surreal-painting",
|
| 140 |
+
"thumbnail": "mre-surreal-painting.webp"
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"style_description": "insanely dynamic illustration of, best quality, high resolution. crazy artstyle, careless brushstrokes, emotional and fun",
|
| 144 |
+
"name": "mre-dynamic-illustration",
|
| 145 |
+
"thumbnail": "mre-dynamic-illustration.webp"
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"style_description": "long forgotten art created by undead artist illustrating, tribute to the death and decay. miserable art of the damned. wretched and decaying world. best quality, high resolution",
|
| 149 |
+
"name": "mre-undead-art",
|
| 150 |
+
"thumbnail": "mre-undead-art.webp"
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"style_description": "art illustrating insane amounts of raging elemental energy turning into, avatar of elements. magical surrealism, wizardry. best quality, high resolution",
|
| 154 |
+
"name": "mre-elemental-art",
|
| 155 |
+
"thumbnail": "mre-elemental-art.webp"
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"style_description": "winner of inter-galactic art contest illustrating, symbol of the interstellar singularity. best quality, high resolution. artstyle previously unseen in the whole galaxy",
|
| 159 |
+
"name": "mre-space-art",
|
| 160 |
+
"thumbnail": "mre-space-art.webp"
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"style_description": "sublime ancient illustration of, predating human civilization. crude and simple, but also surprisingly beautiful artwork, made by genius primeval artist. best quality, high resolution",
|
| 164 |
+
"name": "mre-ancient-illustration",
|
| 165 |
+
"thumbnail": "mre-ancient-illustration.webp"
|
| 166 |
+
},
|
| 167 |
+
{
|
| 168 |
+
"style_description": "brave, shocking, and brutally true art showing, inspired by courage and unlimited creativity. truth found in chaos. best quality, high resolution",
|
| 169 |
+
"name": "mre-brave-art",
|
| 170 |
+
"thumbnail": "mre-brave-art.webp"
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"style_description": "heroic fantasy painting of, in the dangerous fantasy world. airbrush over oil on canvas. best quality, high resolution",
|
| 174 |
+
"name": "mre-heroic-fantasy",
|
| 175 |
+
"thumbnail": "mre-heroic-fantasy.webp"
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
"style_description": "dark cyberpunk illustration of brutal in a world without hope, ruled by ruthless criminal corporations. best quality, high resolution",
|
| 179 |
+
"name": "mre-dark-cyberpunk",
|
| 180 |
+
"thumbnail": "mre-dark-cyberpunk.webp"
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"style_description": "geometric and lyrical abstraction painting presenting, oil on metal. best quality, high resolution",
|
| 184 |
+
"name": "mre-lyrical-geometry",
|
| 185 |
+
"thumbnail": "mre-lyrical-geometry.webp"
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"style_description": "big long brushstrokes of deep black sumi-e turning into symbolic painting of, master level raw art. best quality, high resolution",
|
| 189 |
+
"name": "mre-sumi-e-symbolic",
|
| 190 |
+
"thumbnail": "mre-sumi-e-symbolic.webp"
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"style_description": "highly detailed black sumi-e painting of, in-depth study of perfection, created by a master. best quality, high resolution",
|
| 194 |
+
"name": "mre-sumi-e-detailed",
|
| 195 |
+
"thumbnail": "mre-sumi-e-detailed.webp"
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"style_description": "manga artwork presenting, created by japanese manga artist. highly emotional. best quality, high resolution",
|
| 199 |
+
"name": "mre-manga",
|
| 200 |
+
"thumbnail": "mre-manga.webp"
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"style_description": "anime artwork illustrating, created by japanese anime studio. highly emotional. best quality, high resolution",
|
| 204 |
+
"name": "mre-anime",
|
| 205 |
+
"thumbnail": "mre-anime.webp"
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"style_description": "breathtaking illustration from adult comic book presenting, fabulous artwork. best quality, high resolution",
|
| 209 |
+
"name": "mre-comic",
|
| 210 |
+
"thumbnail": "mre-comic.webp"
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"style_description": "advertising poster style, Professional, modern, product-focused, commercial, eye-catching, highly detailed",
|
| 214 |
+
"name": "ads-advertising",
|
| 215 |
+
"thumbnail": "ads-advertising.webp"
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
"style_description": "automotive advertisement style, sleek, dynamic, professional, commercial, vehicle-focused, high-resolution, highly detailed",
|
| 219 |
+
"name": "ads-automotive",
|
| 220 |
+
"thumbnail": "ads-automotive.webp"
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"style_description": "corporate branding style, professional, clean, modern, sleek, minimalist, business-oriented, highly detailed",
|
| 224 |
+
"name": "ads-corporate",
|
| 225 |
+
"thumbnail": "ads-corporate.webp"
|
| 226 |
+
},
|
| 227 |
+
{
|
| 228 |
+
"style_description": "fashion editorial style, high fashion, trendy, stylish, editorial, magazine style, professional, highly detailed",
|
| 229 |
+
"name": "ads-fashion editorial",
|
| 230 |
+
"thumbnail": "ads-fashion_editorial.webp"
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"style_description": "food photography style, appetizing, professional, culinary, high-resolution, commercial, highly detailed",
|
| 234 |
+
"name": "ads-food photography",
|
| 235 |
+
"thumbnail": "ads-food_photography.webp"
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"style_description": "gourmet food photo of, soft natural lighting, macro details, vibrant colors, fresh ingredients, glistening textures, bokeh background, styled plating, wooden tabletop, garnished, tantalizing, editorial quality",
|
| 239 |
+
"name": "ads-gourmet food photography",
|
| 240 |
+
"thumbnail": "ads-gourmet_food_photography.webp"
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"style_description": "luxury product style, elegant, sophisticated, high-end, luxurious, professional, highly detailed",
|
| 244 |
+
"name": "ads-luxury",
|
| 245 |
+
"thumbnail": "ads-luxury.webp"
|
| 246 |
+
},
|
| 247 |
+
{
|
| 248 |
+
"style_description": "retail packaging style, vibrant, enticing, commercial, product-focused, eye-catching, professional, highly detailed",
|
| 249 |
+
"name": "ads-retail",
|
| 250 |
+
"thumbnail": "ads-retail.webp"
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"style_description": "abstract style, non-representational, colors and shapes, expression of feelings, imaginative, highly detailed",
|
| 254 |
+
"name": "artstyle-abstract",
|
| 255 |
+
"thumbnail": "artstyle-abstract.webp"
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"style_description": "abstract expressionist painting, energetic brushwork, bold colors, abstract forms, expressive, emotional",
|
| 259 |
+
"name": "artstyle-abstract expressionism",
|
| 260 |
+
"thumbnail": "artstyle-abstract_expressionism.webp"
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"style_description": "art deco style, geometric shapes, bold colors, luxurious, elegant, decorative, symmetrical, ornate, detailed",
|
| 264 |
+
"name": "artstyle-art deco",
|
| 265 |
+
"thumbnail": "artstyle-art_deco.webp"
|
| 266 |
+
},
|
| 267 |
+
{
|
| 268 |
+
"style_description": "art nouveau style, elegant, decorative, curvilinear forms, nature-inspired, ornate, detailed",
|
| 269 |
+
"name": "artstyle-art nouveau",
|
| 270 |
+
"thumbnail": "artstyle-art_nouveau.webp"
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"style_description": "constructivist style, geometric shapes, bold colors, dynamic composition, propaganda art style",
|
| 274 |
+
"name": "artstyle-constructivist",
|
| 275 |
+
"thumbnail": "artstyle-constructivist.webp"
|
| 276 |
+
},
|
| 277 |
+
{
|
| 278 |
+
"style_description": "cubist artwork, geometric shapes, abstract, innovative, revolutionary",
|
| 279 |
+
"name": "artstyle-cubist",
|
| 280 |
+
"thumbnail": "artstyle-cubist.webp"
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"style_description": "expressionist, raw, emotional, dynamic, distortion for emotional effect, vibrant, use of unusual colors, detailed",
|
| 284 |
+
"name": "artstyle-expressionist",
|
| 285 |
+
"thumbnail": "artstyle-expressionist.webp"
|
| 286 |
+
},
|
| 287 |
+
{
|
| 288 |
+
"style_description": "graffiti style, street art, vibrant, urban, detailed, tag, mural",
|
| 289 |
+
"name": "artstyle-graffiti",
|
| 290 |
+
"thumbnail": "artstyle-graffiti.webp"
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"style_description": "hyperrealistic art, extremely high-resolution details, photographic, realism pushed to extreme, fine texture, incredibly lifelike",
|
| 294 |
+
"name": "artstyle-hyperrealism",
|
| 295 |
+
"thumbnail": "artstyle-hyperrealism.webp"
|
| 296 |
+
},
|
| 297 |
+
{
|
| 298 |
+
"style_description": "impressionist painting, loose brushwork, vibrant color, light and shadow play, captures feeling over form",
|
| 299 |
+
"name": "artstyle-impressionist",
|
| 300 |
+
"thumbnail": "artstyle-impressionist.webp"
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"style_description": "pointillism style, composed entirely of small, distinct dots of color, vibrant, highly detailed",
|
| 304 |
+
"name": "artstyle-pointillism",
|
| 305 |
+
"thumbnail": "artstyle-pointillism.webp"
|
| 306 |
+
},
|
| 307 |
+
{
|
| 308 |
+
"style_description": "pop Art style, bright colors, bold outlines, popular culture themes, ironic or kitsch",
|
| 309 |
+
"name": "artstyle-pop art",
|
| 310 |
+
"thumbnail": "artstyle-pop_art.webp"
|
| 311 |
+
},
|
| 312 |
+
{
|
| 313 |
+
"style_description": "psychedelic style, vibrant colors, swirling patterns, abstract forms, surreal, trippy",
|
| 314 |
+
"name": "artstyle-psychedelic",
|
| 315 |
+
"thumbnail": "artstyle-psychedelic.webp"
|
| 316 |
+
},
|
| 317 |
+
{
|
| 318 |
+
"style_description": "renaissance style, realistic, perspective, light and shadow, religious or mythological themes, highly detailed",
|
| 319 |
+
"name": "artstyle-renaissance",
|
| 320 |
+
"thumbnail": "artstyle-renaissance.webp"
|
| 321 |
+
},
|
| 322 |
+
{
|
| 323 |
+
"style_description": "steampunk style, antique, mechanical, brass and copper tones, gears, intricate, detailed",
|
| 324 |
+
"name": "artstyle-steampunk",
|
| 325 |
+
"thumbnail": "artstyle-steampunk.webp"
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"style_description": "surrealist art, dreamlike, mysterious, provocative, symbolic, intricate, detailed",
|
| 329 |
+
"name": "artstyle-surrealist",
|
| 330 |
+
"thumbnail": "artstyle-surrealist.webp"
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"style_description": "typographic art, stylized, intricate, detailed, artistic, text-based",
|
| 334 |
+
"name": "artstyle-typography",
|
| 335 |
+
"thumbnail": "artstyle-typography.webp"
|
| 336 |
+
},
|
| 337 |
+
{
|
| 338 |
+
"style_description": "watercolor painting, vibrant, beautiful, painterly, detailed, textural, artistic",
|
| 339 |
+
"name": "artstyle-watercolor",
|
| 340 |
+
"thumbnail": "artstyle-watercolor.webp"
|
| 341 |
+
},
|
| 342 |
+
{
|
| 343 |
+
"style_description": "biomechanical style, blend of organic and mechanical elements, futuristic, cybernetic, detailed, intricate",
|
| 344 |
+
"name": "futuristic-biomechanical",
|
| 345 |
+
"thumbnail": "futuristic-biomechanical.webp"
|
| 346 |
+
},
|
| 347 |
+
{
|
| 348 |
+
"style_description": "biomechanical cyberpunk, cybernetics, human-machine fusion, dystopian, organic meets artificial, dark, intricate, highly detailed",
|
| 349 |
+
"name": "futuristic-biomechanical cyberpunk",
|
| 350 |
+
"thumbnail": "futuristic-biomechanical_cyberpunk.webp"
|
| 351 |
+
},
|
| 352 |
+
{
|
| 353 |
+
"style_description": "cybernetic style, futuristic, technological, cybernetic enhancements, robotics, artificial intelligence themes",
|
| 354 |
+
"name": "futuristic-cybernetic",
|
| 355 |
+
"thumbnail": "futuristic-cybernetic.webp"
|
| 356 |
+
},
|
| 357 |
+
{
|
| 358 |
+
"style_description": "cybernetic robot, android, AI, machine, metal, wires, tech, futuristic, highly detailed",
|
| 359 |
+
"name": "futuristic-cybernetic robot",
|
| 360 |
+
"thumbnail": "futuristic-cybernetic_robot.webp"
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"style_description": "cyberpunk cityscape, neon lights, dark alleys, skyscrapers, futuristic, vibrant colors, high contrast, highly detailed",
|
| 364 |
+
"name": "futuristic-cyberpunk cityscape",
|
| 365 |
+
"thumbnail": "futuristic-cyberpunk_cityscape.webp"
|
| 366 |
+
},
|
| 367 |
+
{
|
| 368 |
+
"style_description": "futuristic style, sleek, modern, ultramodern, high tech, detailed",
|
| 369 |
+
"name": "futuristic-futuristic",
|
| 370 |
+
"thumbnail": "futuristic-futuristic.webp"
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"style_description": "retro cyberpunk, 80's inspired, synthwave, neon, vibrant, detailed, retro futurism",
|
| 374 |
+
"name": "futuristic-retro cyberpunk",
|
| 375 |
+
"thumbnail": "futuristic-retro_cyberpunk.webp"
|
| 376 |
+
},
|
| 377 |
+
{
|
| 378 |
+
"style_description": "retro-futuristic, vintage sci-fi, 50s and 60s style, atomic age, vibrant, highly detailed",
|
| 379 |
+
"name": "futuristic-retro futurism",
|
| 380 |
+
"thumbnail": "futuristic-retro_futurism.webp"
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"style_description": "sci-fi style, futuristic, technological, alien worlds, space themes, advanced civilizations",
|
| 384 |
+
"name": "futuristic-sci-fi",
|
| 385 |
+
"thumbnail": "futuristic-sci-fi.webp"
|
| 386 |
+
},
|
| 387 |
+
{
|
| 388 |
+
"style_description": "vaporwave style, retro aesthetic, cyberpunk, vibrant, neon colors, vintage 80s and 90s style, highly detailed",
|
| 389 |
+
"name": "futuristic-vaporwave",
|
| 390 |
+
"thumbnail": "futuristic-vaporwave.webp"
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"style_description": "Bubble Bobble style, 8-bit, cute, pixelated, fantasy, vibrant, reminiscent of Bubble Bobble game",
|
| 394 |
+
"name": "game-bubble bobble",
|
| 395 |
+
"thumbnail": "game-bubble_bobble.webp"
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"style_description": "cyberpunk game style, neon, dystopian, futuristic, digital, vibrant, detailed, high contrast, reminiscent of cyberpunk genre video games",
|
| 399 |
+
"name": "game-cyberpunk game",
|
| 400 |
+
"thumbnail": "game-cyberpunk_game.webp"
|
| 401 |
+
},
|
| 402 |
+
{
|
| 403 |
+
"style_description": "fighting game style, dynamic, vibrant, action-packed, detailed character design, reminiscent of fighting video games",
|
| 404 |
+
"name": "game-fighting game",
|
| 405 |
+
"thumbnail": "game-fighting_game.webp"
|
| 406 |
+
},
|
| 407 |
+
{
|
| 408 |
+
"style_description": "GTA-style artwork, satirical, exaggerated, pop art style, vibrant colors, iconic characters, action-packed",
|
| 409 |
+
"name": "game-gta",
|
| 410 |
+
"thumbnail": "game-gta.webp"
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"style_description": "Super Mario style, vibrant, cute, cartoony, fantasy, playful, reminiscent of Super Mario series",
|
| 414 |
+
"name": "game-mario",
|
| 415 |
+
"thumbnail": "game-mario.webp"
|
| 416 |
+
},
|
| 417 |
+
{
|
| 418 |
+
"style_description": "Minecraft style, blocky, pixelated, vibrant colors, recognizable characters and objects, game assets",
|
| 419 |
+
"name": "game-minecraft",
|
| 420 |
+
"thumbnail": "game-minecraft.webp"
|
| 421 |
+
},
|
| 422 |
+
{
|
| 423 |
+
"style_description": "Pokémon style, vibrant, cute, anime, fantasy, reminiscent of Pokémon series",
|
| 424 |
+
"name": "game-pokemon",
|
| 425 |
+
"thumbnail": "game-pokemon.webp"
|
| 426 |
+
},
|
| 427 |
+
{
|
| 428 |
+
"style_description": "retro arcade style, 8-bit, pixelated, vibrant, classic video game, old school gaming, reminiscent of 80s and 90s arcade games",
|
| 429 |
+
"name": "game-retro arcade",
|
| 430 |
+
"thumbnail": "game-retro_arcade.webp"
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"style_description": "retro game art, 16-bit, vibrant colors, pixelated, nostalgic, charming, fun",
|
| 434 |
+
"name": "game-retro game",
|
| 435 |
+
"thumbnail": "game-retro_game.webp"
|
| 436 |
+
},
|
| 437 |
+
{
|
| 438 |
+
"style_description": "strategy game style, overhead view, detailed map, units, reminiscent of real-time strategy video games",
|
| 439 |
+
"name": "game-strategy game",
|
| 440 |
+
"thumbnail": "game-strategy_game.webp"
|
| 441 |
+
},
|
| 442 |
+
{
|
| 443 |
+
"style_description": "Street Fighter style, vibrant, dynamic, arcade, 2D fighting game, highly detailed, reminiscent of Street Fighter series",
|
| 444 |
+
"name": "game-streetfighter",
|
| 445 |
+
"thumbnail": "game-streetfighter.webp"
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"style_description": "Legend of Zelda style, vibrant, fantasy, detailed, epic, heroic, reminiscent of The Legend of Zelda series",
|
| 449 |
+
"name": "game-zelda",
|
| 450 |
+
"thumbnail": "game-zelda.webp"
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"style_description": "architectural style, clean lines, geometric shapes, minimalist, modern, architectural drawing, highly detailed",
|
| 454 |
+
"name": "misc-architectural",
|
| 455 |
+
"thumbnail": "misc-architectural.webp"
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"style_description": "disco-themed, vibrant, groovy, retro 70s style, shiny disco balls, neon lights, dance floor, highly detailed",
|
| 459 |
+
"name": "misc-disco",
|
| 460 |
+
"thumbnail": "misc-disco.webp"
|
| 461 |
+
},
|
| 462 |
+
{
|
| 463 |
+
"style_description": "dreamscape, surreal, ethereal, dreamy, mysterious, fantasy, highly detailed",
|
| 464 |
+
"name": "misc-dreamscape",
|
| 465 |
+
"thumbnail": "misc-dreamscape.webp"
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"style_description": "dystopian style, bleak, post-apocalyptic, somber, dramatic, highly detailed",
|
| 469 |
+
"name": "misc-dystopian",
|
| 470 |
+
"thumbnail": "misc-dystopian.webp"
|
| 471 |
+
},
|
| 472 |
+
{
|
| 473 |
+
"style_description": "fairy tale, magical, fantastical, enchanting, storybook style, highly detailed",
|
| 474 |
+
"name": "misc-fairy tale",
|
| 475 |
+
"thumbnail": "misc-fairy_tale.webp"
|
| 476 |
+
},
|
| 477 |
+
{
|
| 478 |
+
"style_description": "gothic style, dark, mysterious, haunting, dramatic, ornate, detailed",
|
| 479 |
+
"name": "misc-gothic",
|
| 480 |
+
"thumbnail": "misc-gothic.webp"
|
| 481 |
+
},
|
| 482 |
+
{
|
| 483 |
+
"style_description": "grunge style, textured, distressed, vintage, edgy, punk rock vibe, dirty, noisy",
|
| 484 |
+
"name": "misc-grunge",
|
| 485 |
+
"thumbnail": "misc-grunge.webp"
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"style_description": "horror-themed, eerie, unsettling, dark, spooky, suspenseful, grim, highly detailed",
|
| 489 |
+
"name": "misc-horror",
|
| 490 |
+
"thumbnail": "misc-horror.webp"
|
| 491 |
+
},
|
| 492 |
+
{
|
| 493 |
+
"style_description": "kawaii style, cute, adorable, brightly colored, cheerful, anime influence, highly detailed",
|
| 494 |
+
"name": "misc-kawaii",
|
| 495 |
+
"thumbnail": "misc-kawaii.webp"
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"style_description": "lovecraftian horror, eldritch, cosmic horror, unknown, mysterious, surreal, highly detailed",
|
| 499 |
+
"name": "misc-lovecraftian",
|
| 500 |
+
"thumbnail": "misc-lovecraftian.webp"
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"style_description": "macabre style, dark, gothic, grim, haunting, highly detailed",
|
| 504 |
+
"name": "misc-macabre",
|
| 505 |
+
"thumbnail": "misc-macabre.webp"
|
| 506 |
+
},
|
| 507 |
+
{
|
| 508 |
+
"style_description": "manga style, vibrant, high-energy, detailed, iconic, Japanese comic style",
|
| 509 |
+
"name": "misc-manga",
|
| 510 |
+
"thumbnail": "misc-manga.webp"
|
| 511 |
+
},
|
| 512 |
+
{
|
| 513 |
+
"style_description": "metropolis-themed, urban, cityscape, skyscrapers, modern, futuristic, highly detailed",
|
| 514 |
+
"name": "misc-metropolis",
|
| 515 |
+
"thumbnail": "misc-metropolis.webp"
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"style_description": "minimalist style, simple, clean, uncluttered, modern, elegant",
|
| 519 |
+
"name": "misc-minimalist",
|
| 520 |
+
"thumbnail": "misc-minimalist.webp"
|
| 521 |
+
},
|
| 522 |
+
{
|
| 523 |
+
"style_description": "monochrome, black and white, contrast, tone, texture, detailed",
|
| 524 |
+
"name": "misc-monochrome",
|
| 525 |
+
"thumbnail": "misc-monochrome.webp"
|
| 526 |
+
},
|
| 527 |
+
{
|
| 528 |
+
"style_description": "nautical-themed, sea, ocean, ships, maritime, beach, marine life, highly detailed",
|
| 529 |
+
"name": "misc-nautical",
|
| 530 |
+
"thumbnail": "misc-nautical.webp"
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"style_description": "space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed",
|
| 534 |
+
"name": "misc-space",
|
| 535 |
+
"thumbnail": "misc-space.webp"
|
| 536 |
+
},
|
| 537 |
+
{
|
| 538 |
+
"style_description": "stained glass style, vibrant, beautiful, translucent, intricate, detailed",
|
| 539 |
+
"name": "misc-stained glass",
|
| 540 |
+
"thumbnail": "misc-stained_glass.webp"
|
| 541 |
+
},
|
| 542 |
+
{
|
| 543 |
+
"style_description": "techwear fashion, futuristic, cyberpunk, urban, tactical, sleek, dark, highly detailed",
|
| 544 |
+
"name": "misc-techwear fashion",
|
| 545 |
+
"thumbnail": "misc-techwear_fashion.webp"
|
| 546 |
+
},
|
| 547 |
+
{
|
| 548 |
+
"style_description": "tribal style, indigenous, ethnic, traditional patterns, bold, natural colors, highly detailed",
|
| 549 |
+
"name": "misc-tribal",
|
| 550 |
+
"thumbnail": "misc-tribal.webp"
|
| 551 |
+
},
|
| 552 |
+
{
|
| 553 |
+
"style_description": "zentangle, intricate, abstract, monochrome, patterns, meditative, highly detailed",
|
| 554 |
+
"name": "misc-zentangle",
|
| 555 |
+
"thumbnail": "misc-zentangle.webp"
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"style_description": "collage style, mixed media, layered, textural, detailed, artistic",
|
| 559 |
+
"name": "papercraft-collage",
|
| 560 |
+
"thumbnail": "papercraft-collage.webp"
|
| 561 |
+
},
|
| 562 |
+
{
|
| 563 |
+
"style_description": "flat papercut style, silhouette, clean cuts, paper, sharp edges, minimalist, color block",
|
| 564 |
+
"name": "papercraft-flat papercut",
|
| 565 |
+
"thumbnail": "papercraft-flatpapercut.webp"
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"style_description": "kirigami representation of, 3D, paper folding, paper cutting, Japanese, intricate, symmetrical, precision, clean lines",
|
| 569 |
+
"name": "papercraft-kirigami",
|
| 570 |
+
"thumbnail": "papercraft-kirigami.webp"
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"style_description": "paper mache representation of, 3D, sculptural, textured, handmade, vibrant, fun",
|
| 574 |
+
"name": "papercraft-paper mache",
|
| 575 |
+
"thumbnail": "papercraft-paper_mache.webp"
|
| 576 |
+
},
|
| 577 |
+
{
|
| 578 |
+
"style_description": "paper quilling art of, intricate, delicate, curling, rolling, shaping, coiling, loops, 3D, dimensional, ornamental",
|
| 579 |
+
"name": "papercraft-paper quilling",
|
| 580 |
+
"thumbnail": "papercraft-paper_quilling.webp"
|
| 581 |
+
},
|
| 582 |
+
{
|
| 583 |
+
"style_description": "papercut collage of, mixed media, textured paper, overlapping, asymmetrical, abstract, vibrant",
|
| 584 |
+
"name": "papercraft-papercut collage",
|
| 585 |
+
"thumbnail": "papercraft-papercut_collage.webp"
|
| 586 |
+
},
|
| 587 |
+
{
|
| 588 |
+
"style_description": "3D papercut shadow box of, layered, dimensional, depth, silhouette, shadow, papercut, handmade, high contrast",
|
| 589 |
+
"name": "papercraft-papercut shadow box",
|
| 590 |
+
"thumbnail": "papercraft-papercut_shadow_box.webp"
|
| 591 |
+
},
|
| 592 |
+
{
|
| 593 |
+
"style_description": "stacked papercut art of, 3D, layered, dimensional, depth, precision cut, stacked layers, papercut, high contrast",
|
| 594 |
+
"name": "papercraft-stacked papercut",
|
| 595 |
+
"thumbnail": "papercraft-stacked_papercut.webp"
|
| 596 |
+
},
|
| 597 |
+
{
|
| 598 |
+
"style_description": "thick layered papercut art of, deep 3D, volumetric, dimensional, depth, thick paper, high stack, heavy texture, tangible layers",
|
| 599 |
+
"name": "papercraft-thick layered papercut",
|
| 600 |
+
"thumbnail": "papercraft-thick_layered_papercut.webp"
|
| 601 |
+
},
|
| 602 |
+
{
|
| 603 |
+
"style_description": "alien-themed, extraterrestrial, cosmic, otherworldly, mysterious, sci-fi, highly detailed",
|
| 604 |
+
"name": "photo-alien",
|
| 605 |
+
"thumbnail": "photo-alien.webp"
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"style_description": "film noir style, monochrome, high contrast, dramatic shadows, 1940s style, mysterious, cinematic",
|
| 609 |
+
"name": "photo-film noir",
|
| 610 |
+
"thumbnail": "photo-film_noir.webp"
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"style_description": "glamorous photo, high fashion, luxurious, extravagant, stylish, sensual, opulent, elegance, stunning beauty, professional, high contrast, detailed",
|
| 614 |
+
"name": "photo-glamour",
|
| 615 |
+
"thumbnail": "photo-glamour.webp"
|
| 616 |
+
},
|
| 617 |
+
{
|
| 618 |
+
"style_description": "HDR photo of, High dynamic range, vivid, rich details, clear shadows and highlights, realistic, intense, enhanced contrast, highly detailed",
|
| 619 |
+
"name": "photo-hdr",
|
| 620 |
+
"thumbnail": "photo-hdr.webp"
|
| 621 |
+
},
|
| 622 |
+
{
|
| 623 |
+
"style_description": "iphone photo, large depth of field, deep depth of field, highly detailed",
|
| 624 |
+
"name": "photo-iphone photographic",
|
| 625 |
+
"thumbnail": "photo-iphone_photographic.webp"
|
| 626 |
+
},
|
| 627 |
+
{
|
| 628 |
+
"style_description": "long exposure photo of, Blurred motion, streaks of light, surreal, dreamy, ghosting effect, highly detailed",
|
| 629 |
+
"name": "photo-long exposure",
|
| 630 |
+
"thumbnail": "photo-long_exposure.webp"
|
| 631 |
+
},
|
| 632 |
+
{
|
| 633 |
+
"style_description": "neon noir, cyberpunk, dark, rainy streets, neon signs, high contrast, low light, vibrant, highly detailed",
|
| 634 |
+
"name": "photo-neon noir",
|
| 635 |
+
"thumbnail": "photo-neon_noir.webp"
|
| 636 |
+
},
|
| 637 |
+
{
|
| 638 |
+
"style_description": "silhouette style, high contrast, minimalistic, black and white, stark, dramatic",
|
| 639 |
+
"name": "photo-silhouette",
|
| 640 |
+
"thumbnail": "photo-silhouette.webp"
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"style_description": "tilt-shift photo of, selective focus, miniature effect, blurred background, highly detailed, vibrant, perspective control",
|
| 644 |
+
"name": "photo-tilt-shift",
|
| 645 |
+
"thumbnail": "photo-tilt-shift.webp"
|
| 646 |
+
},
|
| 647 |
+
{
|
| 648 |
+
"style_description": "UHD, 8K, ultra detailed, a cinematic photograph of, beautiful lighting, great composition",
|
| 649 |
+
"name": "cinematic-diva",
|
| 650 |
+
"thumbnail": "cinematic-diva.webp"
|
| 651 |
+
},
|
| 652 |
+
{
|
| 653 |
+
"style_description": "Abstract Expressionism Art, High contrast, minimalistic, colorful, stark, dramatic, expressionism",
|
| 654 |
+
"name": "Abstract Expressionism",
|
| 655 |
+
"thumbnail": "abstract_expressionism.webp"
|
| 656 |
+
},
|
| 657 |
+
{
|
| 658 |
+
"style_description": "Academia, preppy Ivy League style, stark, dramatic, chic boarding school, academia",
|
| 659 |
+
"name": "Academia",
|
| 660 |
+
"thumbnail": "academia.webp"
|
| 661 |
+
},
|
| 662 |
+
{
|
| 663 |
+
"style_description": "Action Figure, plastic collectable action figure, collectable toy action figure",
|
| 664 |
+
"name": "Action Figure",
|
| 665 |
+
"thumbnail": "action_figure.webp"
|
| 666 |
+
},
|
| 667 |
+
{
|
| 668 |
+
"style_description": "Adorable 3D Character, 3D render, adorable character, 3D art",
|
| 669 |
+
"name": "Adorable 3D Character",
|
| 670 |
+
"thumbnail": "adorable_3d_character.webp"
|
| 671 |
+
},
|
| 672 |
+
{
|
| 673 |
+
"style_description": "Adorable Kawaii, pretty, cute, adorable, kawaii",
|
| 674 |
+
"name": "Adorable Kawaii",
|
| 675 |
+
"thumbnail": "adorable_kawaii.webp"
|
| 676 |
+
},
|
| 677 |
+
{
|
| 678 |
+
"style_description": "Art Deco, sleek, geometric forms, art deco style",
|
| 679 |
+
"name": "Art Deco",
|
| 680 |
+
"thumbnail": "art_deco.webp"
|
| 681 |
+
},
|
| 682 |
+
{
|
| 683 |
+
"style_description": "Art Nouveau, beautiful art, sleek, organic forms, long, sinuous, art nouveau style",
|
| 684 |
+
"name": "Art Nouveau",
|
| 685 |
+
"thumbnail": "art_nouveau.webp"
|
| 686 |
+
},
|
| 687 |
+
{
|
| 688 |
+
"style_description": "Astral Aura, astral, colorful aura, vibrant energy",
|
| 689 |
+
"name": "Astral Aura",
|
| 690 |
+
"thumbnail": "astral_aura.webp"
|
| 691 |
+
},
|
| 692 |
+
{
|
| 693 |
+
"style_description": "Avant-garde, unusual, experimental, avant-garde art",
|
| 694 |
+
"name": "Avant-garde",
|
| 695 |
+
"thumbnail": "avant-garde.webp"
|
| 696 |
+
},
|
| 697 |
+
{
|
| 698 |
+
"style_description": "Baroque, dramatic, exuberant, grandeur, baroque art",
|
| 699 |
+
"name": "Baroque",
|
| 700 |
+
"thumbnail": "baroque.webp"
|
| 701 |
+
},
|
| 702 |
+
{
|
| 703 |
+
"style_description": "Bauhaus-Style Poster, simple geometric shapes, clean lines, primary colors, Bauhaus-Style Poster",
|
| 704 |
+
"name": "Bauhaus-Style Poster",
|
| 705 |
+
"thumbnail": "bauhaus-style_poster.webp"
|
| 706 |
+
},
|
| 707 |
+
{
|
| 708 |
+
"style_description": "Blueprint Schematic Drawing, technical drawing, blueprint, schematic",
|
| 709 |
+
"name": "Blueprint Schematic Drawing",
|
| 710 |
+
"thumbnail": "blueprint_schematic_drawing.webp"
|
| 711 |
+
},
|
| 712 |
+
{
|
| 713 |
+
"style_description": "Caricature, exaggerated, comical, caricature",
|
| 714 |
+
"name": "Caricature",
|
| 715 |
+
"thumbnail": "caricature.webp"
|
| 716 |
+
},
|
| 717 |
+
{
|
| 718 |
+
"style_description": "Cel Shaded Art, 2D, flat color, toon shading, cel shaded style",
|
| 719 |
+
"name": "Cel Shaded Art",
|
| 720 |
+
"thumbnail": "cel_shaded_art.webp"
|
| 721 |
+
},
|
| 722 |
+
{
|
| 723 |
+
"style_description": "Character Design Sheet, character reference sheet, character turn around",
|
| 724 |
+
"name": "Character Design Sheet",
|
| 725 |
+
"thumbnail": "character_design_sheet.webp"
|
| 726 |
+
},
|
| 727 |
+
{
|
| 728 |
+
"style_description": "Classicism Art, inspired by Roman and Greek culture, clarity, harmonious, classicism art",
|
| 729 |
+
"name": "Classicism Art",
|
| 730 |
+
"thumbnail": "classicism_art.webp"
|
| 731 |
+
},
|
| 732 |
+
{
|
| 733 |
+
"style_description": "Color Field Painting, abstract, simple, geometric, color field painting style",
|
| 734 |
+
"name": "Color Field Painting",
|
| 735 |
+
"thumbnail": "color_field_painting.webp"
|
| 736 |
+
},
|
| 737 |
+
{
|
| 738 |
+
"style_description": "Colored Pencil Art, colored pencil strokes, light color, visible paper texture, colored pencil art",
|
| 739 |
+
"name": "Colored Pencil Art",
|
| 740 |
+
"thumbnail": "colored_pencil_art.webp"
|
| 741 |
+
},
|
| 742 |
+
{
|
| 743 |
+
"style_description": "Conceptual Art, concept art",
|
| 744 |
+
"name": "Conceptual Art",
|
| 745 |
+
"thumbnail": "conceptual_art.webp"
|
| 746 |
+
},
|
| 747 |
+
{
|
| 748 |
+
"style_description": "Constructivism Art, minimalistic, geometric forms, constructivism art",
|
| 749 |
+
"name": "Constructivism",
|
| 750 |
+
"thumbnail": "constructivism.webp"
|
| 751 |
+
},
|
| 752 |
+
{
|
| 753 |
+
"style_description": "Cubism Art, flat geometric forms, cubism art",
|
| 754 |
+
"name": "Cubism",
|
| 755 |
+
"thumbnail": "cubism.webp"
|
| 756 |
+
},
|
| 757 |
+
{
|
| 758 |
+
"style_description": "Dadaism Art, satirical, nonsensical, dadaism art",
|
| 759 |
+
"name": "Dadaism",
|
| 760 |
+
"thumbnail": "dadaism.webp"
|
| 761 |
+
},
|
| 762 |
+
{
|
| 763 |
+
"style_description": "Dark Fantasy Art, dark, moody, dark fantasy style",
|
| 764 |
+
"name": "Dark Fantasy",
|
| 765 |
+
"thumbnail": "dark_fantasy.webp"
|
| 766 |
+
},
|
| 767 |
+
{
|
| 768 |
+
"style_description": "Dark Moody Atmosphere, dramatic, mysterious, dark moody atmosphere",
|
| 769 |
+
"name": "Dark Moody Atmosphere",
|
| 770 |
+
"thumbnail": "dark_moody_atmosphere.webp"
|
| 771 |
+
},
|
| 772 |
+
{
|
| 773 |
+
"style_description": "DMT Art Style, bright colors, surreal visuals, swirling patterns, DMT art style",
|
| 774 |
+
"name": "DMT Art Style",
|
| 775 |
+
"thumbnail": "dmt_art_style.webp"
|
| 776 |
+
},
|
| 777 |
+
{
|
| 778 |
+
"style_description": "Doodle Art Style, drawing, freeform, swirling patterns, doodle art style",
|
| 779 |
+
"name": "Doodle Art",
|
| 780 |
+
"thumbnail": "doodle_art.webp"
|
| 781 |
+
},
|
| 782 |
+
{
|
| 783 |
+
"style_description": "Double Exposure Style, double image ghost effect, image combination, double exposure style",
|
| 784 |
+
"name": "Double Exposure",
|
| 785 |
+
"thumbnail": "double_exposure.webp"
|
| 786 |
+
},
|
| 787 |
+
{
|
| 788 |
+
"style_description": "Dripping Paint Splatter Art, dramatic, paint drips, splatters, dripping paint",
|
| 789 |
+
"name": "Dripping Paint Splatter Art",
|
| 790 |
+
"thumbnail": "dripping_paint_splatter_art.webp"
|
| 791 |
+
},
|
| 792 |
+
{
|
| 793 |
+
"style_description": "Expressionism Art Style, movement, contrast, emotional, exaggerated forms, expressionism art style",
|
| 794 |
+
"name": "Expressionism",
|
| 795 |
+
"thumbnail": "expressionism.webp"
|
| 796 |
+
},
|
| 797 |
+
{
|
| 798 |
+
"style_description": "Faded Polaroid Photo, analog, old faded photo, old polaroid",
|
| 799 |
+
"name": "Faded Polaroid Photo",
|
| 800 |
+
"thumbnail": "faded_polaroid_photo.webp"
|
| 801 |
+
},
|
| 802 |
+
{
|
| 803 |
+
"style_description": "Fauvism Art, painterly, bold colors, textured brushwork, fauvism art",
|
| 804 |
+
"name": "Fauvism",
|
| 805 |
+
"thumbnail": "fauvism.webp"
|
| 806 |
+
},
|
| 807 |
+
{
|
| 808 |
+
"style_description": "Flat 2D Art, simple flat color, 2-dimensional, Flat 2D Art Style",
|
| 809 |
+
"name": "Flat 2D Art",
|
| 810 |
+
"thumbnail": "flat_2d_art.webp"
|
| 811 |
+
},
|
| 812 |
+
{
|
| 813 |
+
"style_description": "Fortnite Art Style, 3D cartoon, colorful, Fortnite Art Style",
|
| 814 |
+
"name": "Fortnite Art Style",
|
| 815 |
+
"thumbnail": "fortnite_art_style.webp"
|
| 816 |
+
},
|
| 817 |
+
{
|
| 818 |
+
"style_description": "Futurism Art Style, dynamic, dramatic, Futurism Art Style",
|
| 819 |
+
"name": "Futurism",
|
| 820 |
+
"thumbnail": "futurism.webp"
|
| 821 |
+
},
|
| 822 |
+
{
|
| 823 |
+
"style_description": "Glitchcore Art Style, dynamic, dramatic, distorted, vibrant colors, glitchcore art style",
|
| 824 |
+
"name": "Glitchcore",
|
| 825 |
+
"thumbnail": "glitchcore.webp"
|
| 826 |
+
},
|
| 827 |
+
{
|
| 828 |
+
"style_description": "Glo-fi Art Style, dynamic, dramatic, vibrant colors, glo-fi art style",
|
| 829 |
+
"name": "Glo-fi",
|
| 830 |
+
"thumbnail": "glo-fi.webp"
|
| 831 |
+
},
|
| 832 |
+
{
|
| 833 |
+
"style_description": "Googie Art Style, dynamic, dramatic, 1950's futurism, bold boomerang angles, Googie art style",
|
| 834 |
+
"name": "Googie Art Style",
|
| 835 |
+
"thumbnail": "googie_art_style.webp"
|
| 836 |
+
},
|
| 837 |
+
{
|
| 838 |
+
"style_description": "Graffiti Art Style, dynamic, dramatic, vibrant colors, graffiti art style",
|
| 839 |
+
"name": "Graffiti Art",
|
| 840 |
+
"thumbnail": "graffiti_art.webp"
|
| 841 |
+
},
|
| 842 |
+
{
|
| 843 |
+
"style_description": "Harlem Renaissance Art Style, dynamic, dramatic, 1920s African American culture, Harlem Renaissance art style",
|
| 844 |
+
"name": "Harlem Renaissance Art",
|
| 845 |
+
"thumbnail": "harlem_renaissance_art.webp"
|
| 846 |
+
},
|
| 847 |
+
{
|
| 848 |
+
"style_description": "High Fashion, dynamic, dramatic, haute couture, elegant, ornate clothing, High Fashion",
|
| 849 |
+
"name": "High Fashion",
|
| 850 |
+
"thumbnail": "high_fashion.webp"
|
| 851 |
+
},
|
| 852 |
+
{
|
| 853 |
+
"style_description": "Idyllic, peaceful, happy, pleasant, harmonious, picturesque, charming",
|
| 854 |
+
"name": "Idyllic",
|
| 855 |
+
"thumbnail": "idyllic.webp"
|
| 856 |
+
},
|
| 857 |
+
{
|
| 858 |
+
"style_description": "Impressionism, painterly, small brushstrokes, visible brushstrokes, impressionistic style",
|
| 859 |
+
"name": "Impressionism",
|
| 860 |
+
"thumbnail": "impressionism.webp"
|
| 861 |
+
},
|
| 862 |
+
{
|
| 863 |
+
"style_description": "Infographic Drawing, diagram, infographic",
|
| 864 |
+
"name": "Infographic Drawing",
|
| 865 |
+
"thumbnail": "infographic_drawing.webp"
|
| 866 |
+
},
|
| 867 |
+
{
|
| 868 |
+
"style_description": "Ink Dripping Drawing, ink drawing, dripping ink",
|
| 869 |
+
"name": "Ink Dripping Drawing",
|
| 870 |
+
"thumbnail": "ink_dripping_drawing.webp"
|
| 871 |
+
},
|
| 872 |
+
{
|
| 873 |
+
"style_description": "Japanese Ink Drawing, ink drawing, inkwash, Japanese Ink Drawing",
|
| 874 |
+
"name": "Japanese Ink Drawing",
|
| 875 |
+
"thumbnail": "japanese_ink_drawing.webp"
|
| 876 |
+
},
|
| 877 |
+
{
|
| 878 |
+
"style_description": "Knolling Photography, flat lay photography, object arrangement, knolling photography",
|
| 879 |
+
"name": "Knolling Photography",
|
| 880 |
+
"thumbnail": "knolling_photography.webp"
|
| 881 |
+
},
|
| 882 |
+
{
|
| 883 |
+
"style_description": "Light Cheery Atmosphere, happy, joyful, cheerful, carefree, gleeful, lighthearted, pleasant atmosphere",
|
| 884 |
+
"name": "Light Cheery Atmosphere",
|
| 885 |
+
"thumbnail": "light_cheery_atmosphere.webp"
|
| 886 |
+
},
|
| 887 |
+
{
|
| 888 |
+
"style_description": "Logo Design, dynamic graphic art, vector art, minimalist, professional logo design",
|
| 889 |
+
"name": "Logo Design",
|
| 890 |
+
"thumbnail": "logo_design.webp"
|
| 891 |
+
},
|
| 892 |
+
{
|
| 893 |
+
"style_description": "Luxurious Elegance, extravagant, ornate, designer, opulent, picturesque, lavish",
|
| 894 |
+
"name": "Luxurious Elegance",
|
| 895 |
+
"thumbnail": "luxurious_elegance.webp"
|
| 896 |
+
},
|
| 897 |
+
{
|
| 898 |
+
"style_description": "Macro Photography, close-up, macro 100mm, macro photography",
|
| 899 |
+
"name": "Macro Photography",
|
| 900 |
+
"thumbnail": "macro_photography.webp"
|
| 901 |
+
},
|
| 902 |
+
{
|
| 903 |
+
"style_description": "Mandala art style, complex, circular design, mandala",
|
| 904 |
+
"name": "Mandala Art",
|
| 905 |
+
"thumbnail": "mandala_art.webp"
|
| 906 |
+
},
|
| 907 |
+
{
|
| 908 |
+
"style_description": "Marker Drawing, bold marker lines, visible paper texture, marker drawing",
|
| 909 |
+
"name": "Marker Drawing",
|
| 910 |
+
"thumbnail": "marker_drawing.webp"
|
| 911 |
+
},
|
| 912 |
+
{
|
| 913 |
+
"style_description": "Medievalism, inspired by The Middle Ages, medieval art, elaborate patterns and decoration, Medievalism",
|
| 914 |
+
"name": "Medievalism",
|
| 915 |
+
"thumbnail": "medievalism.webp"
|
| 916 |
+
},
|
| 917 |
+
{
|
| 918 |
+
"style_description": "Minimalism, abstract, simple geometric shapes, hard edges, sleek contours, Minimalism",
|
| 919 |
+
"name": "Minimalism",
|
| 920 |
+
"thumbnail": "minimalism.webp"
|
| 921 |
+
},
|
| 922 |
+
{
|
| 923 |
+
"style_description": "Neo-Baroque, ornate and elaborate, dynamic, Neo-Baroque",
|
| 924 |
+
"name": "Neo-Baroque",
|
| 925 |
+
"thumbnail": "neo-baroque.webp"
|
| 926 |
+
},
|
| 927 |
+
{
|
| 928 |
+
"style_description": "Neo-Byzantine, grand decorative religious style, Orthodox Christian inspired, Neo-Byzantine",
|
| 929 |
+
"name": "Neo-Byzantine",
|
| 930 |
+
"thumbnail": "neo-byzantine.webp"
|
| 931 |
+
},
|
| 932 |
+
{
|
| 933 |
+
"style_description": "Neo-Futurism, high-tech, curves, spirals, flowing lines, idealistic future, Neo-Futurism",
|
| 934 |
+
"name": "Neo-Futurism",
|
| 935 |
+
"thumbnail": "neo-futurism.webp"
|
| 936 |
+
},
|
| 937 |
+
{
|
| 938 |
+
"style_description": "Neo-Impressionism, tiny dabs of color, Pointillism, painterly, Neo-Impressionism",
|
| 939 |
+
"name": "Neo-Impressionism",
|
| 940 |
+
"thumbnail": "neo-impressionism.webp"
|
| 941 |
+
},
|
| 942 |
+
{
|
| 943 |
+
"style_description": "Neo-Rococo, curved forms, naturalistic ornamentation, elaborate, decorative, gaudy, Neo-Rococo",
|
| 944 |
+
"name": "Neo-Rococo",
|
| 945 |
+
"thumbnail": "neo-rococo.webp"
|
| 946 |
+
},
|
| 947 |
+
{
|
| 948 |
+
"style_description": "Neoclassicism, ancient Rome and Greece inspired, idyllic, sober colors, Neoclassicism",
|
| 949 |
+
"name": "Neoclassicism",
|
| 950 |
+
"thumbnail": "neoclassicism.webp"
|
| 951 |
+
},
|
| 952 |
+
{
|
| 953 |
+
"style_description": "Op Art, optical illusion, abstract, geometric pattern, impression of movement, Op Art",
|
| 954 |
+
"name": "Op Art",
|
| 955 |
+
"thumbnail": "op_art.webp"
|
| 956 |
+
},
|
| 957 |
+
{
|
| 958 |
+
"style_description": "Ornate and Intricate, decorative, highly detailed, elaborate, ornate, intricate",
|
| 959 |
+
"name": "Ornate and Intricate",
|
| 960 |
+
"thumbnail": "ornate_and_intricate.webp"
|
| 961 |
+
},
|
| 962 |
+
{
|
| 963 |
+
"style_description": "Pencil Sketch Drawing, black and white drawing, graphite drawing",
|
| 964 |
+
"name": "Pencil Sketch Drawing",
|
| 965 |
+
"thumbnail": "pencil_sketch_drawing.webp"
|
| 966 |
+
},
|
| 967 |
+
{
|
| 968 |
+
"style_description": "Pop Art, vivid colors, flat color, 2D, strong lines, Pop Art",
|
| 969 |
+
"name": "Pop Art 2",
|
| 970 |
+
"thumbnail": "pop_art_2.webp"
|
| 971 |
+
},
|
| 972 |
+
{
|
| 973 |
+
"style_description": "Rococo, flamboyant, pastel colors, curved lines, elaborate detail, Rococo",
|
| 974 |
+
"name": "Rococo",
|
| 975 |
+
"thumbnail": "rococo.webp"
|
| 976 |
+
},
|
| 977 |
+
{
|
| 978 |
+
"style_description": "Silhouette Art, high contrast, well defined, Silhouette Art",
|
| 979 |
+
"name": "Silhouette Art",
|
| 980 |
+
"thumbnail": "silhouette_art.webp"
|
| 981 |
+
},
|
| 982 |
+
{
|
| 983 |
+
"style_description": "Simple Vector Art, 2D flat, simple shapes, minimalistic, professional graphic, flat color, high contrast, Simple Vector Art",
|
| 984 |
+
"name": "Simple Vector Art",
|
| 985 |
+
"thumbnail": "simple_vector_art.webp"
|
| 986 |
+
},
|
| 987 |
+
{
|
| 988 |
+
"style_description": "Sketchup, CAD, professional design, Sketchup",
|
| 989 |
+
"name": "Sketchup",
|
| 990 |
+
"thumbnail": "sketchup.webp"
|
| 991 |
+
},
|
| 992 |
+
{
|
| 993 |
+
"style_description": "Steampunk, retrofuturistic science fantasy, steam-powered tech, vintage industry, gears, neo-victorian, steampunk",
|
| 994 |
+
"name": "Steampunk 2",
|
| 995 |
+
"thumbnail": "steampunk_2.webp"
|
| 996 |
+
},
|
| 997 |
+
{
|
| 998 |
+
"style_description": "Surrealism, expressive, dramatic, organic lines and forms, dreamlike and mysterious, Surrealism",
|
| 999 |
+
"name": "Surrealism",
|
| 1000 |
+
"thumbnail": "surrealism.webp"
|
| 1001 |
+
},
|
| 1002 |
+
{
|
| 1003 |
+
"style_description": "Suprematism, abstract, limited color palette, geometric forms, Suprematism",
|
| 1004 |
+
"name": "Suprematism",
|
| 1005 |
+
"thumbnail": "suprematism.webp"
|
| 1006 |
+
},
|
| 1007 |
+
{
|
| 1008 |
+
"style_description": "Terragen, beautiful massive landscape, epic scenery, Terragen",
|
| 1009 |
+
"name": "Terragen",
|
| 1010 |
+
"thumbnail": "terragen.webp"
|
| 1011 |
+
},
|
| 1012 |
+
{
|
| 1013 |
+
"style_description": "Tranquil Relaxing Atmosphere, calming style, soothing colors, peaceful, idyllic, Tranquil Relaxing Atmosphere",
|
| 1014 |
+
"name": "Tranquil Relaxing Atmosphere",
|
| 1015 |
+
"thumbnail": "tranquil_relaxing_atmosphere.webp"
|
| 1016 |
+
},
|
| 1017 |
+
{
|
| 1018 |
+
"style_description": "Vector Art Stickers, professional vector design, sticker designs, Sticker Sheet",
|
| 1019 |
+
"name": "Sticker Designs",
|
| 1020 |
+
"thumbnail": "sticker_designs.webp"
|
| 1021 |
+
},
|
| 1022 |
+
{
|
| 1023 |
+
"style_description": "Vibrant Rim Light, bright rim light, high contrast, bold edge light",
|
| 1024 |
+
"name": "Vibrant Rim Light",
|
| 1025 |
+
"thumbnail": "vibrant_rim_light.webp"
|
| 1026 |
+
},
|
| 1027 |
+
{
|
| 1028 |
+
"style_description": "Volumetric Lighting, light depth, dramatic atmospheric lighting, Volumetric Lighting",
|
| 1029 |
+
"name": "Volumetric Lighting",
|
| 1030 |
+
"thumbnail": "volumetric_lighting.webp"
|
| 1031 |
+
},
|
| 1032 |
+
{
|
| 1033 |
+
"style_description": "Watercolor style painting, visible paper texture, colorwash, watercolor",
|
| 1034 |
+
"name": "Watercolor 2",
|
| 1035 |
+
"thumbnail": "watercolor_2.webp"
|
| 1036 |
+
},
|
| 1037 |
+
{
|
| 1038 |
+
"style_description": "Whimsical and Playful, imaginative, fantastical, bright colors, stylized, happy, Whimsical and Playful",
|
| 1039 |
+
"name": "Whimsical and Playful",
|
| 1040 |
+
"thumbnail": "whimsical_and_playful.webp"
|
| 1041 |
+
},
|
| 1042 |
+
{
|
| 1043 |
+
"style_description": "Chromolithograph, Vibrant colors, intricate details, rich color saturation, meticulous registration, multi-layered printing, decorative elements, historical charm, artistic reproductions, commercial posters, nostalgic, ornate compositions.",
|
| 1044 |
+
"name": "MK Chromolithography",
|
| 1045 |
+
"thumbnail": "mk_chromolithography.webp"
|
| 1046 |
+
},
|
| 1047 |
+
{
|
| 1048 |
+
"style_description": "Cross processing print, Experimental color shifts, unconventional tonalities, vibrant and surreal hues, heightened contrasts, unpredictable results, artistic unpredictability, retro and vintage feel, dynamic color interplay, abstract and dreamlike.",
|
| 1049 |
+
"name": "MK Cross Processing Print",
|
| 1050 |
+
"thumbnail": "mk_cross_processing_print.webp"
|
| 1051 |
+
},
|
| 1052 |
+
{
|
| 1053 |
+
"style_description": "Dufaycolor photograph, Vintage color palette, distinctive color rendering, soft and dreamy atmosphere, historical charm, unique color process, grainy texture, evocative mood, nostalgic aesthetic, hand-tinted appearance, artistic patina.",
|
| 1054 |
+
"name": "MK Dufaycolor Photograph",
|
| 1055 |
+
"thumbnail": "mk_dufaycolor_photograph.webp"
|
| 1056 |
+
},
|
| 1057 |
+
{
|
| 1058 |
+
"style_description": "Herbarium drawing. Botanical accuracy, old botanical book illustration, detailed illustrations, pressed plants, delicate and precise linework, scientific documentation, meticulous presentation, educational purpose, organic compositions, timeless aesthetic, naturalistic beauty.",
|
| 1059 |
+
"name": "MK Herbarium",
|
| 1060 |
+
"thumbnail": "mk_herbarium.webp"
|
| 1061 |
+
},
|
| 1062 |
+
{
|
| 1063 |
+
"style_description": "punk collage style, mixed media, papercut, textured paper, overlapping, ripped posters, safety pins, chaotic layers, graffiti-style elements, anarchy symbols, vintage photos, cut-and-paste aesthetic, bold typography, distorted images, political messages, urban decay, distressed textures, newspaper clippings, spray paint, rebellious icons, DIY spirit, vivid colors, punk band logos, edgy and raw compositions",
|
| 1064 |
+
"name": "MK Punk Collage",
|
| 1065 |
+
"thumbnail": "mk_punk_collage.webp"
|
| 1066 |
+
},
|
| 1067 |
+
{
|
| 1068 |
+
"style_description": "mosaic style, fragmented, assembled, colorful, highly detailed",
|
| 1069 |
+
"name": "MK mosaic",
|
| 1070 |
+
"thumbnail": "mk_mosaic.webp"
|
| 1071 |
+
},
|
| 1072 |
+
{
|
| 1073 |
+
"style_description": "Oil painting by Van Gogh, Expressive, impasto, swirling brushwork, vibrant, brush strokes, Brushstroke-heavy, Textured, Impasto, Colorful, Dynamic, Bold, Distinctive, Vibrant, Whirling, Expressive, Dramatic, Swirling, Layered, Intense, Contrastive, Atmospheric, Luminous, Textural, Evocative, Spiraled, Van Gogh style",
|
| 1074 |
+
"name": "MK Van Gogh",
|
| 1075 |
+
"thumbnail": "mk_van_gogh.webp"
|
| 1076 |
+
},
|
| 1077 |
+
{
|
| 1078 |
+
"style_description": "centered black and white high contrast line drawing, coloring book style, monochrome, blank white background",
|
| 1079 |
+
"name": "MK Coloring Book",
|
| 1080 |
+
"thumbnail": "mk_coloring_book.webp"
|
| 1081 |
+
},
|
| 1082 |
+
{
|
| 1083 |
+
"style_description": "Oil painting by John Singer Sargent, Elegant, refined, masterful technique, realistic portrayal, subtle play of light, captivating expression, rich details, harmonious colors, skillful composition, brush strokes, chiaroscuro.",
|
| 1084 |
+
"name": "MK Singer Sargent",
|
| 1085 |
+
"thumbnail": "mk_singer_sargent.webp"
|
| 1086 |
+
},
|
| 1087 |
+
{
|
| 1088 |
+
"style_description": "Oil painting by Jackson Pollock, Abstract expressionism, drip painting, chaotic composition, energetic, spontaneous, unconventional technique, dynamic, bold, distinctive, vibrant, intense, expressive, energetic, layered, non-representational, gestural.",
|
| 1089 |
+
"name": "MK Pollock",
|
| 1090 |
+
"thumbnail": "mk_pollock.webp"
|
| 1091 |
+
},
|
| 1092 |
+
{
|
| 1093 |
+
"style_description": "Artwork by Jean-Michel Basquiat, Neo-expressionism, street art influence, graffiti-inspired, raw, energetic, bold colors, dynamic composition, chaotic, layered, textural, expressive, spontaneous, distinctive, symbolic, energetic brushstrokes.",
|
| 1094 |
+
"name": "MK Basquiat",
|
| 1095 |
+
"thumbnail": "mk_basquiat.webp"
|
| 1096 |
+
},
|
| 1097 |
+
{
|
| 1098 |
+
"style_description": "Artwork in the style of Andy Warhol, Pop art, vibrant colors, bold compositions, repetition of iconic imagery, celebrity culture, commercial aesthetics, mass production influence, stylized simplicity, cultural commentary, graphical elements, distinctive portraits.",
|
| 1099 |
+
"name": "MK Andy Warhol",
|
| 1100 |
+
"thumbnail": "mk_andy_warhol.webp"
|
| 1101 |
+
},
|
| 1102 |
+
{
|
| 1103 |
+
"style_description": "Halftone print of, Dot matrix pattern, grayscale tones, vintage aesthetic, newspaper print vibe, stylized dots, visual texture, black and white contrasts, retro appearance, artistic pointillism,pop culture, (Roy Lichtenstein style:1.5).",
|
| 1104 |
+
"name": "MK Halftone print",
|
| 1105 |
+
"thumbnail": "mk_halftone_print.webp"
|
| 1106 |
+
},
|
| 1107 |
+
{
|
| 1108 |
+
"style_description": "Gond painting, Intricate patterns, vibrant colors, detailed motifs, nature-inspired themes, tribal folklore, fine lines, intricate detailing, storytelling compositions, mystical and folkloric, cultural richness.",
|
| 1109 |
+
"name": "MK Gond Painting",
|
| 1110 |
+
"thumbnail": "mk_gond_painting.webp"
|
| 1111 |
+
},
|
| 1112 |
+
{
|
| 1113 |
+
"style_description": "Albumen print, Sepia tones, fine details, subtle tonal gradations, delicate highlights, vintage aesthetic, soft and muted atmosphere, historical charm, rich textures, meticulous craftsmanship, classic photographic technique, vignetting.",
|
| 1114 |
+
"name": "MK Albumen Print",
|
| 1115 |
+
"thumbnail": "mk_albumen_print.webp"
|
| 1116 |
+
},
|
| 1117 |
+
{
|
| 1118 |
+
"style_description": "Aquatint print, Soft tonal gradations, atmospheric effects, velvety textures, rich contrasts, fine details, etching process, delicate lines, nuanced shading, expressive and moody atmosphere, artistic depth.",
|
| 1119 |
+
"name": "MK Aquatint Print",
|
| 1120 |
+
"thumbnail": "mk_aquatint_print.webp"
|
| 1121 |
+
},
|
| 1122 |
+
{
|
| 1123 |
+
"style_description": "Anthotype print, Monochrome dye, soft and muted colors, organic textures, ephemeral and delicate appearance, low details, watercolor canvas, low contrast, overexposed, silhouette, textured paper.",
|
| 1124 |
+
"name": "MK Anthotype Print",
|
| 1125 |
+
"thumbnail": "mk_anthotype_print.webp"
|
| 1126 |
+
},
|
| 1127 |
+
{
|
| 1128 |
+
"style_description": "A sculpture made of ivory, made of, Sculptures, Inuit art style, intricate carvings, natural materials, storytelling motifs, arctic wildlife themes, symbolic representations, cultural traditions, earthy tones, harmonious compositions, spiritual and mythological elements.",
|
| 1129 |
+
"name": "MK Inuit Carving",
|
| 1130 |
+
"thumbnail": "mk_inuit_carving.webp"
|
| 1131 |
+
},
|
| 1132 |
+
{
|
| 1133 |
+
"style_description": "Bromoil print, Painterly effects, sepia tones, textured surfaces, rich contrasts, expressive brushwork, tonal variations, vintage aesthetic, atmospheric mood, handmade quality, artistic experimentation, darkroom craftsmanship, vignetting.",
|
| 1134 |
+
"name": "MK Bromoil Print",
|
| 1135 |
+
"thumbnail": "mk_bromoil_print.webp"
|
| 1136 |
+
},
|
| 1137 |
+
{
|
| 1138 |
+
"style_description": "Calotype print, Soft focus, subtle tonal range, paper negative process, fine details, vintage aesthetic, artistic experimentation, atmospheric mood, early photographic charm, handmade quality, vignetting.",
|
| 1139 |
+
"name": "MK Calotype Print",
|
| 1140 |
+
"thumbnail": "mk_calotype_print.webp"
|
| 1141 |
+
},
|
| 1142 |
+
{
|
| 1143 |
+
"style_description": "Color sketchnote, Hand-drawn elements, vibrant colors, visual hierarchy, playful illustrations, varied typography, graphic icons, organic and dynamic layout, personalized touches, creative expression, engaging storytelling.",
|
| 1144 |
+
"name": "MK Color Sketchnote",
|
| 1145 |
+
"thumbnail": "mk_color_sketchnote.webp"
|
| 1146 |
+
},
|
| 1147 |
+
{
|
| 1148 |
+
"style_description": "A sculpture made of blue pattern porcelain of, Classic design, blue and white color scheme, intricate detailing, floral motifs, onion-shaped elements, historical charm, rococo, white ware, cobalt blue, underglaze pattern, fine craftsmanship, traditional elegance, delicate patterns, vintage aesthetic, Meissen, Blue Onion pattern, Cibulak.",
|
| 1149 |
+
"name": "MK Cibulak Porcelain",
|
| 1150 |
+
"thumbnail": "mk_cibulak_porcelain.webp"
|
| 1151 |
+
},
|
| 1152 |
+
{
|
| 1153 |
+
"style_description": "Alcohol ink art, Fluid and vibrant colors, unpredictable patterns, organic textures, translucent layers, abstract compositions, ethereal and dreamy effects, free-flowing movement, expressive brushstrokes, contemporary aesthetic, wet textured paper.",
|
| 1154 |
+
"name": "MK Alcohol Ink Art",
|
| 1155 |
+
"thumbnail": "mk_alcohol_ink_art.webp"
|
| 1156 |
+
},
|
| 1157 |
+
{
|
| 1158 |
+
"style_description": "One line art, Continuous and unbroken black line, minimalistic, simplicity, economical use of space, flowing and dynamic, symbolic representations, contemporary aesthetic, evocative and abstract, white background.",
|
| 1159 |
+
"name": "MK One Line Art",
|
| 1160 |
+
"thumbnail": "mk_one_line_art.webp"
|
| 1161 |
+
},
|
| 1162 |
+
{
|
| 1163 |
+
"style_description": "Blacklight paint, Fluorescent pigments, vibrant and surreal colors, ethereal glow, otherworldly effects, dynamic and psychedelic compositions, neon aesthetics, transformative in ultraviolet light, contemporary and experimental.",
|
| 1164 |
+
"name": "MK Blacklight Paint",
|
| 1165 |
+
"thumbnail": "mk_blacklight_paint.webp"
|
| 1166 |
+
},
|
| 1167 |
+
{
|
| 1168 |
+
"style_description": "A sculpture made of Carnival glass, Iridescent surfaces, vibrant colors, intricate patterns, opalescent hues, reflective and prismatic effects, Art Nouveau and Art Deco influences, vintage charm, intricate detailing, lustrous and luminous appearance, Carnival Glass style.",
|
| 1169 |
+
"name": "MK Carnival Glass",
|
| 1170 |
+
"thumbnail": "mk_carnival_glass.webp"
|
| 1171 |
+
},
|
| 1172 |
+
{
|
| 1173 |
+
"style_description": "Cyanotype print, Prussian blue tones, distinctive coloration, high contrast, blueprint aesthetics, atmospheric mood, sun-exposed paper, silhouette effects, delicate details, historical charm, handmade and experimental quality.",
|
| 1174 |
+
"name": "MK Cyanotype Print",
|
| 1175 |
+
"thumbnail": "mk_cyanotype_print.webp"
|
| 1176 |
+
},
|
| 1177 |
+
{
|
| 1178 |
+
"style_description": "Cross-stitching, Intricate patterns, embroidery thread, sewing, fine details, precise stitches, textile artistry, symmetrical designs, varied color palette, traditional and contemporary motifs, handmade and crafted,canvas, nostalgic charm.",
|
| 1179 |
+
"name": "MK Cross-Stitching",
|
| 1180 |
+
"thumbnail": "mk_cross-stitching.webp"
|
| 1181 |
+
},
|
| 1182 |
+
{
|
| 1183 |
+
"style_description": "Encaustic paint, Textured surfaces, translucent layers, luminous quality, wax medium, rich color saturation, fluid and organic shapes, contemporary and historical influences, mixed media elements, atmospheric depth.",
|
| 1184 |
+
"name": "MK Encaustic Paint",
|
| 1185 |
+
"thumbnail": "mk_encaustic_paint.webp"
|
| 1186 |
+
},
|
| 1187 |
+
{
|
| 1188 |
+
"style_description": "Embroidery, Intricate stitching, embroidery thread, fine details, varied thread textures, textile artistry, embellished surfaces, diverse color palette, traditional and contemporary motifs, handmade and crafted, tactile and ornate.",
|
| 1189 |
+
"name": "MK Embroidery",
|
| 1190 |
+
"thumbnail": "mk_embroidery.webp"
|
| 1191 |
+
},
|
| 1192 |
+
{
|
| 1193 |
+
"style_description": "Gyotaku, Fish impressions, realistic details, ink rubbings, textured surfaces, traditional Japanese art form, nature-inspired compositions, artistic representation of marine life, black and white contrasts, cultural significance.",
|
| 1194 |
+
"name": "MK Gyotaku",
|
| 1195 |
+
"thumbnail": "mk_gyotaku.webp"
|
| 1196 |
+
},
|
| 1197 |
+
{
|
| 1198 |
+
"style_description": "Luminogram, Photogram technique, ethereal and abstract effects, light and shadow interplay, luminous quality, experimental process, direct light exposure, unique and unpredictable results, artistic experimentation.",
|
| 1199 |
+
"name": "MK Luminogram",
|
| 1200 |
+
"thumbnail": "mk_luminogram.webp"
|
| 1201 |
+
},
|
| 1202 |
+
{
|
| 1203 |
+
"style_description": "Lite Brite art, Luminous and colorful designs, pixelated compositions, retro aesthetic, glowing effects, creative patterns, interactive and playful, nostalgic charm, vibrant and dynamic arrangements.",
|
| 1204 |
+
"name": "MK Lite Brite Art",
|
| 1205 |
+
"thumbnail": "mk_lite_brite_art.webp"
|
| 1206 |
+
},
|
| 1207 |
+
{
|
| 1208 |
+
"style_description": "Mokume-gane, Wood-grain patterns, mixed metal layers, intricate and organic designs, traditional Japanese metalwork, harmonious color combinations, artisanal craftsmanship, unique and layered textures, cultural and historical significance.",
|
| 1209 |
+
"name": "MK Mokume-gane",
|
| 1210 |
+
"thumbnail": "mk_mokume-gane.webp"
|
| 1211 |
+
},
|
| 1212 |
+
{
|
| 1213 |
+
"style_description": "a sculpture made of peebles, Pebble art style,natural materials, textured surfaces, balanced compositions, organic forms, harmonious arrangements, tactile and 3D effects, beach-inspired aesthetic, creative storytelling, artisanal craftsmanship.",
|
| 1214 |
+
"name": "Pebble Art",
|
| 1215 |
+
"thumbnail": "pebble_art.webp"
|
| 1216 |
+
},
|
| 1217 |
+
{
|
| 1218 |
+
"style_description": "Palekh art, Miniature paintings, intricate details, vivid colors, folkloric themes, lacquer finish, storytelling compositions, symbolic elements, Russian folklore influence, cultural and historical significance.",
|
| 1219 |
+
"name": "MK Palekh",
|
| 1220 |
+
"thumbnail": "mk_palekh.webp"
|
| 1221 |
+
},
|
| 1222 |
+
{
|
| 1223 |
+
"style_description": "Suminagashi, Floating ink patterns, marbled effects, delicate and ethereal designs, water-based ink, fluid and unpredictable compositions, meditative process, monochromatic or subtle color palette, Japanese artistic tradition.",
|
| 1224 |
+
"name": "MK Suminagashi",
|
| 1225 |
+
"thumbnail": "mk_suminagashi.webp"
|
| 1226 |
+
},
|
| 1227 |
+
{
|
| 1228 |
+
"style_description": "A Scrimshaw engraving of, Intricate engravings on a spermwhale's teeth, marine motifs, detailed scenes, nautical themes, black and white contrasts, historical craftsmanship, artisanal carving, storytelling compositions, maritime heritage.",
|
| 1229 |
+
"name": "MK Scrimshaw",
|
| 1230 |
+
"thumbnail": "mk_scrimshaw.webp"
|
| 1231 |
+
},
|
| 1232 |
+
{
|
| 1233 |
+
"style_description": "Shibori, Textured fabric, intricate patterns, resist-dyeing technique, indigo or vibrant colors, organic and flowing designs, Japanese textile art, cultural tradition, tactile and visual interest.",
|
| 1234 |
+
"name": "MK Shibori",
|
| 1235 |
+
"thumbnail": "mk_shibori.webp"
|
| 1236 |
+
},
|
| 1237 |
+
{
|
| 1238 |
+
"style_description": "A sculpture made of Vitreous enamel, Smooth and glossy surfaces, vibrant colors, glass-like finish, durable and resilient, intricate detailing, traditional and contemporary applications, artistic craftsmanship, jewelry and decorative objects, Vitreous enamel, colored glass.",
|
| 1239 |
+
"name": "MK Vitreous Enamel",
|
| 1240 |
+
"thumbnail": "mk_vitreous_enamel.webp"
|
| 1241 |
+
},
|
| 1242 |
+
{
|
| 1243 |
+
"style_description": "Ukiyo-e, Woodblock prints, vibrant colors, intricate details, depictions of landscapes, kabuki actors, beautiful women, cultural scenes, traditional Japanese art, artistic craftsmanship, historical significance.",
|
| 1244 |
+
"name": "MK Ukiyo-e",
|
| 1245 |
+
"thumbnail": "mk_ukiyo-e.webp"
|
| 1246 |
+
},
|
| 1247 |
+
{
|
| 1248 |
+
"style_description": "vintage airline poster, classic aviation fonts, pastel colors, elegant aircraft illustrations, scenic destinations, distressed textures, retro travel allure",
|
| 1249 |
+
"name": "MK vintage-airline-poster",
|
| 1250 |
+
"thumbnail": "mk_vintage-airline-poster.webp"
|
| 1251 |
+
},
|
| 1252 |
+
{
|
| 1253 |
+
"style_description": "vintage travel poster, retro fonts, muted colors, scenic illustrations, iconic landmarks, distressed textures, nostalgic vibes",
|
| 1254 |
+
"name": "MK vintage-travel-poster",
|
| 1255 |
+
"thumbnail": "mk_vintage-travel-poster.webp"
|
| 1256 |
+
},
|
| 1257 |
+
{
|
| 1258 |
+
"style_description": "Bauhaus-inspired, minimalism, geometric precision, primary colors, sans-serif typography, asymmetry, functional design",
|
| 1259 |
+
"name": "MK bauhaus-style",
|
| 1260 |
+
"thumbnail": "mk_bauhaus-style.webp"
|
| 1261 |
+
},
|
| 1262 |
+
{
|
| 1263 |
+
"style_description": "Afrofuturism illustration, vibrant colors, futuristic elements, cultural symbolism, cosmic imagery, dynamic patterns, empowering narratives",
|
| 1264 |
+
"name": "MK afrofuturism",
|
| 1265 |
+
"thumbnail": "mk_afrofuturism.webp"
|
| 1266 |
+
},
|
| 1267 |
+
{
|
| 1268 |
+
"style_description": "Atompunk illustation, retro-futuristic, atomic age aesthetics, sleek lines, metallic textures, futuristic technology, optimism, energy",
|
| 1269 |
+
"name": "MK atompunk",
|
| 1270 |
+
"thumbnail": "mk_atompunk.webp"
|
| 1271 |
+
},
|
| 1272 |
+
{
|
| 1273 |
+
"style_description": "Constructivism, geometric abstraction, bold colors, industrial aesthetics, dynamic compositions, utilitarian design, revolutionary spirit",
|
| 1274 |
+
"name": "MK constructivism",
|
| 1275 |
+
"thumbnail": "mk_constructivism.webp"
|
| 1276 |
+
},
|
| 1277 |
+
{
|
| 1278 |
+
"style_description": "Chicano art, bold colors, cultural symbolism, muralism, lowrider aesthetics, barrio life, political messages, social activism, Mexico",
|
| 1279 |
+
"name": "MK chicano-art",
|
| 1280 |
+
"thumbnail": "mk_chicano-art.webp"
|
| 1281 |
+
},
|
| 1282 |
+
{
|
| 1283 |
+
"style_description": "De Stijl Art, neoplasticism, primary colors, geometric abstraction, horizontal and vertical lines, simplicity, harmony, utopian ideals",
|
| 1284 |
+
"name": "MK de-stijl",
|
| 1285 |
+
"thumbnail": "mk_de-stijl.webp"
|
| 1286 |
+
},
|
| 1287 |
+
{
|
| 1288 |
+
"style_description": "Dayak art sculpture of, intricate patterns, nature-inspired motifs, vibrant colors, traditional craftsmanship, cultural symbolism, storytelling",
|
| 1289 |
+
"name": "MK dayak-art",
|
| 1290 |
+
"thumbnail": "mk_dayak-art.webp"
|
| 1291 |
+
},
|
| 1292 |
+
{
|
| 1293 |
+
"style_description": "Fayum portrait, encaustic painting, realistic facial features, warm earth tones, serene expressions, ancient Egyptian influences",
|
| 1294 |
+
"name": "MK fayum-portrait",
|
| 1295 |
+
"thumbnail": "mk_fayum-portrait.webp"
|
| 1296 |
+
},
|
| 1297 |
+
{
|
| 1298 |
+
"style_description": "Illuminated manuscript, intricate calligraphy, rich colors, detailed illustrations, gold leaf accents, ornate borders, religious, historical, medieval",
|
| 1299 |
+
"name": "MK illuminated-manuscript",
|
| 1300 |
+
"thumbnail": "mk_illuminated-manuscript.webp"
|
| 1301 |
+
},
|
| 1302 |
+
{
|
| 1303 |
+
"style_description": "Kalighat painting, bold lines, vibrant colors, narrative storytelling, cultural motifs, flat compositions, expressive characters",
|
| 1304 |
+
"name": "MK kalighat-painting",
|
| 1305 |
+
"thumbnail": "mk_kalighat-painting.webp"
|
| 1306 |
+
},
|
| 1307 |
+
{
|
| 1308 |
+
"style_description": "Madhubani painting, intricate patterns, vibrant colors, nature-inspired motifs, cultural storytelling, symmetry, folk art aesthetics",
|
| 1309 |
+
"name": "MK madhubani-painting",
|
| 1310 |
+
"thumbnail": "mk_madhubani-painting.webp"
|
| 1311 |
+
},
|
| 1312 |
+
{
|
| 1313 |
+
"style_description": "Pictorialism illustration, soft focus, atmospheric effects, artistic interpretation, tonality, muted colors, evocative storytelling",
|
| 1314 |
+
"name": "MK pictorialism",
|
| 1315 |
+
"thumbnail": "mk_pictorialism.webp"
|
| 1316 |
+
},
|
| 1317 |
+
{
|
| 1318 |
+
"style_description": "Pichwai painting, intricate detailing, vibrant colors, religious themes, nature motifs, devotional storytelling, gold leaf accents",
|
| 1319 |
+
"name": "MK pichwai-painting",
|
| 1320 |
+
"thumbnail": "mk_pichwai-painting.webp"
|
| 1321 |
+
},
|
| 1322 |
+
{
|
| 1323 |
+
"style_description": "Patachitra painting, bold outlines, vibrant colors, intricate detailing, mythological themes, storytelling, traditional craftsmanship",
|
| 1324 |
+
"name": "MK patachitra-painting",
|
| 1325 |
+
"thumbnail": "mk_patachitra-painting.webp"
|
| 1326 |
+
},
|
| 1327 |
+
{
|
| 1328 |
+
"style_description": "Samoan art-inspired wooden sculpture, traditional motifs, natural elements, bold colors, cultural symbolism, storytelling, craftsmanship",
|
| 1329 |
+
"name": "MK samoan-art-inspired",
|
| 1330 |
+
"thumbnail": "mk_samoan-art-inspired.webp"
|
| 1331 |
+
},
|
| 1332 |
+
{
|
| 1333 |
+
"style_description": "Tlingit art, formline design, natural elements, animal motifs, bold colors, cultural storytelling, traditional craftsmanship, Alaska traditional art, (totem:1.5)",
|
| 1334 |
+
"name": "MK tlingit-art",
|
| 1335 |
+
"thumbnail": "mk_tlingit-art.webp"
|
| 1336 |
+
},
|
| 1337 |
+
{
|
| 1338 |
+
"style_description": "Painting by Adnate, realistic portraits, street art, large-scale murals, subdued color palette, social narratives",
|
| 1339 |
+
"name": "MK adnate-style",
|
| 1340 |
+
"thumbnail": "mk_adnate-style.webp"
|
| 1341 |
+
},
|
| 1342 |
+
{
|
| 1343 |
+
"style_description": "Painting by Ron English, pop-surrealism, cultural subversion, iconic mash-ups, vibrant and bold colors, satirical commentary",
|
| 1344 |
+
"name": "MK ron-english-style",
|
| 1345 |
+
"thumbnail": "mk_ron-english-style.webp"
|
| 1346 |
+
},
|
| 1347 |
+
{
|
| 1348 |
+
"style_description": "Painting by Shepard Fairey, street art, political activism, iconic stencils, bold typography, high contrast, red, black, and white color palette",
|
| 1349 |
+
"name": "MK shepard-fairey-style",
|
| 1350 |
+
"thumbnail": "mk_shepard-fairey-style.webp"
|
| 1351 |
+
}
|
| 1352 |
+
]
|
models/hunyuan-foley/config_xl.yaml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_config:
|
| 2 |
+
model_name: HunyuanVideo-Foley-XL
|
| 3 |
+
model_type: 1d
|
| 4 |
+
model_precision: bf16
|
| 5 |
+
model_kwargs:
|
| 6 |
+
depth_triple_blocks: 12
|
| 7 |
+
depth_single_blocks: 24
|
| 8 |
+
hidden_size: 1408
|
| 9 |
+
num_heads: 11
|
| 10 |
+
mlp_ratio: 4
|
| 11 |
+
mlp_act_type: "gelu_tanh"
|
| 12 |
+
qkv_bias: True
|
| 13 |
+
qk_norm: True
|
| 14 |
+
qk_norm_type: "rms"
|
| 15 |
+
attn_mode: "torch"
|
| 16 |
+
embedder_type: "default"
|
| 17 |
+
interleaved_audio_visual_rope: True
|
| 18 |
+
enable_learnable_empty_visual_feat: True
|
| 19 |
+
sync_modulation: False
|
| 20 |
+
add_sync_feat_to_audio: True
|
| 21 |
+
cross_attention: True
|
| 22 |
+
use_attention_mask: False
|
| 23 |
+
condition_projection: "linear"
|
| 24 |
+
sync_feat_dim: 768 # syncformer 768 dim
|
| 25 |
+
condition_dim: 768 # clap 768 text condition dim (clip-text)
|
| 26 |
+
clip_dim: 768 # siglip2 visual dim
|
| 27 |
+
audio_vae_latent_dim: 128
|
| 28 |
+
audio_frame_rate: 50
|
| 29 |
+
patch_size: 1
|
| 30 |
+
rope_dim_list: null
|
| 31 |
+
rope_theta: 10000
|
| 32 |
+
text_length: 77
|
| 33 |
+
clip_length: 64
|
| 34 |
+
sync_length: 192
|
| 35 |
+
depth_triple_ssl_encoder: null
|
| 36 |
+
depth_single_ssl_encoder: 8
|
| 37 |
+
use_repa_with_audiossl: True
|
| 38 |
+
|
| 39 |
+
diffusion_config:
|
| 40 |
+
denoise_type: "flow"
|
| 41 |
+
flow_path_type: "linear"
|
| 42 |
+
flow_predict_type: "velocity"
|
| 43 |
+
flow_reverse: True
|
| 44 |
+
flow_solver: "euler"
|
| 45 |
+
sample_flow_shift: 1.0
|
| 46 |
+
sample_use_flux_shift: False
|
| 47 |
+
flux_base_shift: 0.5
|
| 48 |
+
flux_max_shift: 1.15
|
models/kiwi-edit/.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
processor/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
models/kiwi-edit/README.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: diffusers
|
| 3 |
+
pipeline_tag: image-to-video
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Kiwi-Edit: Versatile Video Editing via Instruction and Reference Guidance
|
| 7 |
+
|
| 8 |
+
Kiwi-Edit is a versatile video editing framework built on an MLLM encoder and a video Diffusion Transformer (DiT). It supports both instruction-based video editing and reference-guided editing (using a reference image and instruction).
|
| 9 |
+
|
| 10 |
+
- **Paper:** [Kiwi-Edit: Versatile Video Editing via Instruction and Reference Guidance](https://huggingface.co/papers/2603.02175)
|
| 11 |
+
- **Project Page:** [https://showlab.github.io/Kiwi-Edit/](https://showlab.github.io/Kiwi-Edit/)
|
| 12 |
+
- **Repository:** [https://github.com/showlab/Kiwi-Edit](https://github.com/showlab/Kiwi-Edit)
|
| 13 |
+
|
| 14 |
+
## Model Description
|
| 15 |
+
|
| 16 |
+
Kiwi-Edit introduces a unified editing architecture that synergizes learnable queries and latent visual features for reference semantic guidance. It addresses the challenge of precise visual control in instruction-based editing by allowing users to provide a reference image to guide the transformation. The framework achieves significant performance improvements in instruction following and reference fidelity through a scalable data generation pipeline and a multi-stage training curriculum.
|
| 17 |
+
|
| 18 |
+
## Usage
|
| 19 |
+
|
| 20 |
+
This model is compatible with the `diffusers` library. To run inference, follow the installation instructions in the [official repository](https://github.com/showlab/Kiwi-Edit).
|
| 21 |
+
|
| 22 |
+
### Quick Test with Diffusers
|
| 23 |
+
|
| 24 |
+
You can run a quick test on a demo video using the following command provided in the repository:
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
python diffusers_demo.py \
|
| 28 |
+
--video_path ./demo_data/video/source/0005e4ad9f49814db1d3f2296b911abf.mp4 \
|
| 29 |
+
--prompt "Remove the monkey." \
|
| 30 |
+
--save_path output.mp4 \
|
| 31 |
+
--model_path linyq/kiwi-edit-5b-instruct-only-diffusers
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## Citation
|
| 35 |
+
|
| 36 |
+
If you find this work useful, please cite:
|
| 37 |
+
|
| 38 |
+
```bibtex
|
| 39 |
+
@misc{kiwiedit,
|
| 40 |
+
title={Kiwi-Edit: Versatile Video Editing via Instruction and Reference Guidance},
|
| 41 |
+
author={Yiqi Lin and Guoqiang Liang and Ziyun Zeng and Zechen Bai and Yanzhe Chen and Mike Zheng Shou},
|
| 42 |
+
year={2026},
|
| 43 |
+
eprint={2603.02175},
|
| 44 |
+
archivePrefix={arXiv},
|
| 45 |
+
primaryClass={cs.CV},
|
| 46 |
+
url={https://arxiv.org/abs/2603.02175},
|
| 47 |
+
}
|
| 48 |
+
```
|
models/kiwi-edit/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pipeline_kiwi_edit import KiwiEditPipeline
|
| 2 |
+
from mllm_encoder import MLLMEncoder
|
| 3 |
+
from conditional_embedder import ConditionalEmbedder
|
| 4 |
+
from wan_video_vae import VAE
|
models/kiwi-edit/conditional_embedder.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from diffusers import ModelMixin, ConfigMixin
|
| 4 |
+
from diffusers.configuration_utils import register_to_config
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ConditionalEmbedder(ModelMixin, ConfigMixin):
|
| 8 |
+
"""
|
| 9 |
+
Patchifies VAE-encoded conditions (source video or reference image)
|
| 10 |
+
into the DiT hidden dimension space via a Conv3d layer.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
@register_to_config
|
| 14 |
+
def __init__(
|
| 15 |
+
self,
|
| 16 |
+
in_dim: int = 48,
|
| 17 |
+
dim: int = 3072,
|
| 18 |
+
patch_size: list = [1, 2, 2],
|
| 19 |
+
zero_init: bool = True,
|
| 20 |
+
ref_pad_first: bool = False,
|
| 21 |
+
):
|
| 22 |
+
super().__init__()
|
| 23 |
+
kernel_size = tuple(patch_size)
|
| 24 |
+
self.patch_embedding = nn.Conv3d(
|
| 25 |
+
in_dim, dim, kernel_size=kernel_size, stride=kernel_size
|
| 26 |
+
)
|
| 27 |
+
self.ref_pad_first = ref_pad_first
|
| 28 |
+
if zero_init:
|
| 29 |
+
nn.init.zeros_(self.patch_embedding.weight)
|
| 30 |
+
nn.init.zeros_(self.patch_embedding.bias)
|
| 31 |
+
|
| 32 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 33 |
+
return self.patch_embedding(x)
|
models/kiwi-edit/mllm_encoder.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/kiwi-edit/model_index.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": [
|
| 3 |
+
"pipeline_kiwi_edit",
|
| 4 |
+
"KiwiEditPipeline"
|
| 5 |
+
],
|
| 6 |
+
"_diffusers_version": "0.32.0",
|
| 7 |
+
"processor": [
|
| 8 |
+
"transformers",
|
| 9 |
+
"AutoProcessor"
|
| 10 |
+
],
|
| 11 |
+
"transformer": [
|
| 12 |
+
"diffusers",
|
| 13 |
+
"WanTransformer3DModel"
|
| 14 |
+
],
|
| 15 |
+
"vae": [
|
| 16 |
+
"wan_video_vae",
|
| 17 |
+
"VAE"
|
| 18 |
+
],
|
| 19 |
+
"scheduler": [
|
| 20 |
+
"diffusers",
|
| 21 |
+
"FlowMatchEulerDiscreteScheduler"
|
| 22 |
+
],
|
| 23 |
+
"mllm_encoder": [
|
| 24 |
+
"mllm_encoder",
|
| 25 |
+
"MLLMEncoder"
|
| 26 |
+
],
|
| 27 |
+
"source_embedder": [
|
| 28 |
+
"conditional_embedder",
|
| 29 |
+
"ConditionalEmbedder"
|
| 30 |
+
],
|
| 31 |
+
"ref_embedder": [
|
| 32 |
+
"conditional_embedder",
|
| 33 |
+
"ConditionalEmbedder"
|
| 34 |
+
]
|
| 35 |
+
}
|
models/kiwi-edit/pipeline_kiwi_edit.py
ADDED
|
@@ -0,0 +1,510 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
import numpy as np
|
| 4 |
+
from typing import Optional, List, Union, Callable, Tuple
|
| 5 |
+
from PIL import Image, ImageOps
|
| 6 |
+
from einops import rearrange
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
from diffusers import DiffusionPipeline
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def sinusoidal_embedding_1d(dim, position):
    """Return a 1D sinusoidal (cos | sin) embedding for the given positions.

    Frequencies follow the classic 10000^(-k / (dim/2)) schedule; the math is
    carried out in float64 for precision and cast back to the positions' dtype.

    Args:
        dim: embedding width (must be even; first half cos, second half sin).
        position: 1D tensor of positions (e.g. timesteps).

    Returns:
        Tensor of shape [len(position), dim] in position's dtype.
    """
    half = dim // 2
    exponents = torch.arange(half, dtype=torch.float64, device=position.device) / half
    inv_freq = torch.pow(10000, -exponents)
    angles = torch.outer(position.type(torch.float64), inv_freq)
    embedding = torch.cat((angles.cos(), angles.sin()), dim=1)
    return embedding.to(position.dtype)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _build_rope_3d(rope_module, f, h, w, device):
    """
    Build 3D RoPE (cos, sin) for a given (f, h, w) grid using the
    WanRotaryPosEmbed module's precomputed buffers.

    The precomputed tables are split column-wise into temporal/height/width
    sub-bands, each sub-band is broadcast over the full (f, h, w) grid, and
    the three bands are concatenated back along the feature axis.

    Returns:
        (freqs_cos, freqs_sin) each of shape [1, f*h*w, 1, head_dim]
    """
    band_sizes = [rope_module.t_dim, rope_module.h_dim, rope_module.w_dim]

    def expand_over_grid(parts):
        # Each axis contributes its own sub-band, replicated over the other two axes.
        t_band = parts[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1)
        h_band = parts[1][:h].view(1, h, 1, -1).expand(f, h, w, -1)
        w_band = parts[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
        grid = torch.cat((t_band, h_band, w_band), dim=-1)
        return grid.reshape(1, f * h * w, 1, -1).to(device)

    freqs_cos = expand_over_grid(rope_module.freqs_cos.split(band_sizes, dim=1))
    freqs_sin = expand_over_grid(rope_module.freqs_sin.split(band_sizes, dim=1))
    return freqs_cos, freqs_sin
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class KiwiEditPipeline(DiffusionPipeline):
|
| 52 |
+
"""
|
| 53 |
+
Pipeline for reference-guided video and image editing using KiwiEdit.
|
| 54 |
+
|
| 55 |
+
This pipeline uses a Qwen2.5-VL multimodal LLM encoder for understanding
|
| 56 |
+
editing instructions with source visual context, a WanTransformer3DModel
|
| 57 |
+
for diffusion, and AutoencoderKLWan for VAE encoding/decoding.
|
| 58 |
+
|
| 59 |
+
Args:
|
| 60 |
+
transformer: WanTransformer3DModel - DiT backbone for denoising.
|
| 61 |
+
vae: AutoencoderKLWan - 3D causal VAE.
|
| 62 |
+
scheduler: FlowMatchEulerDiscreteScheduler or compatible scheduler.
|
| 63 |
+
mllm_encoder: MLLMEncoder - Qwen2.5-VL MLLM with learnable queries.
|
| 64 |
+
processor: AutoProcessor - Qwen2.5-VL processor/tokenizer bundle.
|
| 65 |
+
source_embedder: ConditionalEmbedder - VAE source conditioning.
|
| 66 |
+
ref_embedder: ConditionalEmbedder - VAE reference conditioning.
|
| 67 |
+
"""
|
| 68 |
+
|
| 69 |
+
# NOTE: model_cpu_offload_seq is NOT used -- manual offload in __call__
|
| 70 |
+
# handles the out-of-order VAE calls and the interleaved denoising loop.
|
| 71 |
+
model_cpu_offload_seq = "mllm_encoder->source_embedder->ref_embedder->transformer->vae"
|
| 72 |
+
|
| 73 |
+
    @property
    def _execution_device(self):
        """Override of DiffusionPipeline._execution_device: always report CUDA.

        The manual offload scheme in __call__ moves components on/off the GPU
        itself, so the base class must not infer the device from module
        placement (most components sit on CPU at any given moment).
        NOTE(review): assumes a CUDA device exists; on a CPU-only machine this
        fails at the first tensor allocation — confirm intended.
        """
        return torch.device("cuda")
|
| 77 |
+
|
| 78 |
+
    def _offload_to(self, components, device):
        """Move named components to device. components: list of attr names.

        Args:
            components: attribute names on self (e.g. ["vae", "transformer"]).
            device: target device; any non-CPU target also casts the module
                to bfloat16 (GPU residency is always bf16 to halve VRAM).

        Moving to CPU keeps the component's current dtype and then runs
        gc.collect() + torch.cuda.empty_cache() so freed VRAM is actually
        returned to the allocator before the next large load.
        """
        import gc
        for name in components:
            comp = getattr(self, name, None)
            # Missing components are silently skipped (processor may be None).
            if comp is not None:
                if str(device) != "cpu":
                    comp.to(device=device, dtype=torch.bfloat16)
                else:
                    comp.to(device)
        if str(device) == "cpu":
            gc.collect()
            torch.cuda.empty_cache()
|
| 91 |
+
|
| 92 |
+
    def __init__(
        self,
        transformer,
        vae,
        scheduler,
        mllm_encoder,
        source_embedder,
        ref_embedder,
        processor=None,
    ):
        """Register all pipeline components with diffusers.

        Args:
            transformer: WanTransformer3DModel denoising backbone.
            vae: AutoencoderKLWan 3D causal VAE.
            scheduler: scheduler module (registered for serialization; the
                pipeline computes its own sigma schedule in _setup_scheduler).
            mllm_encoder: Qwen2.5-VL based MLLMEncoder.
            source_embedder: ConditionalEmbedder for source-latent conditioning.
            ref_embedder: ConditionalEmbedder for reference-latent conditioning.
            processor: optional AutoProcessor; when given it is attached to
                mllm_encoder after registration.
        """
        super().__init__()
        if isinstance(processor, (list, tuple)):
            # Diffusers may pass the raw model_index spec; let MLLMEncoder resolve it later.
            processor = None
        self.register_modules(
            transformer=transformer,
            vae=vae,
            scheduler=scheduler,
            mllm_encoder=mllm_encoder,
            processor=processor,
            source_embedder=source_embedder,
            ref_embedder=ref_embedder,
        )
        if processor is not None:
            self.mllm_encoder.processor = processor
|
| 117 |
+
|
| 118 |
+
# ------------------------------------------------------------------ #
|
| 119 |
+
# Helper utilities #
|
| 120 |
+
# ------------------------------------------------------------------ #
|
| 121 |
+
|
| 122 |
+
@staticmethod
|
| 123 |
+
def _check_resize(height, width, num_frames, h_div=16, w_div=16, t_div=4, t_rem=1):
|
| 124 |
+
"""Round height/width/num_frames to valid values."""
|
| 125 |
+
if height % h_div != 0:
|
| 126 |
+
height = (height + h_div - 1) // h_div * h_div
|
| 127 |
+
if width % w_div != 0:
|
| 128 |
+
width = (width + w_div - 1) // w_div * w_div
|
| 129 |
+
if num_frames % t_div != t_rem:
|
| 130 |
+
num_frames = (num_frames + t_div - 1) // t_div * t_div + t_rem
|
| 131 |
+
return height, width, num_frames
|
| 132 |
+
|
| 133 |
+
@staticmethod
|
| 134 |
+
def _preprocess_image(image: Image.Image, dtype, device):
|
| 135 |
+
"""Convert PIL Image to tensor in [-1, 1]."""
|
| 136 |
+
arr = np.array(image, dtype=np.float32)
|
| 137 |
+
tensor = torch.from_numpy(arr).to(dtype=dtype, device=device)
|
| 138 |
+
tensor = tensor / 127.5 - 1.0 # [0, 255] -> [-1, 1]
|
| 139 |
+
tensor = tensor.permute(2, 0, 1) # H W C -> C H W
|
| 140 |
+
return tensor
|
| 141 |
+
|
| 142 |
+
def _preprocess_video(self, frames: List[Image.Image], dtype, device):
|
| 143 |
+
"""Convert list of PIL Images to tensor [1, C, T, H, W] in [-1, 1]."""
|
| 144 |
+
tensors = [self._preprocess_image(f, dtype, device) for f in frames]
|
| 145 |
+
video = torch.stack(tensors, dim=1) # C T H W
|
| 146 |
+
return video.unsqueeze(0) # 1 C T H W
|
| 147 |
+
|
| 148 |
+
@staticmethod
|
| 149 |
+
def _vae_output_to_video(vae_output):
|
| 150 |
+
"""Convert VAE output tensor to list of PIL Images."""
|
| 151 |
+
# vae_output shape: [B, C, T, H, W] or [T, H, W, C]
|
| 152 |
+
if vae_output.dim() == 5:
|
| 153 |
+
vae_output = vae_output.squeeze(0).permute(1, 2, 3, 0) # T H W C
|
| 154 |
+
frames = []
|
| 155 |
+
for t in range(vae_output.shape[0]):
|
| 156 |
+
frame = ((vae_output[t] + 1.0) * 127.5).clamp(0, 255)
|
| 157 |
+
frame = frame.to(device="cpu", dtype=torch.uint8).numpy()
|
| 158 |
+
frames.append(Image.fromarray(frame))
|
| 159 |
+
return frames
|
| 160 |
+
|
| 161 |
+
# ------------------------------------------------------------------ #
|
| 162 |
+
# Custom Flow Match Scheduler #
|
| 163 |
+
# ------------------------------------------------------------------ #
|
| 164 |
+
|
| 165 |
+
def _setup_scheduler(self, num_inference_steps, denoising_strength=1.0, shift=5.0):
|
| 166 |
+
"""
|
| 167 |
+
Set up flow-match sigmas and timesteps matching the original diffsynth
|
| 168 |
+
FlowMatchScheduler with extra_one_step=True and shift.
|
| 169 |
+
"""
|
| 170 |
+
sigma_min = 0.003 / 1.002
|
| 171 |
+
sigma_max = 1.0
|
| 172 |
+
sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
|
| 173 |
+
# extra_one_step: generate N+1 points, drop last
|
| 174 |
+
sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
|
| 175 |
+
# Apply shift
|
| 176 |
+
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
|
| 177 |
+
timesteps = sigmas * 1000 # num_train_timesteps = 1000
|
| 178 |
+
return sigmas, timesteps
|
| 179 |
+
|
| 180 |
+
def _scheduler_step(self, model_output, sigmas, step_index, sample):
|
| 181 |
+
"""Euler step for flow matching."""
|
| 182 |
+
sigma = sigmas[step_index]
|
| 183 |
+
if step_index + 1 >= len(sigmas):
|
| 184 |
+
sigma_next = 0.0
|
| 185 |
+
else:
|
| 186 |
+
sigma_next = sigmas[step_index + 1]
|
| 187 |
+
return sample + model_output * (sigma_next - sigma)
|
| 188 |
+
|
| 189 |
+
def _scheduler_add_noise(self, original_samples, noise, sigmas, step_index):
|
| 190 |
+
"""Add noise at given timestep for img2img / video2video."""
|
| 191 |
+
sigma = sigmas[step_index]
|
| 192 |
+
return (1 - sigma) * original_samples + sigma * noise
|
| 193 |
+
|
| 194 |
+
def _scheduler_get_sigma(self, timestep, sigmas, timesteps):
|
| 195 |
+
"""Get sigma for a given timestep."""
|
| 196 |
+
timestep_id = torch.argmin((timesteps - timestep).abs())
|
| 197 |
+
return sigmas[timestep_id]
|
| 198 |
+
|
| 199 |
+
# ------------------------------------------------------------------ #
|
| 200 |
+
# Transformer forward helpers #
|
| 201 |
+
# ------------------------------------------------------------------ #
|
| 202 |
+
|
| 203 |
+
    def _model_forward(
        self,
        latents,
        timestep,
        context,
        vae_source_input=None,
        vae_ref_image=None,
        sigmas=None,
        timesteps_schedule=None,
    ):
        """
        Custom DiT forward pass that handles source/ref conditioning.
        Mirrors model_fn_wan_video from the original diffsynth pipeline.

        Args:
            latents: noisy latents [B, C, F, H, W].
            timestep: current timestep tensor, shape [1].
            context: MLLM embedding already projected to the DiT width.
            vae_source_input: optional source latents, embedded and added to
                the patchified tokens scaled by the current sigma.
            vae_ref_image: optional list of reference-image latents appended
                to the token sequence (and stripped from the output).
            sigmas / timesteps_schedule: full schedules used to look up the
                sigma that matches `timestep`.

        Returns:
            Prediction tensor with the same shape as `latents`.
        """
        device = latents.device
        dtype = latents.dtype
        t = self.transformer

        # --- Timestep embedding ---
        timestep_emb = sinusoidal_embedding_1d(
            t.config.freq_dim, timestep
        ).to(dtype)
        time_emb = t.condition_embedder.time_embedder(timestep_emb)
        # diffusers time_proj = Linear only (SiLU is applied separately)
        # t_mod carries the 6-way AdaLN modulation consumed inside each block.
        t_mod = t.condition_embedder.time_proj(F.silu(time_emb)).unflatten(
            1, (6, t.config.num_attention_heads * t.config.attention_head_dim)
        )

        # --- Text/context embedding ---
        # NOTE: Do NOT apply text_embedder here. The MLLM encoder's connector
        # already projects to dit_dim. text_embedder is for raw text encoder
        # output (text_dim -> dim), which doesn't apply to MLLM output.

        # --- Patchify latents ---
        x = latents
        if vae_source_input is not None:
            vae_source_cond = self.source_embedder(vae_source_input)
            x = t.patch_embedding(x)
            # Get sigma for this timestep; source conditioning is strongest
            # at high noise and fades as sigma -> 0.
            sigma = self._scheduler_get_sigma(timestep, sigmas, timesteps_schedule)
            x = x + vae_source_cond * sigma
        else:
            x = t.patch_embedding(x)

        f, h, w = x.shape[2:]
        x = rearrange(x, "b c f h w -> b (f h w) c").contiguous()

        # --- 3D RoPE frequencies (real-valued cos/sin format) ---
        rotary_emb = _build_rope_3d(t.rope, f, h, w, device)

        # --- Reference image conditioning ---
        vae_ref_input_length = 0
        if vae_ref_image is not None:
            if len(vae_ref_image) > 1:
                vae_ref = torch.cat(vae_ref_image, dim=2)  # concat along temporal
            else:
                vae_ref = vae_ref_image[0]

            vae_ref = self.ref_embedder(vae_ref)
            ref_f, ref_h, ref_w = vae_ref.shape[2:]
            vae_ref = rearrange(vae_ref, "b c f h w -> b (f h w) c").contiguous()

            # Recompute RoPE for extended sequence (main + ref tokens).
            # NOTE(review): this assumes ref tokens share the main grid's
            # (h, w) extent and extend only the temporal axis — confirm that
            # ref_h == h and ref_w == w always holds for the ref_embedder.
            total_f = f + ref_f
            rotary_emb = _build_rope_3d(t.rope, total_f, h, w, device)

            vae_ref_input_length = vae_ref.shape[1]

            # ref_pad_first controls whether ref tokens sit before or after
            # the main tokens; the output slice below must mirror this choice.
            if self.ref_embedder.config.ref_pad_first:
                x = torch.cat([vae_ref, x], dim=1)
            else:
                x = torch.cat([x, vae_ref], dim=1)

        # --- Transformer blocks ---
        for block in t.blocks:
            x = block(x, context, t_mod, rotary_emb)

        # --- Output head ---
        # Match diffusers' FP32 norm + modulation + projection
        table = t.scale_shift_table
        # NOTE(review): chunk order (shift, scale) assumed to match the
        # checkpoint's scale_shift_table layout — confirm against diffusers'
        # WanTransformer3DModel.
        shift, scale = (
            table.to(device=device) + time_emb.unsqueeze(1)
        ).chunk(2, dim=1)
        shift = shift.to(device=x.device)
        scale = scale.to(device=x.device)
        x = (t.norm_out(x.float()) * (1 + scale) + shift).type_as(x)
        x = t.proj_out(x)

        # --- Remove ref tokens from output (same side they were added on) ---
        if vae_ref_image is not None and vae_ref_input_length > 0:
            if self.ref_embedder.config.ref_pad_first:
                x = x[:, vae_ref_input_length:, :]
            else:
                x = x[:, :-vae_ref_input_length, :]

        # --- Unpatchify: tokens back to [B, C, F*pf, H*ph, W*pw] ---
        patch_size = t.config.patch_size
        x = rearrange(
            x,
            "b (f h w) (x y z c) -> b c (f x) (h y) (w z)",
            f=f, h=h, w=w,
            x=patch_size[0], y=patch_size[1], z=patch_size[2],
        )
        return x
|
| 307 |
+
|
| 308 |
+
# ------------------------------------------------------------------ #
|
| 309 |
+
# Main __call__ #
|
| 310 |
+
# ------------------------------------------------------------------ #
|
| 311 |
+
|
| 312 |
+
    @torch.no_grad()
    def __call__(
        self,
        prompt: str,
        source_video: Optional[List[Image.Image]] = None,
        source_input: Optional[List[Image.Image]] = None,
        ref_image: Optional[List[Image.Image]] = None,
        negative_prompt: Optional[str] = "",
        input_video: Optional[List[Image.Image]] = None,
        height: int = 480,
        width: int = 832,
        num_frames: int = 81,
        num_inference_steps: int = 50,
        guidance_scale: float = 1.0,
        sigma_shift: float = 5.0,
        denoising_strength: float = 1.0,
        seed: Optional[int] = None,
        tiled: bool = True,
        tile_size: Tuple[int, int] = (30, 52),
        tile_stride: Tuple[int, int] = (15, 26),
        output_type: str = "pil",
        progress_bar: Callable = tqdm,
    ) -> List[Image.Image]:
        """
        Run KiwiEdit inference.

        Args:
            prompt: Editing instruction text.
            source_video: Source video/image frames for MLLM context (also used as
                source_input if source_input is not provided).
            source_input: Source frames for VAE conditioning. If None but source_video
                is provided, source_video is used.
            ref_image: Optional reference image(s) for guided editing.
            negative_prompt: Negative prompt for CFG.
            input_video: Optional input video for video-to-video (adds noise then denoises).
            height: Output height in pixels.
            width: Output width in pixels.
            num_frames: Number of output frames (1 for image editing).
            num_inference_steps: Number of denoising steps.
            guidance_scale: Classifier-free guidance scale.
            sigma_shift: Flow matching shift parameter.
            denoising_strength: How much noise to add (1.0 = full noise).
            seed: Random seed for reproducibility.
            tiled: Whether to use tiled VAE encoding/decoding.
            tile_size: VAE tile size.
            tile_stride: VAE tile stride.
            output_type: "pil" for PIL Images, "latent" for raw latents.
            progress_bar: Progress bar callable (e.g., tqdm).

        Returns:
            List of PIL Images (video frames).

        Note:
            tiled / tile_size / tile_stride are accepted for API compatibility
            but are not forwarded to the VAE calls in this implementation.
            negative_prompt and guidance_scale are currently inert: the CFG
            branch below is commented out and context_nega stays None.
        """
        device = self._execution_device
        dtype = torch.bfloat16
        # --- 1. Shape check ---
        # VAE spatial factor is 16, transformer patch spatial is 2,
        # so pixel dims must be multiples of 32.
        height, width, num_frames = self._check_resize(
            height, width, num_frames, h_div=32, w_div=32
        )

        # --- 2. Determine VAE parameters ---
        z_dim = self.vae.config.z_dim
        # Compute upsampling factor from VAE config
        dim_mult = self.vae.config.get("dim_mult", [1, 2, 4, 4])
        # NOTE: "temperal_downsample" (sic) is the upstream Wan config key —
        # do not "fix" the spelling or the lookup falls back to the default.
        temporal_downsample = self.vae.config.get("temperal_downsample", [False, True, True])
        # Wan VideoVAE spatial factor is 2^(len(dim_mult)) due to extra
        # downsampling in the encoder beyond the level transitions.
        spatial_factor = 2 ** len(dim_mult)  # 16 for 4 levels
        temporal_factor = 2 ** sum(temporal_downsample)  # 4 for [F, T, T]

        # --- 3. MLLM encoding (move mllm_encoder to CUDA, ~6.5 GB) ---
        self._offload_to(["mllm_encoder"], device)
        context = None
        src_video_for_mllm = source_video
        if src_video_for_mllm is not None:
            self.mllm_encoder._ensure_qwen_loaded()
            if ref_image is not None:
                # Ref mode always uses the video path (even for a single frame)
                context = self.mllm_encoder(
                    prompt, src_video=src_video_for_mllm, ref_image=ref_image
                )
            elif len(src_video_for_mllm) == 1:
                context = self.mllm_encoder(
                    prompt, src_image=src_video_for_mllm
                )
            else:
                context = self.mllm_encoder(
                    prompt, src_video=src_video_for_mllm
                )
        # For negative prompt: use zero context
        # NOTE(review): context_nega is never populated or read while the CFG
        # branch below remains commented out.
        context_nega = None
        # Move context to CPU while we do VAE encoding (will move back for denoising)
        if context is not None:
            context = context.cpu()
        self._offload_to(["mllm_encoder"], "cpu")

        # --- 4. Setup scheduler ---
        sigmas, timesteps = self._setup_scheduler(
            num_inference_steps, denoising_strength, sigma_shift
        )
        sigmas = sigmas.to(device)
        timesteps = timesteps.to(device)

        # --- 5. Initialize noise ---
        latent_length = (num_frames - 1) // temporal_factor + 1
        latent_h = height // spatial_factor
        latent_w = width // spatial_factor
        shape = (1, z_dim, latent_length, latent_h, latent_w)

        # Noise is drawn on CPU so a given seed reproduces across GPUs.
        generator = None if seed is None else torch.Generator("cpu").manual_seed(seed)
        noise = torch.randn(shape, generator=generator, device="cpu", dtype=torch.float32)
        noise = noise.to(dtype=dtype, device=device)

        # --- 6. Encode source input (move VAE to CUDA, ~0.4 GB) ---
        self._offload_to(["vae"], device)
        vae_source_input = None
        # Fall back to source_video if source_input not provided
        src_for_vae = source_input if source_input is not None else source_video
        if src_for_vae is not None:
            src_frames = [src_for_vae[i] for i in range(min(num_frames, len(src_for_vae)))]
            # Resize source frames to match the (possibly adjusted) target dimensions
            src_frames = [f.resize((width, height), Image.LANCZOS) for f in src_frames]
            src_tensor = self._preprocess_video(src_frames, dtype=torch.float32, device=device)
            vae_source_input = self.vae.encode(src_tensor).latent_dist.sample()
            vae_source_input = vae_source_input.to(dtype=dtype)

        # --- 7. Encode reference images ---
        vae_ref_image = None
        if ref_image is not None:
            vae_ref_image = []
            for item in ref_image:
                target_size = (width, height)
                # Pad (not crop) to the target aspect ratio on a white canvas.
                item = ImageOps.pad(item, target_size, color="white", centering=(0.5, 0.5))
                ref_tensor = self._preprocess_video([item], dtype=torch.float32, device=device)
                ref_latent = self.vae.encode(ref_tensor).latent_dist.sample()
                vae_ref_image.append(ref_latent.to(dtype=dtype))

        # --- 8. Handle input_video (video-to-video) ---
        if input_video is not None:
            input_tensor = self._preprocess_video(input_video, dtype=torch.float32, device=device)
            input_latents = self.vae.encode(input_tensor).latent_dist.sample()
            input_latents = input_latents.to(dtype=dtype)
            # Start from the input latents noised to the first sigma.
            latents = self._scheduler_add_noise(input_latents, noise, sigmas, 0)
        else:
            latents = noise

        # --- Offload VAE, load denoising components (~10.5 GB) ---
        self._offload_to(["vae"], "cpu")
        self._offload_to(["source_embedder", "ref_embedder", "transformer"], device)
        # Move context back to CUDA for denoising
        if context is not None:
            context = context.to(device=device, dtype=dtype)

        # --- 9. Denoising loop ---
        for step_idx, timestep_val in enumerate(progress_bar(timesteps)):
            timestep = timestep_val.unsqueeze(0).to(dtype=dtype, device=device)

            # Positive prediction
            noise_pred = self._model_forward(
                latents=latents,
                timestep=timestep,
                context=context,
                vae_source_input=vae_source_input,
                vae_ref_image=vae_ref_image,
                sigmas=sigmas,
                timesteps_schedule=timesteps,
            )

            # CFG
            # if guidance_scale != 1.0:
            #     noise_pred_nega = self._model_forward(
            #         latents=latents,
            #         timestep=timestep,
            #         context=context_nega,
            #         vae_source_input=vae_source_input,
            #         vae_ref_image=vae_ref_image,
            #         sigmas=sigmas,
            #         timesteps_schedule=timesteps,
            #     )
            #     noise_pred = noise_pred_nega + guidance_scale * (
            #         noise_pred_posi - noise_pred_nega
            #     )
            # else:
            #     noise_pred = noise_pred_posi

            # Scheduler step
            latents = self._scheduler_step(noise_pred, sigmas, step_idx, latents)

        # --- 10. Decode (offload denoising components, load VAE) ---
        self._offload_to(["source_embedder", "ref_embedder", "transformer"], "cpu")
        self._offload_to(["vae"], device)

        if output_type == "latent":
            return latents

        video = self.vae.decode(latents).sample
        video = self._vae_output_to_video(video)
        return video
|
models/kiwi-edit/wan_video_vae.py
ADDED
|
@@ -0,0 +1,1486 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
|
| 4 |
+
from einops import rearrange, repeat
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from diffusers import ModelMixin, ConfigMixin
|
| 11 |
+
from diffusers.configuration_utils import register_to_config
|
| 12 |
+
|
| 13 |
+
CACHE_T = 2
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def check_is_instance(model, module_class):
    """Return True if *model* — or a wrapped ``model.module`` (e.g. DDP/compiled
    wrappers) — is an instance of *module_class*."""
    wrapped = getattr(model, "module", None)
    return isinstance(model, module_class) or isinstance(wrapped, module_class)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def block_causal_mask(x, block_size):
    """Build a boolean block-causal attention mask.

    For x of shape (b, n, s, _), returns a (b, n, s, s) bool tensor where
    position i may attend to position j iff j lies in the same or an earlier
    block of ``block_size`` consecutive positions.
    """
    b, n, s, _ = x.size()
    assert s % block_size == 0

    # block index of each position; row i sees column j iff blk[j] <= blk[i]
    blk = torch.arange(s, device=x.device) // block_size
    mask = blk.unsqueeze(-1) >= blk.unsqueeze(0)
    return mask[None, None].expand(b, n, s, s).contiguous()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class CausalConv3d(nn.Conv3d):
    """
    Causal 3D convolution.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Re-express the symmetric conv padding as an explicit F.pad spec:
        # (W_left, W_right, H_top, H_bottom, T_front, T_back).  All temporal
        # padding goes in front (2 * pad_t, 0) so the kernel never sees future
        # frames; the built-in Conv3d padding is then disabled.
        self._padding = (self.padding[2], self.padding[2], self.padding[1],
                         self.padding[1], 2 * self.padding[0], 0)
        self.padding = (0, 0, 0)

    def forward(self, x, cache_x=None):
        # `cache_x` holds trailing frames from the previous chunk; prepending
        # them stands in for (part of) the front temporal padding so chunked
        # streaming matches a single full-sequence pass.
        padding = list(self._padding)
        if cache_x is not None and self._padding[4] > 0:
            cache_x = cache_x.to(x.device)
            x = torch.cat([cache_x, x], dim=2)
            padding[4] -= cache_x.shape[2]
        x = F.pad(x, padding)

        return super().forward(x)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class RMS_norm(nn.Module):
    """RMS normalization with a learnable per-channel gain and optional bias.

    Normalizes along the channel axis (dim 1 when ``channel_first`` else the
    last dim) and rescales by sqrt(dim) times the learned ``gamma``.
    """

    def __init__(self, dim, channel_first=True, images=True, bias=False):
        super().__init__()
        # For channel-first inputs the gain must broadcast over the spatial
        # axes (and time, for video), hence trailing singleton dimensions.
        if channel_first:
            trailing = (1, 1) if images else (1, 1, 1)
            shape = (dim, *trailing)
        else:
            shape = (dim,)

        self.channel_first = channel_first
        self.scale = dim**0.5
        self.gamma = nn.Parameter(torch.ones(shape))
        self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.

    def forward(self, x):
        norm_dim = 1 if self.channel_first else -1
        normalized = F.normalize(x, dim=norm_dim)
        return normalized * self.scale * self.gamma + self.bias
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class Upsample(nn.Upsample):

    def forward(self, x):
        """
        Fix bfloat16 support for nearest neighbor interpolation.
        """
        # nearest-exact interpolation is not implemented for bfloat16 on some
        # backends, so compute in fp32 and cast back to the input dtype.
        return super().forward(x.float()).type_as(x)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class Resample(nn.Module):
    """2x spatial (and optionally temporal) resampling with streaming support.

    ``mode`` selects the layer stack:
      - 'upsample2d' / 'downsample2d': 2x spatial resampling only; upsampling
        also halves the channel count (dim -> dim // 2).
      - 'upsample3d' / 'downsample3d': additionally 2x temporal resampling via
        a causal temporal conv (``time_conv``).
      - 'none': identity.

    forward() takes (feat_cache, feat_idx) for chunked streaming: feat_cache
    holds one slot per cached conv and feat_idx is a single-element list used
    as a mutable cursor into it.
    """

    def __init__(self, dim, mode):
        assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
                        'downsample3d')
        super().__init__()
        self.dim = dim
        self.mode = mode

        # layers
        if mode == 'upsample2d':
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                nn.Conv2d(dim, dim // 2, 3, padding=1))
        elif mode == 'upsample3d':
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                nn.Conv2d(dim, dim // 2, 3, padding=1))
            # doubles the channels; forward() reinterprets the two halves as
            # two interleaved time steps (2x temporal upsampling)
            self.time_conv = CausalConv3d(dim,
                                          dim * 2, (3, 1, 1),
                                          padding=(1, 0, 0))

        elif mode == 'downsample2d':
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)),
                nn.Conv2d(dim, dim, 3, stride=(2, 2)))
        elif mode == 'downsample3d':
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)),
                nn.Conv2d(dim, dim, 3, stride=(2, 2)))
            # temporal stride-2 conv (2x temporal downsampling)
            self.time_conv = CausalConv3d(dim,
                                          dim, (3, 1, 1),
                                          stride=(2, 1, 1),
                                          padding=(0, 0, 0))

        else:
            self.resample = nn.Identity()

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # NOTE: feat_idx is deliberately a shared mutable default acting as an
        # in/out cursor; callers pass a fresh [0] per full pass.
        b, c, t, h, w = x.size()
        if self.mode == 'upsample3d':
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    # first chunk: store the 'Rep' sentinel so the next chunk
                    # knows there are no real cached frames yet
                    feat_cache[idx] = 'Rep'
                    feat_idx[0] += 1
                else:

                    cache_x = x[:, :, -CACHE_T:, :, :].clone()
                    if cache_x.shape[2] < 2 and feat_cache[
                            idx] is not None and feat_cache[idx] != 'Rep':
                        # cache last frame of last two chunk
                        cache_x = torch.cat([
                            feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                                cache_x.device), cache_x
                        ],
                                            dim=2)
                    if cache_x.shape[2] < 2 and feat_cache[
                            idx] is not None and feat_cache[idx] == 'Rep':
                        # after the sentinel there is no real history; pad the
                        # cache with zeros instead
                        cache_x = torch.cat([
                            torch.zeros_like(cache_x).to(cache_x.device),
                            cache_x
                        ],
                                            dim=2)
                    if feat_cache[idx] == 'Rep':
                        x = self.time_conv(x)
                    else:
                        x = self.time_conv(x, feat_cache[idx])
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1

                    # interleave the two channel halves as consecutive time
                    # steps: (b, 2c, t, h, w) -> (b, c, 2t, h, w)
                    x = x.reshape(b, 2, c, t, h, w)
                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
                                    3)
                    x = x.reshape(b, c, t * 2, h, w)
        t = x.shape[2]
        # spatial resampling is 2D: fold time into the batch axis
        x = rearrange(x, 'b c t h w -> (b t) c h w')
        x = self.resample(x)
        x = rearrange(x, '(b t) c h w -> b c t h w', t=t)

        if self.mode == 'downsample3d':
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    # first chunk: remember it; the stride-2 conv starts from
                    # the next chunk with one frame of history prepended
                    feat_cache[idx] = x.clone()
                    feat_idx[0] += 1
                else:
                    cache_x = x[:, :, -1:, :, :].clone()
                    x = self.time_conv(
                        torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1
        return x

    def init_weight(self, conv):
        # Initialization helper: make a temporal conv start as an identity
        # mapping through its middle tap.  NOTE(review): not called in this
        # chunk — presumably invoked externally when setting up time_conv.
        conv_weight = conv.weight
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        one_matrix = torch.eye(c1, c2)
        init_matrix = one_matrix
        nn.init.zeros_(conv_weight)
        conv_weight.data[:, :, 1, 0, 0] = init_matrix
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)

    def init_weight2(self, conv):
        # Initialization helper for the channel-doubling upsample3d time_conv:
        # both output halves start as identity copies of the input channels.
        conv_weight = conv.weight.data
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        init_matrix = torch.eye(c1 // 2, c2)
        conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
        conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def patchify(x, patch_size):
    """Fold non-overlapping ``patch_size`` x ``patch_size`` spatial patches
    into the channel axis.

    Equivalent to einops ``'b c (h q) (w r) -> b (c r q) h w'`` for 4-D input
    and ``'b c f (h q) (w r) -> b (c r q) f h w'`` for 5-D input, but uses
    plain tensor ops so the function does not require einops at runtime.
    ``patch_size == 1`` is a no-op.

    Raises:
        ValueError: if x is neither 4-D nor 5-D.
    """
    if patch_size == 1:
        return x
    q = r = patch_size
    if x.dim() == 4:
        b, c, H, W = x.shape
        x = x.view(b, c, H // q, q, W // r, r)
        # axis order (c, r, q) reproduces the einops "(c r q)" channel packing
        x = x.permute(0, 1, 5, 3, 2, 4).contiguous()
        return x.view(b, c * r * q, H // q, W // r)
    if x.dim() == 5:
        b, c, f, H, W = x.shape
        x = x.view(b, c, f, H // q, q, W // r, r)
        x = x.permute(0, 1, 6, 4, 2, 3, 5).contiguous()
        return x.view(b, c * r * q, f, H // q, W // r)
    raise ValueError(f"Invalid input shape: {x.shape}")
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def unpatchify(x, patch_size):
    """Inverse of :func:`patchify`: unfold packed channel patches back onto
    the spatial axes.

    Equivalent to einops ``'b (c r q) h w -> b c (h q) (w r)'`` for 4-D input
    and ``'b (c r q) f h w -> b c f (h q) (w r)'`` for 5-D input, using plain
    tensor ops.  ``patch_size == 1`` is a no-op.

    Raises:
        ValueError: if x is neither 4-D nor 5-D.  (The previous version
        silently returned invalid-rank input unchanged, unlike patchify.)
    """
    if patch_size == 1:
        return x
    q = r = patch_size
    if x.dim() == 4:
        b, C, h, w = x.shape
        c = C // (q * r)
        # channels are packed in (c, r, q) order — see patchify
        x = x.view(b, c, r, q, h, w)
        x = x.permute(0, 1, 4, 3, 5, 2).contiguous()
        return x.view(b, c, h * q, w * r)
    if x.dim() == 5:
        b, C, f, h, w = x.shape
        c = C // (q * r)
        x = x.view(b, c, r, q, f, h, w)
        x = x.permute(0, 1, 4, 5, 3, 6, 2).contiguous()
        return x.view(b, c, f, h * q, w * r)
    raise ValueError(f"Invalid input shape: {x.shape}")
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
class Resample38(Resample):
    """Resample variant for the 3.8 VAE: spatial resampling keeps the channel
    count (dim -> dim) instead of halving it on upsample.  It reuses
    Resample.forward() for the cached temporal logic but rebuilds the layers,
    so Resample.__init__ is intentionally skipped."""

    def __init__(self, dim, mode):
        assert mode in (
            "none",
            "upsample2d",
            "upsample3d",
            "downsample2d",
            "downsample3d",
        )
        # skip Resample.__init__ (it would build the dim//2 layers) and call
        # nn.Module.__init__ directly
        super(Resample, self).__init__()
        self.dim = dim
        self.mode = mode

        # layers
        if mode == "upsample2d":
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
                nn.Conv2d(dim, dim, 3, padding=1),
            )
        elif mode == "upsample3d":
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
                nn.Conv2d(dim, dim, 3, padding=1),
            )
            # channel-doubling temporal conv; Resample.forward() reinterprets
            # the two halves as two interleaved time steps
            self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
        elif mode == "downsample2d":
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
            )
        elif mode == "downsample3d":
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
            )
            self.time_conv = CausalConv3d(
                dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)
            )
        else:
            self.resample = nn.Identity()
|
| 271 |
+
|
| 272 |
+
class ResidualBlock(nn.Module):
    """Pre-norm causal residual block: two RMS-normed CausalConv3d layers with
    SiLU and dropout, plus a 1x1x1-conv shortcut when channels change."""

    def __init__(self, in_dim, out_dim, dropout=0.0):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim

        # layers
        self.residual = nn.Sequential(
            RMS_norm(in_dim, images=False), nn.SiLU(),
            CausalConv3d(in_dim, out_dim, 3, padding=1),
            RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
            CausalConv3d(out_dim, out_dim, 3, padding=1))
        self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
            if in_dim != out_dim else nn.Identity()

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # NOTE: feat_idx is deliberately a shared mutable default acting as a
        # cursor into feat_cache; callers pass a fresh [0] per full pass.
        h = self.shortcut(x)
        for layer in self.residual:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                # keep the trailing CACHE_T frames of this chunk for the next
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat([
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                            cache_x.device), cache_x
                    ],
                                        dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x + h
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
class AttentionBlock(nn.Module):
    """
    Causal self-attention with a single head.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

        # layers
        self.norm = RMS_norm(dim)
        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
        self.proj = nn.Conv2d(dim, dim, 1)

        # zero out the last layer params
        # (makes the block start as an identity mapping via the residual)
        nn.init.zeros_(self.proj.weight)

    def forward(self, x):
        # Time is folded into the batch axis, so each frame attends only over
        # its own h*w spatial positions.  NOTE(review): the block-causal mask
        # below is disabled, so there is no cross-time attention here at all.
        identity = x
        b, c, t, h, w = x.size()
        x = rearrange(x, 'b c t h w -> (b t) c h w')
        x = self.norm(x)
        # compute query, key, value
        q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3, -1).permute(
            0, 1, 3, 2).contiguous().chunk(3, dim=-1)

        # apply attention
        x = F.scaled_dot_product_attention(
            q,
            k,
            v,
            #attn_mask=block_causal_mask(q, block_size=h * w)
        )
        x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)

        # output
        x = self.proj(x)
        x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
        return x + identity
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
class AvgDown3D(nn.Module):
    """Parameter-free downsampling by average pooling.

    Pools factor_t x factor_s x factor_s neighborhoods by folding them into
    the channel axis and averaging channel groups down to ``out_channels``.
    The temporal axis is zero-padded at the FRONT so its length divides
    ``factor_t``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        factor_t,
        factor_s=1,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.factor_t = factor_t
        self.factor_s = factor_s
        self.factor = self.factor_t * self.factor_s * self.factor_s

        # each output channel averages `group_size` folded input values
        assert in_channels * self.factor % out_channels == 0
        self.group_size = in_channels * self.factor // out_channels

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        ft, fs = self.factor_t, self.factor_s
        # front-pad time so T divides evenly by factor_t
        pad_t = (ft - x.shape[2] % ft) % ft
        if pad_t:
            x = F.pad(x, (0, 0, 0, 0, pad_t, 0))
        b, c, t, h, w = x.shape
        # unfold each pooled factor next to its axis, then move the factors
        # up against the channel axis
        x = x.view(b, c, t // ft, ft, h // fs, fs, w // fs, fs)
        x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
        x = x.view(
            b,
            self.out_channels,
            self.group_size,
            t // ft,
            h // fs,
            w // fs,
        )
        # average each channel group
        return x.mean(dim=2)
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
class DupUp3D(nn.Module):
    """Parameter-free upsampling by value duplication (layout-inverse of
    AvgDown3D).

    Each input value is replicated across a factor_t x factor_s x factor_s
    output neighborhood.  When ``first_chunk`` is True the leading
    ``factor_t - 1`` duplicated frames are dropped, matching the front
    padding applied by AvgDown3D on the first chunk.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        factor_t,
        factor_s=1,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        self.factor_t = factor_t
        self.factor_s = factor_s
        self.factor = self.factor_t * self.factor_s * self.factor_s

        # each input channel fans out into `repeats` duplicated values
        assert out_channels * self.factor % in_channels == 0
        self.repeats = out_channels * self.factor // in_channels

    def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
        ft, fs = self.factor_t, self.factor_s
        b = x.size(0)
        t, h, w = x.size(2), x.size(3), x.size(4)
        # duplicate along channels, then unfold the duplicates into the
        # (time, height, width) expansion factors
        x = x.repeat_interleave(self.repeats, dim=1)
        x = x.view(b, self.out_channels, ft, fs, fs, t, h, w)
        x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
        x = x.view(b, self.out_channels, t * ft, h * fs, w * fs)
        if first_chunk:
            x = x[:, :, ft - 1:, :, :]
        return x
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
class Down_ResidualBlock(nn.Module):
    """Downsampling stage for the 3.8 VAE: a stack of ResidualBlocks (plus an
    optional Resample38 downsample) running in parallel with a parameter-free
    AvgDown3D shortcut; the two paths are summed."""

    def __init__(
        self, in_dim, out_dim, dropout, mult, temperal_downsample=False, down_flag=False
    ):
        super().__init__()

        # Shortcut path with downsample
        self.avg_shortcut = AvgDown3D(
            in_dim,
            out_dim,
            factor_t=2 if temperal_downsample else 1,
            factor_s=2 if down_flag else 1,
        )

        # Main path with residual blocks and downsample
        downsamples = []
        for _ in range(mult):
            downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
            in_dim = out_dim

        # Add the final downsample block
        if down_flag:
            mode = "downsample3d" if temperal_downsample else "downsample2d"
            downsamples.append(Resample38(out_dim, mode=mode))

        self.downsamples = nn.Sequential(*downsamples)

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # NOTE: feat_idx is a shared mutable default acting as a cursor into
        # feat_cache; callers pass a fresh [0] per full pass.
        x_copy = x.clone()
        for module in self.downsamples:
            x = module(x, feat_cache, feat_idx)

        return x + self.avg_shortcut(x_copy)
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
class Up_ResidualBlock(nn.Module):
    """Upsampling stage for the 3.8 VAE: ResidualBlocks (plus an optional
    Resample38 upsample) in parallel with a DupUp3D shortcut when up_flag is
    set; otherwise just the main path."""

    def __init__(
        self, in_dim, out_dim, dropout, mult, temperal_upsample=False, up_flag=False
    ):
        super().__init__()
        # Shortcut path with upsample
        if up_flag:
            self.avg_shortcut = DupUp3D(
                in_dim,
                out_dim,
                factor_t=2 if temperal_upsample else 1,
                factor_s=2 if up_flag else 1,
            )
        else:
            self.avg_shortcut = None

        # Main path with residual blocks and upsample
        upsamples = []
        for _ in range(mult):
            upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
            in_dim = out_dim

        # Add the final upsample block
        if up_flag:
            mode = "upsample3d" if temperal_upsample else "upsample2d"
            upsamples.append(Resample38(out_dim, mode=mode))

        self.upsamples = nn.Sequential(*upsamples)

    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
        # first_chunk lets DupUp3D trim its duplicated leading frames on the
        # first streamed chunk
        x_main = x.clone()
        for module in self.upsamples:
            x_main = module(x_main, feat_cache, feat_idx)
        if self.avg_shortcut is not None:
            x_shortcut = self.avg_shortcut(x, first_chunk)
            return x_main + x_shortcut
        else:
            return x_main
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
class Encoder3d(nn.Module):
    """3D causal encoder: conv stem (RGB -> dim), downsampling stages with
    optional attention, a middle block, and a head projecting to z_dim.

    forward() supports chunked streaming: feat_cache is a list with one slot
    per CausalConv3d (see count_conv3d) and feat_idx a single-element list
    used as a mutable cursor into it.
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[True, True, False],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample

        # dimensions
        dims = [dim * u for u in [1] + dim_mult]
        # `scale` tracks the current spatial resolution relative to the input
        # and is compared against attn_scales to place attention blocks
        scale = 1.0

        # init block
        self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)

        # downsample blocks
        downsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            # residual (+attention) blocks
            for _ in range(num_res_blocks):
                downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
                if scale in attn_scales:
                    downsamples.append(AttentionBlock(out_dim))
                in_dim = out_dim

            # downsample block
            if i != len(dim_mult) - 1:
                mode = 'downsample3d' if temperal_downsample[
                    i] else 'downsample2d'
                downsamples.append(Resample(out_dim, mode=mode))
                scale /= 2.0
        self.downsamples = nn.Sequential(*downsamples)

        # middle blocks
        self.middle = nn.Sequential(ResidualBlock(out_dim, out_dim, dropout),
                                    AttentionBlock(out_dim),
                                    ResidualBlock(out_dim, out_dim, dropout))

        # output blocks
        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
                                  CausalConv3d(out_dim, z_dim, 3, padding=1))

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # conv stem — cache the trailing CACHE_T frames for the next chunk
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat([
                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                        cache_x.device), cache_x
                ],
                                    dim=2)
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        ## downsamples
        for layer in self.downsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## middle
        for layer in self.middle:
            if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## head
        for layer in self.head:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat([
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                            cache_x.device), cache_x
                    ],
                                        dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x
|
| 623 |
+
|
| 624 |
+
|
| 625 |
+
class Encoder3d_38(nn.Module):
    """Encoder for the 3.8 VAE: takes 12-channel (2x2-patchified RGB) input
    and uses Down_ResidualBlock stages (residual path + AvgDown3D shortcut)
    instead of plain ResidualBlock/Resample stacks.

    forward() supports chunked streaming via (feat_cache, feat_idx) — one
    cache slot per CausalConv3d, feat_idx as a mutable cursor.
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample

        # dimensions
        dims = [dim * u for u in [1] + dim_mult]
        # NOTE(review): `scale` is decremented but attn_scales is never
        # consulted in this variant — attention only appears in the middle
        scale = 1.0

        # init block — 12 input channels (patchified 2x2 RGB)
        self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)

        # downsample blocks
        downsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            t_down_flag = (
                temperal_downsample[i] if i < len(temperal_downsample) else False
            )
            downsamples.append(
                Down_ResidualBlock(
                    in_dim=in_dim,
                    out_dim=out_dim,
                    dropout=dropout,
                    mult=num_res_blocks,
                    temperal_downsample=t_down_flag,
                    down_flag=i != len(dim_mult) - 1,
                )
            )
            scale /= 2.0
        self.downsamples = nn.Sequential(*downsamples)

        # middle blocks
        self.middle = nn.Sequential(
            ResidualBlock(out_dim, out_dim, dropout),
            AttentionBlock(out_dim),
            ResidualBlock(out_dim, out_dim, dropout),
        )

        # # output blocks
        self.head = nn.Sequential(
            RMS_norm(out_dim, images=False),
            nn.SiLU(),
            CausalConv3d(out_dim, z_dim, 3, padding=1),
        )


    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # conv stem — cache the trailing CACHE_T frames for the next chunk
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat(
                    [
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
                        cache_x,
                    ],
                    dim=2,
                )
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        ## downsamples
        for layer in self.downsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## middle
        for layer in self.middle:
            if isinstance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## head
        for layer in self.head:
            if isinstance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat(
                        [
                            feat_cache[idx][:, :, -1, :, :]
                            .unsqueeze(2)
                            .to(cache_x.device),
                            cache_x,
                        ],
                        dim=2,
                    )
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)

        return x
|
| 739 |
+
|
| 740 |
+
|
| 741 |
+
class Decoder3d(nn.Module):
    """3D causal decoder mirroring Encoder3d: conv stem from z_dim, middle
    block, upsampling stages, and a 3-channel RGB head.

    forward() supports chunked streaming via (feat_cache, feat_idx); see
    Encoder3d for the caching protocol.
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_upsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_upsample = temperal_upsample

        # dimensions
        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
        # decoding starts at the coarsest resolution and doubles per stage
        scale = 1.0 / 2**(len(dim_mult) - 2)

        # init block
        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)

        # middle blocks
        self.middle = nn.Sequential(ResidualBlock(dims[0], dims[0], dropout),
                                    AttentionBlock(dims[0]),
                                    ResidualBlock(dims[0], dims[0], dropout))

        # upsample blocks
        upsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            # residual (+attention) blocks
            # Resample's upsample modes halve the channel count, so stages
            # after the first receive half the nominal in_dim
            if i == 1 or i == 2 or i == 3:
                in_dim = in_dim // 2
            for _ in range(num_res_blocks + 1):
                upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
                if scale in attn_scales:
                    upsamples.append(AttentionBlock(out_dim))
                in_dim = out_dim

            # upsample block
            if i != len(dim_mult) - 1:
                mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
                upsamples.append(Resample(out_dim, mode=mode))
                scale *= 2.0
        self.upsamples = nn.Sequential(*upsamples)

        # output blocks
        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
                                  CausalConv3d(out_dim, 3, 3, padding=1))

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        ## conv1
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat([
                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                        cache_x.device), cache_x
                ],
                                    dim=2)
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        ## middle
        for layer in self.middle:
            if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## upsamples
        for layer in self.upsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## head
        for layer in self.head:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat([
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                            cache_x.device), cache_x
                    ],
                                        dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x
|
| 844 |
+
|
| 845 |
+
|
| 846 |
+
|
| 847 |
+
class Decoder3d_38(nn.Module):
    """Decoder for the 3.8 VAE: mirrors Encoder3d_38 using Up_ResidualBlock
    stages (residual path + DupUp3D shortcut) and a 12-channel head (2x2
    patchified RGB, to be unpatchified by the caller).

    forward() supports chunked streaming via (feat_cache, feat_idx);
    ``first_chunk`` lets the DupUp3D shortcuts trim their duplicated leading
    frames on the first chunk.
    """

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_upsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_upsample = temperal_upsample

        # dimensions
        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
        # NOTE(review): `scale` is unused past initialization in this variant
        scale = 1.0 / 2 ** (len(dim_mult) - 2)
        # init block
        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)

        # middle blocks
        self.middle = nn.Sequential(ResidualBlock(dims[0], dims[0], dropout),
                                    AttentionBlock(dims[0]),
                                    ResidualBlock(dims[0], dims[0], dropout))

        # upsample blocks
        upsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            t_up_flag = temperal_upsample[i] if i < len(temperal_upsample) else False
            upsamples.append(
                Up_ResidualBlock(in_dim=in_dim,
                                 out_dim=out_dim,
                                 dropout=dropout,
                                 mult=num_res_blocks + 1,
                                 temperal_upsample=t_up_flag,
                                 up_flag=i != len(dim_mult) - 1))
        self.upsamples = nn.Sequential(*upsamples)

        # output blocks — 12 channels: patchified 2x2 RGB
        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
                                  CausalConv3d(out_dim, 12, 3, padding=1))


    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
        # conv stem — cache the trailing CACHE_T frames for the next chunk
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat(
                    [
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
                        cache_x,
                    ],
                    dim=2,
                )
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        for layer in self.middle:
            if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        ## upsamples
        for layer in self.upsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx, first_chunk)
            else:
                x = layer(x)

        ## head
        for layer in self.head:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat(
                        [
                            feat_cache[idx][:, :, -1, :, :]
                            .unsqueeze(2)
                            .to(cache_x.device),
                            cache_x,
                        ],
                        dim=2,
                    )
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x
|
| 946 |
+
|
| 947 |
+
|
| 948 |
+
def count_conv3d(model):
    """Return how many CausalConv3d modules *model* contains (recursively)."""
    return sum(1 for module in model.modules() if isinstance(module, CausalConv3d))
|
| 954 |
+
|
| 955 |
+
|
| 956 |
+
class VideoVAE_(nn.Module):
    """Causal 3D video VAE backbone (Wan 2.1 style, 8x spatial compression).

    Encoding processes the first frame alone and then temporal chunks of 4
    frames; decoding runs one latent frame at a time. Both directions carry
    per-conv feature caches (see clear_cache) so each chunk gets causal
    temporal context from the previous one.
    """

    def __init__(self,
                 dim=96,
                 z_dim=16,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample
        # Decoder upsamples in the reverse order of the encoder's downsampling.
        self.temperal_upsample = temperal_downsample[::-1]

        # modules
        self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
                                 attn_scales, self.temperal_downsample, dropout)
        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
        self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
                                 attn_scales, self.temperal_upsample, dropout)

    def forward(self, x):
        # NOTE(review): encode()/decode() require a `scale` argument, so this
        # classic-VAE forward raises TypeError if invoked; it looks like dead
        # template code kept for reference.
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        x_recon = self.decode(z)
        return x_recon, mu, log_var

    def encode(self, x, scale):
        """Encode video x [b,c,t,h,w] into the normalized latent mean.

        scale is [mean, 1/std] (per-channel tensors, or scalars); the
        returned latents are (mu - mean) * inv_std.
        """
        self.clear_cache()
        ## cache
        t = x.shape[2]
        # First frame alone, then chunks of 4 frames.
        iter_ = 1 + (t - 1) // 4

        for i in range(iter_):
            self._enc_conv_idx = [0]
            if i == 0:
                out = self.encoder(x[:, :, :1, :, :],
                                   feat_cache=self._enc_feat_map,
                                   feat_idx=self._enc_conv_idx)
            else:
                out_ = self.encoder(x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
                                    feat_cache=self._enc_feat_map,
                                    feat_idx=self._enc_conv_idx)
                out = torch.cat([out, out_], 2)
        mu, log_var = self.conv1(out).chunk(2, dim=1)
        if isinstance(scale[0], torch.Tensor):
            scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
            mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
                1, self.z_dim, 1, 1, 1)
        else:
            # Non-tensor branch: assumes `scale` supports .to() and indexing —
            # presumably a 2-row tensor; TODO confirm against callers.
            scale = scale.to(dtype=mu.dtype, device=mu.device)
            mu = (mu - scale[0]) * scale[1]
        return mu

    def decode(self, z, scale):
        """De-normalize latents (z / inv_std + mean) and decode frame-by-frame."""
        self.clear_cache()
        # z: [b,c,t,h,w]
        if isinstance(scale[0], torch.Tensor):
            scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
            z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
                1, self.z_dim, 1, 1, 1)
        else:
            scale = scale.to(dtype=z.dtype, device=z.device)
            z = z / scale[1] + scale[0]
        iter_ = z.shape[2]
        x = self.conv2(z)
        for i in range(iter_):
            self._conv_idx = [0]
            if i == 0:
                out = self.decoder(x[:, :, i:i + 1, :, :],
                                   feat_cache=self._feat_map,
                                   feat_idx=self._conv_idx)
            else:
                out_ = self.decoder(x[:, :, i:i + 1, :, :],
                                    feat_cache=self._feat_map,
                                    feat_idx=self._conv_idx)
                out = torch.cat([out, out_], 2)  # may add tensor offload
        return out

    def reparameterize(self, mu, log_var):
        """Standard VAE reparameterization: mu + sigma * eps."""
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return eps * std + mu

    def sample(self, imgs, deterministic=False):
        # NOTE(review): like forward(), this calls encode() without the
        # required `scale` argument and would raise TypeError if used.
        mu, log_var = self.encode(imgs)
        if deterministic:
            return mu
        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
        return mu + std * torch.randn_like(std)

    def clear_cache(self):
        """Reset the per-conv temporal caches for both encoder and decoder."""
        self._conv_num = count_conv3d(self.decoder)
        self._conv_idx = [0]
        self._feat_map = [None] * self._conv_num
        # cache encode
        self._enc_conv_num = count_conv3d(self.encoder)
        self._enc_conv_idx = [0]
        self._enc_feat_map = [None] * self._enc_conv_num
|
| 1061 |
+
|
| 1062 |
+
|
| 1063 |
+
class WanVideoVAE(nn.Module):
    """Frozen wrapper around VideoVAE_ adding latent normalization and tiled
    encode/decode with feathered (linear-ramp) blending of overlapping tiles.

    Bugfix vs. the original: encode() used to re-multiply tile_size and
    tile_stride by upsampling_factor on every iteration of the per-video
    loop, so tiled encoding of more than one video used exponentially
    growing tile geometry. The conversion now happens exactly once.
    """

    def __init__(self, z_dim=16):
        super().__init__()

        # Per-channel latent statistics of the pretrained VAE; latents are
        # normalized as (mu - mean) * (1/std) via scale = [mean, 1/std].
        mean = [
            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
        ]
        std = [
            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
        ]
        self.mean = torch.tensor(mean)
        self.std = torch.tensor(std)
        self.scale = [self.mean, 1.0 / self.std]

        # init model (eval-only, gradients disabled)
        self.model = VideoVAE_(z_dim=z_dim).eval().requires_grad_(False)
        # Spatial pixels per latent cell.
        self.upsampling_factor = 8
        self.z_dim = z_dim

    def build_1d_mask(self, length, left_bound, right_bound, border_width):
        """1D feather ramp: ones inside, a linear ramp of `border_width`
        samples at any edge that is not a true frame boundary."""
        x = torch.ones((length,))
        if not left_bound:
            x[:border_width] = (torch.arange(border_width) + 1) / border_width
        if not right_bound:
            x[-border_width:] = torch.flip((torch.arange(border_width) + 1) / border_width, dims=(0,))
        return x

    def build_mask(self, data, is_bound, border_width):
        """2D blending mask (element-wise min of the H and W ramps), shaped
        1x1x1xHxW so it broadcasts over [b,c,t,h,w] tiles."""
        _, _, _, H, W = data.shape
        h = self.build_1d_mask(H, is_bound[0], is_bound[1], border_width[0])
        w = self.build_1d_mask(W, is_bound[2], is_bound[3], border_width[1])

        h = repeat(h, "H -> H W", H=H, W=W)
        w = repeat(w, "W -> H W", H=H, W=W)

        mask = torch.stack([h, w]).min(dim=0).values
        mask = rearrange(mask, "H W -> 1 1 1 H W")
        return mask

    def _split_tasks(self, H, W, size_h, size_w, stride_h, stride_w):
        """Enumerate tile windows (h0, h1, w0, w1) covering an H x W plane.

        A window is skipped when the previous stride position already
        covered the remainder of the axis (avoids redundant edge tiles).
        """
        tasks = []
        for h in range(0, H, stride_h):
            if (h - stride_h >= 0 and h - stride_h + size_h >= H):
                continue
            for w in range(0, W, stride_w):
                if (w - stride_w >= 0 and w - stride_w + size_w >= W):
                    continue
                tasks.append((h, h + size_h, w, w + size_w))
        return tasks

    def tiled_decode(self, hidden_states, device, tile_size, tile_stride):
        """Decode latents tile-by-tile (tile geometry in latent units).

        Tiles are decoded on `device`, accumulated on CPU with feathered
        masks, then normalized by the accumulated mask weight.
        """
        _, _, T, H, W = hidden_states.shape
        size_h, size_w = tile_size
        stride_h, stride_w = tile_stride

        # Split tasks
        tasks = self._split_tasks(H, W, size_h, size_w, stride_h, stride_w)

        data_device = "cpu"
        computation_device = device

        # Causal temporal upsampling: 1 + 4*(T-1) output frames.
        out_T = T * 4 - 3
        up = self.upsampling_factor
        weight = torch.zeros((1, 1, out_T, H * up, W * up), dtype=hidden_states.dtype, device=data_device)
        values = torch.zeros((1, 3, out_T, H * up, W * up), dtype=hidden_states.dtype, device=data_device)

        for h, h_, w, w_ in tqdm(tasks, desc="VAE decoding"):
            hidden_states_batch = hidden_states[:, :, :, h:h_, w:w_].to(computation_device)
            hidden_states_batch = self.model.decode(hidden_states_batch, self.scale).to(data_device)

            mask = self.build_mask(
                hidden_states_batch,
                is_bound=(h == 0, h_ >= H, w == 0, w_ >= W),
                border_width=((size_h - stride_h) * up, (size_w - stride_w) * up)
            ).to(dtype=hidden_states.dtype, device=data_device)

            target_h = h * up
            target_w = w * up
            values[
                :,
                :,
                :,
                target_h:target_h + hidden_states_batch.shape[3],
                target_w:target_w + hidden_states_batch.shape[4],
            ] += hidden_states_batch * mask
            weight[
                :,
                :,
                :,
                target_h:target_h + hidden_states_batch.shape[3],
                target_w:target_w + hidden_states_batch.shape[4],
            ] += mask
        values = values / weight
        values = values.clamp_(-1, 1)
        return values

    def tiled_encode(self, video, device, tile_size, tile_stride):
        """Encode video tile-by-tile (tile geometry in pixel units),
        blending overlapping latent tiles with feathered masks."""
        _, _, T, H, W = video.shape
        size_h, size_w = tile_size
        stride_h, stride_w = tile_stride

        # Split tasks
        tasks = self._split_tasks(H, W, size_h, size_w, stride_h, stride_w)

        data_device = "cpu"
        computation_device = device

        # Causal temporal downsampling: ceil-like frame grouping.
        out_T = (T + 3) // 4
        down = self.upsampling_factor
        weight = torch.zeros((1, 1, out_T, H // down, W // down), dtype=video.dtype, device=data_device)
        values = torch.zeros((1, self.z_dim, out_T, H // down, W // down), dtype=video.dtype, device=data_device)

        for h, h_, w, w_ in tqdm(tasks, desc="VAE encoding"):
            hidden_states_batch = video[:, :, :, h:h_, w:w_].to(computation_device)
            hidden_states_batch = self.model.encode(hidden_states_batch, self.scale).to(data_device)

            mask = self.build_mask(
                hidden_states_batch,
                is_bound=(h == 0, h_ >= H, w == 0, w_ >= W),
                border_width=((size_h - stride_h) // down, (size_w - stride_w) // down)
            ).to(dtype=video.dtype, device=data_device)

            target_h = h // down
            target_w = w // down
            values[
                :,
                :,
                :,
                target_h:target_h + hidden_states_batch.shape[3],
                target_w:target_w + hidden_states_batch.shape[4],
            ] += hidden_states_batch * mask
            weight[
                :,
                :,
                :,
                target_h:target_h + hidden_states_batch.shape[3],
                target_w:target_w + hidden_states_batch.shape[4],
            ] += mask
        values = values / weight
        return values

    def single_encode(self, video, device):
        """Encode one whole video on `device` without tiling."""
        video = video.to(device)
        x = self.model.encode(video, self.scale)
        return x

    def single_decode(self, hidden_state, device):
        """Decode one whole latent video on `device`; output clamped to [-1, 1]."""
        hidden_state = hidden_state.to(device)
        video = self.model.decode(hidden_state, self.scale)
        return video.clamp_(-1, 1)

    def encode(self, videos, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
        """Encode a sequence of videos [c,t,h,w] into stacked latents.

        tile_size / tile_stride are expressed in latent units and converted
        to pixel units exactly once (bugfix — see class docstring).
        """
        videos = [video.to("cpu") for video in videos]
        if tiled:
            px_tile_size = (tile_size[0] * self.upsampling_factor,
                            tile_size[1] * self.upsampling_factor)
            px_tile_stride = (tile_stride[0] * self.upsampling_factor,
                              tile_stride[1] * self.upsampling_factor)
        hidden_states = []
        for video in videos:
            video = video.unsqueeze(0)
            if tiled:
                hidden_state = self.tiled_encode(video, device, px_tile_size, px_tile_stride)
            else:
                hidden_state = self.single_encode(video, device)
            hidden_state = hidden_state.squeeze(0)
            hidden_states.append(hidden_state)
        hidden_states = torch.stack(hidden_states)
        return hidden_states

    def decode(self, hidden_states, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
        """Decode a sequence of latents [c,t,h,w] into stacked videos."""
        hidden_states = [hidden_state.to("cpu") for hidden_state in hidden_states]
        videos = []
        for hidden_state in hidden_states:
            hidden_state = hidden_state.unsqueeze(0)
            if tiled:
                video = self.tiled_decode(hidden_state, device, tile_size, tile_stride)
            else:
                video = self.single_decode(hidden_state, device)
            video = video.squeeze(0)
            videos.append(video)
        videos = torch.stack(videos)
        return videos

    @staticmethod
    def state_dict_converter():
        """Return the converter that maps raw checkpoint keys onto this wrapper."""
        return WanVideoVAEStateDictConverter()
|
| 1258 |
+
|
| 1259 |
+
|
| 1260 |
+
class WanVideoVAEStateDictConverter:
    """Maps raw checkpoint state dicts onto WanVideoVAE's key layout."""

    def __init__(self):
        pass

    def from_civitai(self, state_dict):
        """Prefix every weight name with 'model.', unwrapping an optional
        top-level 'model_state' container first."""
        source = state_dict.get('model_state', state_dict)
        return {'model.' + key: tensor for key, tensor in source.items()}
|
| 1272 |
+
|
| 1273 |
+
|
| 1274 |
+
class VideoVAE38_(VideoVAE_):
    """Wan 2.2 VAE backbone: 2x2 pixel patchification, 48-channel latents,
    and the 38-style encoder/decoder (Encoder3d_38 / Decoder3d_38)."""

    def __init__(self,
                 dim=160,
                 z_dim=48,
                 dec_dim=256,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[False, True, True],
                 dropout=0.0):
        # Deliberately skip VideoVAE_.__init__ (it would build the 2.1
        # encoder/decoder); initialize nn.Module directly instead.
        super(VideoVAE_, self).__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample
        self.temperal_upsample = temperal_downsample[::-1]

        # modules (decoder gets its own width, dec_dim)
        self.encoder = Encoder3d_38(dim, z_dim * 2, dim_mult, num_res_blocks,
                                    attn_scales, self.temperal_downsample, dropout)
        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
        self.decoder = Decoder3d_38(dec_dim, z_dim, dim_mult, num_res_blocks,
                                    attn_scales, self.temperal_upsample, dropout)

    def encode(self, x, scale):
        """Patchify (2x2), encode in causal temporal chunks, and return the
        normalized latent mean; caches are cleared before and after."""
        self.clear_cache()
        x = patchify(x, patch_size=2)
        t = x.shape[2]
        # First frame alone, then chunks of 4 frames.
        iter_ = 1 + (t - 1) // 4
        for i in range(iter_):
            self._enc_conv_idx = [0]
            if i == 0:
                out = self.encoder(x[:, :, :1, :, :],
                                   feat_cache=self._enc_feat_map,
                                   feat_idx=self._enc_conv_idx)
            else:
                out_ = self.encoder(x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
                                    feat_cache=self._enc_feat_map,
                                    feat_idx=self._enc_conv_idx)
                out = torch.cat([out, out_], 2)
        mu, log_var = self.conv1(out).chunk(2, dim=1)
        if isinstance(scale[0], torch.Tensor):
            scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
            mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
                1, self.z_dim, 1, 1, 1)
        else:
            scale = scale.to(dtype=mu.dtype, device=mu.device)
            mu = (mu - scale[0]) * scale[1]
        self.clear_cache()
        return mu

    def decode(self, z, scale):
        """De-normalize latents, decode frame-by-frame (first_chunk=True on
        the first frame), then unpatchify (2x2) back to pixel space."""
        self.clear_cache()
        if isinstance(scale[0], torch.Tensor):
            scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
            z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
                1, self.z_dim, 1, 1, 1)
        else:
            scale = scale.to(dtype=z.dtype, device=z.device)
            z = z / scale[1] + scale[0]
        iter_ = z.shape[2]
        x = self.conv2(z)
        for i in range(iter_):
            self._conv_idx = [0]
            if i == 0:
                out = self.decoder(x[:, :, i:i + 1, :, :],
                                   feat_cache=self._feat_map,
                                   feat_idx=self._conv_idx,
                                   first_chunk=True)
            else:
                out_ = self.decoder(x[:, :, i:i + 1, :, :],
                                    feat_cache=self._feat_map,
                                    feat_idx=self._conv_idx)
                out = torch.cat([out, out_], 2)
        out = unpatchify(out, patch_size=2)
        self.clear_cache()
        return out
|
| 1357 |
+
|
| 1358 |
+
|
| 1359 |
+
class WanVideoVAE38(WanVideoVAE):
    """Wan 2.2 variant of the wrapper: 48-channel latents, 16x spatial
    compression, VideoVAE38_ backbone. Tiling logic is inherited from
    WanVideoVAE unchanged."""

    def __init__(self, z_dim=48, dim=160):
        # Deliberately skip WanVideoVAE.__init__ (it would build the
        # 16-channel model); initialize nn.Module directly.
        super(WanVideoVAE, self).__init__()

        # Per-channel latent statistics of the 48-channel pretrained VAE.
        mean = [
            -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
            -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
            -0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502,
            -0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.1230,
            -0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.0520, 0.3748,
            0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667
        ]
        std = [
            0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013,
            0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978,
            0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659,
            0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093,
            0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887,
            0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744
        ]
        self.mean = torch.tensor(mean)
        self.std = torch.tensor(std)
        self.scale = [self.mean, 1.0 / self.std]

        # init model (frozen, eval-only); 16 = 8x VAE x 2x patchification
        self.model = VideoVAE38_(z_dim=z_dim, dim=dim).eval().requires_grad_(False)
        self.upsampling_factor = 16
        self.z_dim = z_dim
|
| 1388 |
+
|
| 1389 |
+
|
| 1390 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 1391 |
+
# Diffusers-compatible wrapper (formerly kiwi_vae.py)
|
| 1392 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 1393 |
+
|
| 1394 |
+
@dataclass
class LatentDist:
    """Minimal stand-in for a diffusers latent distribution.

    Holds a single mean tensor; 'sampling' is fully deterministic.
    """

    mu: torch.Tensor

    def sample(self):
        """Return the stored mean — no noise is ever added."""
        return self.mu
|
| 1400 |
+
|
| 1401 |
+
|
| 1402 |
+
@dataclass
class EncoderOutput:
    """Container matching diffusers' encoder output shape: callers read
    `.latent_dist.sample()`."""
    latent_dist: LatentDist
|
| 1405 |
+
|
| 1406 |
+
|
| 1407 |
+
@dataclass
class DecoderOutput:
    """Container matching diffusers' decoder output shape: callers read
    `.sample`."""

    sample: torch.Tensor
|
| 1410 |
+
|
| 1411 |
+
|
| 1412 |
+
class VAE(VideoVAE_, ModelMixin, ConfigMixin):
    """
    Diffusers-compatible VAE wrapper around the original Wan VideoVAE.
    Loads weights directly from diffusion_pytorch_model.safetensors.
    """

    @register_to_config
    def __init__(
        self,
        z_dim: int = 48,
        dim: int = 160,
        dim_mult: List[int] = [1, 2, 4, 4],
        num_res_blocks: int = 2,
        attn_scales: List[float] = [],
        temperal_downsample: List[bool] = [False, True, True],
        dropout: float = 0.0,
        vae_pth: str = "wan_vae.pth",
        latents_mean: Optional[List[float]] = None,
        latents_std: Optional[List[float]] = None,
    ):
        # Build the actual VAE backbone so diffusers can load weights without mismatch.
        # z_dim == 48 selects the Wan 2.2 backbone (patchified, 16x spatial);
        # any other z_dim falls back to the Wan 2.1 backbone (8x spatial).
        if z_dim == 48:
            VideoVAE38_.__init__(
                self,
                dim=dim,
                z_dim=z_dim,
                dim_mult=dim_mult,
                num_res_blocks=num_res_blocks,
                attn_scales=attn_scales,
                temperal_downsample=temperal_downsample,
                dropout=dropout,
            )
            self._use_38 = True
            self.upsampling_factor = 16
        else:
            VideoVAE_.__init__(
                self,
                dim=dim,
                z_dim=z_dim,
                dim_mult=dim_mult,
                num_res_blocks=num_res_blocks,
                attn_scales=attn_scales,
                temperal_downsample=temperal_downsample,
                dropout=dropout,
            )
            self._use_38 = False
            self.upsampling_factor = 8

        # Keep for config compatibility; weights are loaded by diffusers.
        self._vae_pth = vae_pth
        self.z_dim = z_dim

        # Build latent normalization scale: [mean, 1/std]
        if latents_mean is not None and latents_std is not None:
            mean = torch.tensor(latents_mean)
            std = torch.tensor(latents_std)
            self._scale = [mean, 1.0 / std]
        else:
            # No stats configured: identity normalization (zero mean, unit std).
            self._scale = [torch.zeros(z_dim), torch.ones(z_dim)]

    def encode(self, x):
        """Encode pixels to latents; returns a diffusers-style EncoderOutput
        whose `.latent_dist.sample()` yields the (deterministic) mean."""
        # Match the module's parameter dtype before running the backbone.
        x = x.to(dtype=next(self.parameters()).dtype)
        if self._use_38:
            mu = VideoVAE38_.encode(self, x, self._scale)
        else:
            mu = VideoVAE_.encode(self, x, self._scale)
        return EncoderOutput(latent_dist=LatentDist(mu=mu))

    def decode(self, z):
        """Decode latents to pixels; returns a diffusers-style DecoderOutput."""
        z = z.to(dtype=next(self.parameters()).dtype)
        if self._use_38:
            out = VideoVAE38_.decode(self, z, self._scale)
        else:
            out = VideoVAE_.decode(self, z, self._scale)
        return DecoderOutput(sample=out)
|
models/rife/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
models/rife/._.DS_Store
ADDED
|
Binary file (212 Bytes). View file
|
|
|
models/rife/._IFNet_HDv3.cpython-311.pyc
ADDED
|
Binary file (212 Bytes). View file
|
|
|
models/rife/._IFNet_HDv3.py
ADDED
|
Binary file (212 Bytes). View file
|
|
|
models/rife/._RIFE_HDv3.cpython-311.pyc
ADDED
|
Binary file (212 Bytes). View file
|
|
|
models/rife/._RIFE_HDv3.py
ADDED
|
Binary file (576 Bytes). View file
|
|
|
models/rife/._RIFEv4.26_0921
ADDED
|
Binary file (212 Bytes). View file
|
|
|
models/rife/.___pycache__
ADDED
|
Binary file (212 Bytes). View file
|
|
|
models/rife/._flownet.pkl
ADDED
|
Binary file (312 Bytes). View file
|
|
|
models/rife/._refine.py
ADDED
|
Binary file (212 Bytes). View file
|
|
|
models/rife/IFNet_HDv3.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from model.warplayer import warp
|
| 5 |
+
# from train_log.refine import *
|
| 6 |
+
|
| 7 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 8 |
+
|
| 9 |
+
def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
    """Conv2d (with bias) followed by LeakyReLU(0.2, inplace)."""
    layer = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        bias=True,
    )
    return nn.Sequential(layer, nn.LeakyReLU(0.2, True))
|
| 15 |
+
|
| 16 |
+
def conv_bn(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
    """Conv2d (no bias) -> BatchNorm2d -> LeakyReLU(0.2, inplace)."""
    layer = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        bias=False,  # BatchNorm supplies the affine shift
    )
    return nn.Sequential(layer, nn.BatchNorm2d(out_planes), nn.LeakyReLU(0.2, True))
|
| 23 |
+
|
| 24 |
+
class Head(nn.Module):
    """Shallow feature pyramid: three 3x3 convs (the first downsamples 2x)
    and a transposed conv back to input resolution with 4 channels."""

    def __init__(self):
        super(Head, self).__init__()
        self.cnn0 = nn.Conv2d(3, 16, 3, 2, 1)
        self.cnn1 = nn.Conv2d(16, 16, 3, 1, 1)
        self.cnn2 = nn.Conv2d(16, 16, 3, 1, 1)
        self.cnn3 = nn.ConvTranspose2d(16, 4, 4, 2, 1)
        self.relu = nn.LeakyReLU(0.2, True)

    def forward(self, x, feat=False):
        """Run the stack; with feat=True return the per-stage tensors too."""
        stage_outputs = []
        h = x
        for stage in (self.cnn0, self.cnn1, self.cnn2):
            h = stage(h)
            stage_outputs.append(h)
            # relu is inplace, so the appended tensors end up activated —
            # identical aliasing to the original implementation.
            h = self.relu(h)
        final = self.cnn3(h)
        if feat:
            return stage_outputs + [final]
        return final
|
| 44 |
+
|
| 45 |
+
class ResConv(nn.Module):
    """Residual 3x3 conv with a learnable per-channel scale on the branch:
    out = LeakyReLU(conv(x) * beta + x)."""

    def __init__(self, c, dilation=1):
        super(ResConv, self).__init__()
        self.conv = nn.Conv2d(c, c, 3, 1, dilation, dilation=dilation, groups=1\
)
        self.beta = nn.Parameter(torch.ones((1, c, 1, 1)), requires_grad=True)
        self.relu = nn.LeakyReLU(0.2, True)

    def forward(self, x):
        branch = self.conv(x) * self.beta
        return self.relu(branch + x)
|
| 55 |
+
|
| 56 |
+
class IFBlock(nn.Module):
    """One coarse-to-fine refinement stage of IFNet: predicts a flow update,
    a blend mask and carry-over features at a given pyramid scale."""

    def __init__(self, in_planes, c=64):
        super(IFBlock, self).__init__()
        # 4x spatial downsampling front-end (two stride-2 convs).
        self.conv0 = nn.Sequential(
            conv(in_planes, c//2, 3, 2, 1),
            conv(c//2, c, 3, 2, 1),
        )
        # Residual trunk.
        self.convblock = nn.Sequential(
            ResConv(c),
            ResConv(c),
            ResConv(c),
            ResConv(c),
            ResConv(c),
            ResConv(c),
            ResConv(c),
            ResConv(c),
        )
        # Back to front-end resolution: transposed conv (2x) + pixel shuffle
        # (2x). 13 output channels = 4 flow + 1 mask + 8 feature.
        self.lastconv = nn.Sequential(
            nn.ConvTranspose2d(c, 4*13, 4, 2, 1),
            nn.PixelShuffle(2)
        )

    def forward(self, x, flow=None, scale=1):
        """Predict (flow, mask, feat) after downscaling inputs by `scale`.

        Flow magnitudes are divided by `scale` on the way down and multiplied
        back on the way up so they stay in original-pixel units.
        """
        x = F.interpolate(x, scale_factor= 1. / scale, mode="bilinear", align_corners=False)
        if flow is not None:
            flow = F.interpolate(flow, scale_factor= 1. / scale, mode="bilinear", align_corners=False) * 1. / scale
            x = torch.cat((x, flow), 1)
        feat = self.conv0(x)
        feat = self.convblock(feat)
        tmp = self.lastconv(feat)
        tmp = F.interpolate(tmp, scale_factor=scale, mode="bilinear", align_corners=False)
        flow = tmp[:, :4] * scale  # bidirectional flow (2 channels each way)
        mask = tmp[:, 4:5]         # pre-sigmoid blending mask
        feat = tmp[:, 5:]          # features passed to the next stage
        return flow, mask, feat
|
| 91 |
+
|
| 92 |
+
class IFNet(nn.Module):
    """RIFE flow network (v4.25-era): five coarse-to-fine IFBlocks plus a
    shared feature Head. Produces per-scale flows, a final blend mask and
    the warped/blended intermediate frame."""

    def __init__(self):
        super(IFNet, self).__init__()
        # in_planes: 7 = 2x3 RGB + 1 timestep map; +8 = two 4-ch Head features.
        self.block0 = IFBlock(7+8, c=192)
        # Later stages additionally take the previous mask (1) and feat (8),
        # plus the current flow (4) appended inside IFBlock.forward.
        self.block1 = IFBlock(8+4+8+8, c=128)
        self.block2 = IFBlock(8+4+8+8, c=96)
        self.block3 = IFBlock(8+4+8+8, c=64)
        self.block4 = IFBlock(8+4+8+8, c=32)
        self.encode = Head()

        # not used during inference
        '''
        self.teacher = IFBlock(8+4+8+3+8, c=64)
        self.caltime = nn.Sequential(
            nn.Conv2d(16+9, 8, 3, 2, 1),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(32, 64, 3, 2, 1),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.LeakyReLU(0.2, True),
            nn.Conv2d(64, 1, 3, 1, 1),
            nn.Sigmoid()
        )
        '''

    def forward(self, x, timestep=0.5, scale_list=[8, 4, 2, 1], training=False, fastmode=True, ensemble=False):
        """Interpolate between the two frames concatenated channel-wise in x.

        NOTE(review): the loop below reads scale_list[i] for i in range(5),
        but the default scale_list has only 4 entries — the default would
        raise IndexError, so callers presumably pass a 5-element list
        (e.g. [16, 8, 4, 2, 1]); confirm against call sites.
        NOTE(review): img0/img1 are only bound on the training == False
        path; calling with training=True would raise NameError here.
        """
        if training == False:
            channel = x.shape[1] // 2
            img0 = x[:, :channel]
            img1 = x[:, channel:]
        if not torch.is_tensor(timestep):
            # Broadcast a scalar timestep into a [b,1,h,w] map.
            timestep = (x[:, :1].clone() * 0 + 1) * timestep
        else:
            timestep = timestep.repeat(1, 1, img0.shape[2], img0.shape[3])
        f0 = self.encode(img0[:, :3])
        f1 = self.encode(img1[:, :3])
        flow_list = []
        merged = []
        mask_list = []
        warped_img0 = img0
        warped_img1 = img1
        flow = None
        mask = None
        loss_cons = 0
        block = [self.block0, self.block1, self.block2, self.block3, self.block4]
        for i in range(5):
            if flow is None:
                # First stage: raw frames + encoded features + timestep map.
                flow, mask, feat = block[i](torch.cat((img0[:, :3], img1[:, :3], f0, f1, timestep), 1), None, scale=scale_list[i])
                if ensemble:
                    print("warning: ensemble is not supported since RIFEv4.21")
            else:
                # Later stages: refine using warped inputs/features plus the
                # previous stage's mask and carry-over features.
                wf0 = warp(f0, flow[:, :2])
                wf1 = warp(f1, flow[:, 2:4])
                fd, m0, feat = block[i](torch.cat((warped_img0[:, :3], warped_img1[:, :3], wf0, wf1, timestep, mask, feat), 1), flow, scale=scale_list[i])
                if ensemble:
                    print("warning: ensemble is not supported since RIFEv4.21")
                else:
                    mask = m0
                    flow = flow + fd
            mask_list.append(mask)
            flow_list.append(flow)
            warped_img0 = warp(img0, flow[:, :2])
            warped_img1 = warp(img1, flow[:, 2:4])
            merged.append((warped_img0, warped_img1))
        mask = torch.sigmoid(mask)
        # Final frame: sigmoid-blended combination of both warped inputs.
        merged[4] = (warped_img0 * mask + warped_img1 * (1 - mask))
        if not fastmode:
            print('contextnet is removed')
            '''
            c0 = self.contextnet(img0, flow[:, :2])
            c1 = self.contextnet(img1, flow[:, 2:4])
            tmp = self.unet(img0, img1, warped_img0, warped_img1, mask, flow, c0, c1)
            res = tmp[:, :3] * 2 - 1
            merged[4] = torch.clamp(merged[4] + res, 0, 1)
            '''
        # NOTE(review): mask_list[4] was appended before the sigmoid above,
        # so the returned mask is pre-sigmoid — confirm callers expect that.
        return flow_list, mask_list[4], merged
|
models/rife/RIFE_HDv3.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import numpy as np
|
| 4 |
+
from torch.optim import AdamW
|
| 5 |
+
import torch.optim as optim
|
| 6 |
+
import itertools
|
| 7 |
+
from model.warplayer import warp
|
| 8 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 9 |
+
from IFNet_HDv3 import *
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
from model.loss import *
|
| 12 |
+
|
| 13 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 14 |
+
|
| 15 |
+
class Model:
|
| 16 |
+
def __init__(self, local_rank=-1):
|
| 17 |
+
self.flownet = IFNet()
|
| 18 |
+
self.device()
|
| 19 |
+
self.optimG = AdamW(self.flownet.parameters(), lr=1e-6, weight_decay=1e-4)
|
| 20 |
+
self.epe = EPE()
|
| 21 |
+
self.version = 4.25
|
| 22 |
+
# self.vgg = VGGPerceptualLoss().to(device)
|
| 23 |
+
self.sobel = SOBEL()
|
| 24 |
+
if local_rank != -1:
|
| 25 |
+
self.flownet = DDP(self.flownet, device_ids=[local_rank], output_device=local_rank)
|
| 26 |
+
|
| 27 |
+
def train(self):
|
| 28 |
+
self.flownet.train()
|
| 29 |
+
|
| 30 |
+
def eval(self):
|
| 31 |
+
self.flownet.eval()
|
| 32 |
+
|
| 33 |
+
def device(self):
|
| 34 |
+
self.flownet.to(device)
|
| 35 |
+
|
| 36 |
+
def load_model(self, path, rank=0):
|
| 37 |
+
def convert(param):
|
| 38 |
+
if rank == -1:
|
| 39 |
+
return {
|
| 40 |
+
k.replace("module.", ""): v
|
| 41 |
+
for k, v in param.items()
|
| 42 |
+
if "module." in k
|
| 43 |
+
}
|
| 44 |
+
else:
|
| 45 |
+
return param
|
| 46 |
+
if rank <= 0:
|
| 47 |
+
if torch.cuda.is_available():
|
| 48 |
+
self.flownet.load_state_dict(convert(torch.load('{}/flownet.pkl'.format(path))), False)
|
| 49 |
+
else:
|
| 50 |
+
self.flownet.load_state_dict(convert(torch.load('{}/flownet.pkl'.format(path), map_location ='cpu')), False)
|
| 51 |
+
|
| 52 |
+
def save_model(self, path, rank=0):
|
| 53 |
+
if rank == 0:
|
| 54 |
+
torch.save(self.flownet.state_dict(),'{}/flownet.pkl'.format(path))
|
| 55 |
+
|
| 56 |
+
def inference(self, img0, img1, timestep=0.5, scale=1.0):
|
| 57 |
+
imgs = torch.cat((img0, img1), 1)
|
| 58 |
+
scale_list = [16/scale, 8/scale, 4/scale, 2/scale, 1/scale]
|
| 59 |
+
flow, mask, merged = self.flownet(imgs, timestep, scale_list)
|
| 60 |
+
return merged[-1]
|
| 61 |
+
|
| 62 |
+
def update(self, imgs, gt, learning_rate=0, mul=1, training=True, flow_gt=None):
|
| 63 |
+
for param_group in self.optimG.param_groups:
|
| 64 |
+
param_group['lr'] = learning_rate
|
| 65 |
+
img0 = imgs[:, :3]
|
| 66 |
+
img1 = imgs[:, 3:]
|
| 67 |
+
if training:
|
| 68 |
+
self.train()
|
| 69 |
+
else:
|
| 70 |
+
self.eval()
|
| 71 |
+
scale = [16, 8, 4, 2, 1]
|
| 72 |
+
flow, mask, merged = self.flownet(torch.cat((imgs, gt), 1), scale=scale, training=training)
|
| 73 |
+
loss_l1 = (merged[-1] - gt).abs().mean()
|
| 74 |
+
loss_smooth = self.sobel(flow[-1], flow[-1]*0).mean()
|
| 75 |
+
# loss_vgg = self.vgg(merged[-1], gt)
|
| 76 |
+
if training:
|
| 77 |
+
self.optimG.zero_grad()
|
| 78 |
+
loss_G = loss_l1 + loss_cons + loss_smooth * 0.1
|
| 79 |
+
loss_G.backward()
|
| 80 |
+
self.optimG.step()
|
| 81 |
+
else:
|
| 82 |
+
flow_teacher = flow[2]
|
| 83 |
+
return merged[-1], {
|
| 84 |
+
'mask': mask,
|
| 85 |
+
'flow': flow[-1][:, :2],
|
| 86 |
+
'loss_l1': loss_l1,
|
| 87 |
+
'loss_cons': loss_cons,
|
| 88 |
+
'loss_smooth': loss_smooth,
|
| 89 |
+
}
|
models/rife/refine.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import numpy as np
|
| 4 |
+
from torch.optim import AdamW
|
| 5 |
+
import torch.optim as optim
|
| 6 |
+
import itertools
|
| 7 |
+
from model.warplayer import warp
|
| 8 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
|
| 11 |
+
device = torch.device("cuda")
|
| 12 |
+
|
| 13 |
+
def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
|
| 14 |
+
return nn.Sequential(
|
| 15 |
+
nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
|
| 16 |
+
padding=padding, dilation=dilation, bias=True),
|
| 17 |
+
nn.LeakyReLU(0.2, True)
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
def conv_woact(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
|
| 21 |
+
return nn.Sequential(
|
| 22 |
+
nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
|
| 23 |
+
padding=padding, dilation=dilation, bias=True),
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
def deconv(in_planes, out_planes, kernel_size=4, stride=2, padding=1):
|
| 27 |
+
return nn.Sequential(
|
| 28 |
+
torch.nn.ConvTranspose2d(in_channels=in_planes, out_channels=out_planes, kernel_size=4, stride=2, padding=1, bias=True),
|
| 29 |
+
nn.LeakyReLU(0.2, True)
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
class Conv2(nn.Module):
|
| 33 |
+
def __init__(self, in_planes, out_planes, stride=2):
|
| 34 |
+
super(Conv2, self).__init__()
|
| 35 |
+
self.conv1 = conv(in_planes, out_planes, 3, stride, 1)
|
| 36 |
+
self.conv2 = conv(out_planes, out_planes, 3, 1, 1)
|
| 37 |
+
|
| 38 |
+
def forward(self, x):
|
| 39 |
+
x = self.conv1(x)
|
| 40 |
+
x = self.conv2(x)
|
| 41 |
+
return x
|
| 42 |
+
|
| 43 |
+
c = 16
|
| 44 |
+
class Contextnet(nn.Module):
|
| 45 |
+
def __init__(self):
|
| 46 |
+
super(Contextnet, self).__init__()
|
| 47 |
+
self.conv1 = Conv2(3, c)
|
| 48 |
+
self.conv2 = Conv2(c, 2*c)
|
| 49 |
+
self.conv3 = Conv2(2*c, 4*c)
|
| 50 |
+
self.conv4 = Conv2(4*c, 8*c)
|
| 51 |
+
|
| 52 |
+
def forward(self, x, flow):
|
| 53 |
+
x = self.conv1(x)
|
| 54 |
+
flow = F.interpolate(flow, scale_factor=0.5, mode="bilinear", align_corners=False) * 0.5
|
| 55 |
+
f1 = warp(x, flow)
|
| 56 |
+
x = self.conv2(x)
|
| 57 |
+
flow = F.interpolate(flow, scale_factor=0.5, mode="bilinear", align_corners=False) * 0.5
|
| 58 |
+
f2 = warp(x, flow)
|
| 59 |
+
x = self.conv3(x)
|
| 60 |
+
flow = F.interpolate(flow, scale_factor=0.5, mode="bilinear", align_corners=False) * 0.5
|
| 61 |
+
f3 = warp(x, flow)
|
| 62 |
+
x = self.conv4(x)
|
| 63 |
+
flow = F.interpolate(flow, scale_factor=0.5, mode="bilinear", align_corners=False) * 0.5
|
| 64 |
+
f4 = warp(x, flow)
|
| 65 |
+
return [f1, f2, f3, f4]
|
| 66 |
+
|
| 67 |
+
class Unet(nn.Module):
|
| 68 |
+
def __init__(self):
|
| 69 |
+
super(Unet, self).__init__()
|
| 70 |
+
self.down0 = Conv2(17, 2*c)
|
| 71 |
+
self.down1 = Conv2(4*c, 4*c)
|
| 72 |
+
self.down2 = Conv2(8*c, 8*c)
|
| 73 |
+
self.down3 = Conv2(16*c, 16*c)
|
| 74 |
+
self.up0 = deconv(32*c, 8*c)
|
| 75 |
+
self.up1 = deconv(16*c, 4*c)
|
| 76 |
+
self.up2 = deconv(8*c, 2*c)
|
| 77 |
+
self.up3 = deconv(4*c, c)
|
| 78 |
+
self.conv = nn.Conv2d(c, 3, 3, 1, 1)
|
| 79 |
+
|
| 80 |
+
def forward(self, img0, img1, warped_img0, warped_img1, mask, flow, c0, c1):
|
| 81 |
+
s0 = self.down0(torch.cat((img0, img1, warped_img0, warped_img1, mask, flow), 1))
|
| 82 |
+
s1 = self.down1(torch.cat((s0, c0[0], c1[0]), 1))
|
| 83 |
+
s2 = self.down2(torch.cat((s1, c0[1], c1[1]), 1))
|
| 84 |
+
s3 = self.down3(torch.cat((s2, c0[2], c1[2]), 1))
|
| 85 |
+
x = self.up0(torch.cat((s3, c0[3], c1[3]), 1))
|
| 86 |
+
x = self.up1(torch.cat((x, s2), 1))
|
| 87 |
+
x = self.up2(torch.cat((x, s1), 1))
|
| 88 |
+
x = self.up3(torch.cat((x, s0), 1))
|
| 89 |
+
x = self.conv(x)
|
| 90 |
+
return torch.sigmoid(x)
|
models/seedvr2/.validation_cache.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"seedvr2_ema_3b_fp8_e4m3fn.safetensors": {
|
| 3 |
+
"size": 3391544696,
|
| 4 |
+
"mtime": 1772206219.677575,
|
| 5 |
+
"hash": "3bf1e43ebedd570e7e7a0b1b60d6a02e105978f505c8128a241cde99a8240cff"
|
| 6 |
+
},
|
| 7 |
+
"ema_vae_fp16.safetensors": {
|
| 8 |
+
"size": 501324814,
|
| 9 |
+
"mtime": 1772206245.5699334,
|
| 10 |
+
"hash": "20678548f420d98d26f11442d3528f8b8c94e57ee046ef93dbb7633da8612ca1"
|
| 11 |
+
}
|
| 12 |
+
}
|
models/seedvr2/config.json
ADDED
|
File without changes
|
models/voice-presets/bruce.wav
ADDED
|
Binary file (14.3 kB). View file
|
|
|
models/voice-presets/christian.wav
ADDED
|
Binary file (80.4 kB). View file
|
|
|
models/voice-presets/hal.wav
ADDED
|
Binary file (50.9 kB). View file
|
|
|
models/voice-presets/heath.wav
ADDED
|
Binary file (43.7 kB). View file
|
|
|
models/voice-presets/ian.wav
ADDED
|
Binary file (63.4 kB). View file
|
|
|
models/voice-presets/johnny.wav
ADDED
|
Binary file (17.4 kB). View file
|
|
|
models/voice-presets/patrick.wav
ADDED
|
Binary file (16.3 kB). View file
|
|
|
models/voice-presets/robert.wav
ADDED
|
Binary file (37.2 kB). View file
|
|
|
models/voice-presets/russel.wav
ADDED
|
Binary file (31.2 kB). View file
|
|
|
models/voice-presets/sean.wav
ADDED
|
Binary file (73.9 kB). View file
|
|
|
models/voice-presets/sigourney.wav
ADDED
|
Binary file (15.4 kB). View file
|
|
|
models/z-image-ControlNet-Union/config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "ZImageControlNetModel",
|
| 3 |
+
"_diffusers_version": "0.36.0.dev0",
|
| 4 |
+
"add_control_noise_refiner": "control_noise_refiner",
|
| 5 |
+
"all_f_patch_size": [
|
| 6 |
+
1
|
| 7 |
+
],
|
| 8 |
+
"all_patch_size": [
|
| 9 |
+
2
|
| 10 |
+
],
|
| 11 |
+
"control_in_dim": 33,
|
| 12 |
+
"control_layers_places": [
|
| 13 |
+
0,
|
| 14 |
+
2,
|
| 15 |
+
4,
|
| 16 |
+
6,
|
| 17 |
+
8,
|
| 18 |
+
10,
|
| 19 |
+
12,
|
| 20 |
+
14,
|
| 21 |
+
16,
|
| 22 |
+
18,
|
| 23 |
+
20,
|
| 24 |
+
22,
|
| 25 |
+
24,
|
| 26 |
+
26,
|
| 27 |
+
28
|
| 28 |
+
],
|
| 29 |
+
"control_refiner_layers_places": [
|
| 30 |
+
0,
|
| 31 |
+
1
|
| 32 |
+
],
|
| 33 |
+
"dim": 3840,
|
| 34 |
+
"n_heads": 30,
|
| 35 |
+
"n_kv_heads": 30,
|
| 36 |
+
"n_refiner_layers": 2,
|
| 37 |
+
"norm_eps": 1e-05,
|
| 38 |
+
"qk_norm": true
|
| 39 |
+
}
|