Instructions to use aoiandroid/SoulX-FlashHead-1_3B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use aoiandroid/SoulX-FlashHead-1_3B with Diffusers:
pip install -U diffusers transformers accelerate
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image, export_to_video

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("aoiandroid/SoulX-FlashHead-1_3B", dtype=torch.bfloat16, device_map="cuda")
pipe.to("cuda")

prompt = "A man with short gray hair plays a red electric guitar."
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guitar-man.png"
)

output = pipe(image=image, prompt=prompt).frames[0]
export_to_video(output, "output.mp4")
- Notebooks
- Google Colab
- Kaggle
Commit ·
5c668c1
0
Parent(s):
Duplicate from Soul-AILab/SoulX-FlashHead-1_3B
Browse files
Co-authored-by: JokerZ <JokerZhou@users.noreply.huggingface.co>
- .gitattributes +41 -0
- Model_Lite/config.json +24 -0
- Model_Lite/diffusion_pytorch_model.safetensors +3 -0
- Model_Pro/config.json +24 -0
- Model_Pro/diffusion_pytorch_model.safetensors +3 -0
- README.md +196 -0
- VAE_LTX/config.json +32 -0
- VAE_LTX/diffusion_pytorch_model.safetensors +3 -0
- VAE_Wan/Wan2.1_VAE.pth +3 -0
- assets/chengdu.mp4 +3 -0
- assets/einstein.mp4 +3 -0
- assets/flashhead_logo.png +3 -0
- assets/qitiandasheng.mp4 +3 -0
- assets/soul_event_link.png +3 -0
- assets/soul_group.png +0 -0
- assets/wechat_group.png +3 -0
- config.json +5 -0
- model_index.json +4 -0
.gitattributes
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/chengdu.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/einstein.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/flashhead_logo.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
assets/qitiandasheng.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
assets/soul_event_link.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
assets/wechat_group.png filter=lfs diff=lfs merge=lfs -text
|
Model_Lite/config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "WanModelAudioProject",
|
| 3 |
+
"_diffusers_version": "0.36.0",
|
| 4 |
+
"dim": 1536,
|
| 5 |
+
"eps": 1e-06,
|
| 6 |
+
"ffn_dim": 8960,
|
| 7 |
+
"freq_dim": 256,
|
| 8 |
+
"has_image_input": false,
|
| 9 |
+
"in_dim": 256,
|
| 10 |
+
"num_heads": 12,
|
| 11 |
+
"num_layers": 30,
|
| 12 |
+
"out_dim": 128,
|
| 13 |
+
"patch_size": [
|
| 14 |
+
1,
|
| 15 |
+
1,
|
| 16 |
+
1
|
| 17 |
+
],
|
| 18 |
+
"text_dim": 4096,
|
| 19 |
+
"vae_stride": [
|
| 20 |
+
8,
|
| 21 |
+
32,
|
| 22 |
+
32
|
| 23 |
+
]
|
| 24 |
+
}
|
Model_Lite/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aaf1cde6e80ca23f740aae236c47954249f65b151db133cc0f77d3a138ccdf6e
|
| 3 |
+
size 6107542048
|
Model_Pro/config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "WanModelAudioProject",
|
| 3 |
+
"_diffusers_version": "0.36.0",
|
| 4 |
+
"dim": 1536,
|
| 5 |
+
"eps": 1e-06,
|
| 6 |
+
"ffn_dim": 8960,
|
| 7 |
+
"freq_dim": 256,
|
| 8 |
+
"has_image_input": false,
|
| 9 |
+
"in_dim": 32,
|
| 10 |
+
"num_heads": 12,
|
| 11 |
+
"num_layers": 30,
|
| 12 |
+
"out_dim": 16,
|
| 13 |
+
"patch_size": [
|
| 14 |
+
1,
|
| 15 |
+
2,
|
| 16 |
+
2
|
| 17 |
+
],
|
| 18 |
+
"text_dim": 4096,
|
| 19 |
+
"vae_stride": [
|
| 20 |
+
4,
|
| 21 |
+
8,
|
| 22 |
+
8
|
| 23 |
+
]
|
| 24 |
+
}
|
Model_Pro/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e47e61b9023ea1aac60c0c0fff077289bc6cca443c71907e5a76a411480af250
|
| 3 |
+
size 6030864656
|
README.md
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
datasets:
|
| 4 |
+
- Soul-AILab/VividHead
|
| 5 |
+
pipeline_tag: image-to-video
|
| 6 |
+
---
|
| 7 |
+
Soul-AILab/SoulX-FlashHead-1_3B
|
| 8 |
+
<div align="center">
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
<h1>SoulX-FlashHead: Oracle-guided Generation of Infinite Real-time Streaming Talking Heads</h1>
|
| 12 |
+
|
| 13 |
+
[Tan Yu*](https://jiayoujiayoujiayoua.github.io/), [Qian Qiao*](https://qianqiaoai.github.io/)<sup>✉</sup>, [Le Shen*](https://openreview.net/profile?id=%7ELe_Shen3), [Ke Zhou](https://github.com/jokerz0624), [Jincheng Hu](#), [Dian Sheng](#), [Bo Hu](#), [Haoming Qin](#), [Jun Gao](#), [Changhai Zhou](#), [Shunshun Yin](#), [Siyuan Liu](#) <sup>✉</sup>
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
<sup>*</sup>Equal Contribution
|
| 17 |
+
<sup>✉</sup>Corresponding Author
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
<a href='https://soul-ailab.github.io/soulx-flashhead/' target="_blank"><img src='https://img.shields.io/badge/Project-Page-green'></a> <a href='https://arxiv.org/pdf/2602.07449' target="_blank"><img src='https://img.shields.io/badge/Technical-Report-red'></a>
|
| 21 |
+
<a href='https://huggingface.co/Soul-AILab/SoulX-FlashHead-1_3B' target="_blank"><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
|
| 22 |
+
<a href="https://huggingface.co/datasets/Soul-AILab/VividHead" target="_blank"><img src="https://img.shields.io/badge/🤗 Hugging Face-Dataset-blue" alt="Dataset"></a>
|
| 23 |
+
</div>
|
| 24 |
+
|
| 25 |
+
## ⚡ Highlights
|
| 26 |
+
- **Model_Lite** [Released](https://huggingface.co/Soul-AILab/SoulX-FlashHead-1_3B/tree/main/Model_Lite) achieves 96 FPS, or 3-concurrent real-time (25+ FPS) streaming, on a single RTX4090.
|
| 27 |
+
- **Model_Pro** [Released](https://huggingface.co/Soul-AILab/SoulX-FlashHead-1_3B/tree/main/Model_Pro) can generate high-quality videos at 10.8 FPS on a single RTX4090, or in real time (25+ FPS) on two RTX5090s.
|
| 28 |
+
- **Model_Pretrained** is coming soon, providing high-performance weights and experimental foundations for community research.
|
| 29 |
+
|
| 30 |
+
## 🔥 News
|
| 31 |
+
- **2026.02.12** - The [online demo](#online-experience-qr) is now available via the Soul App. Download it today to try it out.
|
| 32 |
+
- **2026.02.12** - We have released the [inference code](https://github.com/Soul-AILab/SoulX-FlashHead), and the [model weights](https://huggingface.co/Soul-AILab/SoulX-FlashHead-1_3B).
|
| 33 |
+
- **2026.02.12** - We released **Project page** on [SoulX-FlashHead](https://soul-ailab.github.io/soulx-flashhead/).
|
| 34 |
+
- **2026.02.07** - We released [Dataset](https://huggingface.co/datasets/Soul-AILab/VividHead).
|
| 35 |
+
- **2026.02.07** - We released **SoulX-FlashHead Technical Report** on [Arxiv](https://arxiv.org/pdf/2602.07449) and [GitHub repository](./assets/SoulX_FlashHead.pdf).
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
## 📑 Todo List
|
| 39 |
+
- [x] Technical report
|
| 40 |
+
- [x] Project Page
|
| 41 |
+
- [x] Inference code
|
| 42 |
+
- [x] Distilled Checkpoint of Pro-Model & Lite-Model release
|
| 43 |
+
- [ ] Pretrained Checkpoint release
|
| 44 |
+
|
| 45 |
+
## 🌰 Examples
|
| 46 |
+
More examples are available in the project.
|
| 47 |
+
|
| 48 |
+
<table>
|
| 49 |
+
<tbody>
|
| 50 |
+
<!-- Row 1: Videos 1-3 -->
|
| 51 |
+
<tr>
|
| 52 |
+
<td width="30%"><video src="https://huggingface.co/Soul-AILab/SoulX-FlashHead-1_3B/resolve/main/assets/qitiandasheng.mp4" style="width:100%; aspect-ratio:512/512; object-fit:cover;" controls loop></video></td>
|
| 53 |
+
<td width="30%"><video src="https://huggingface.co/Soul-AILab/SoulX-FlashHead-1_3B/resolve/main/assets/chengdu.mp4" style="width:100%; aspect-ratio:512/512; object-fit:cover;" controls loop></video></td>
|
| 54 |
+
<td width="30%"><video src="https://huggingface.co/Soul-AILab/SoulX-FlashHead-1_3B/resolve/main/assets/einstein.mp4" style="width:100%; aspect-ratio:512/512; object-fit:cover;" controls loop></video></td>
|
| 55 |
+
</tr>
|
| 56 |
+
|
| 57 |
+
</tbody>
|
| 58 |
+
</table>
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
## 📖 Quickstart
|
| 63 |
+
### 🔧 Installation
|
| 64 |
+
#### 1. Create a Conda environment
|
| 65 |
+
```bash
|
| 66 |
+
conda create -n flashhead python=3.10
|
| 67 |
+
conda activate flashhead
|
| 68 |
+
```
|
| 69 |
+
#### 2. Install PyTorch on CUDA
|
| 70 |
+
```bash
|
| 71 |
+
pip install torch==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu128
|
| 72 |
+
```
|
| 73 |
+
#### 3. Install other dependencies
|
| 74 |
+
```bash
|
| 75 |
+
pip install -r requirements.txt
|
| 76 |
+
```
|
| 77 |
+
#### 4. FlashAttention installation:
|
| 78 |
+
```bash
|
| 79 |
+
pip install ninja
|
| 80 |
+
pip install flash_attn==2.8.0.post2 --no-build-isolation
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
-- If the build takes a long time, we recommend installing a prebuilt wheel instead:
|
| 84 |
+
1. Download the wheel file from [here](https://github.com/Dao-AILab/flash-attention/releases/tag/v2.8.0.post2)
|
| 85 |
+
2. pip install xxx.whl
|
| 86 |
+
|
| 87 |
+
#### 5. SageAttention installation (Optional)
|
| 88 |
+
```bash
|
| 89 |
+
pip install sageattention==2.2.0 --no-build-isolation
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
#### 6. FFmpeg installation
|
| 93 |
+
```bash
|
| 94 |
+
# Ubuntu / Debian
|
| 95 |
+
apt-get install ffmpeg
|
| 96 |
+
# CentOS / RHEL
|
| 97 |
+
yum install ffmpeg ffmpeg-devel
|
| 98 |
+
```
|
| 99 |
+
or
|
| 100 |
+
```bash
|
| 101 |
+
# Conda (no root required)
|
| 102 |
+
conda install -c conda-forge ffmpeg==7
|
| 103 |
+
```
|
| 104 |
+
### 🤗 Model download
|
| 105 |
+
```bash
|
| 106 |
+
# If you are in mainland China, run this first: export HF_ENDPOINT=https://hf-mirror.com
|
| 107 |
+
pip install "huggingface_hub[cli]"
|
| 108 |
+
huggingface-cli download Soul-AILab/SoulX-FlashHead-1_3B --local-dir ./models/SoulX-FlashHead-1_3B
|
| 109 |
+
huggingface-cli download facebook/wav2vec2-base-960h --local-dir ./models/wav2vec2-base-960h
|
| 110 |
+
```
|
| 111 |
+
### 🚀 Inference
|
| 112 |
+
```bash
|
| 113 |
+
# Infer with [Pro-Model] on single GPU
|
| 114 |
+
bash inference_script_single_gpu_pro.sh
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# Infer with [Pro-Model] on multiple GPUs
|
| 118 |
+
bash inference_script_multi_gpu_pro.sh
|
| 119 |
+
# Real-time inference with the Pro-Model is only supported on two RTX-5090 GPUs with SageAttention.
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# Infer with [Lite-Model] on single GPU
|
| 123 |
+
bash inference_script_single_gpu_lite.sh
|
| 124 |
+
# Real-time inference is supported on a single RTX-4090 (up to 3 concurrent streams).
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
### 👋 Online Experience
|
| 128 |
+
For a real-time interactive experience, scan the QR code to enter the event link. [2026.2.12~2026.3.11]
|
| 129 |
+
<a id="online-experience-qr"></a>
|
| 130 |
+
<div align="center">
|
| 131 |
+
<table>
|
| 132 |
+
<tr>
|
| 133 |
+
<td align="center">
|
| 134 |
+
<img src="assets/soul_event_link.png" width="200" alt="SoulApp event QR Code"/>
|
| 135 |
+
<br />
|
| 136 |
+
<strong>Real-time Online Experience<br>(SoulApp 实时在线体验)</strong>
|
| 137 |
+
</td>
|
| 138 |
+
</tr>
|
| 139 |
+
</table>
|
| 140 |
+
</div>
|
| 141 |
+
|
| 142 |
+
## 📧 Contact Us
|
| 143 |
+
If you would like to leave feedback on our work, feel free to email yutan@soulapp.cn, qiaoqian@soulapp.cn, le.shen@mail.dhu.edu.cn, zhouke@soulapp.cn, or liusiyuan@soulapp.cn.
|
| 144 |
+
|
| 145 |
+
We have opened a WeChat group. Additionally, we represent **SoulApp** and warmly welcome everyone to download the app and join our Soul group for further technical discussions and updates!
|
| 146 |
+
|
| 147 |
+
<div align="center">
|
| 148 |
+
<table>
|
| 149 |
+
<tr>
|
| 150 |
+
<td align="center">
|
| 151 |
+
<img src="assets/wechat_group.png" width="300" alt="WeChat Group QR Code"/>
|
| 152 |
+
<br />
|
| 153 |
+
<strong>Join WeChat Group<br>(加入微信技术群)</strong>
|
| 154 |
+
</td>
|
| 155 |
+
<td width="100"></td>
|
| 156 |
+
<td align="center">
|
| 157 |
+
<img src="assets/soul_group.png" width="300" alt="Soul App Group QR Code"/>
|
| 158 |
+
<br />
|
| 159 |
+
<strong>Download SoulApp & Join Group<br>(下载SoulApp加入群组)</strong>
|
| 160 |
+
</td>
|
| 161 |
+
</tr>
|
| 162 |
+
</table>
|
| 163 |
+
</div>
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
## 📚 Citation
|
| 167 |
+
|
| 168 |
+
If you find our work useful in your research, please consider citing:
|
| 169 |
+
|
| 170 |
+
```
|
| 171 |
+
@misc{yu2026soulxflashheadoracleguidedgenerationinfinite,
|
| 172 |
+
title={SoulX-FlashHead: Oracle-guided Generation of Infinite Real-time Streaming Talking Heads},
|
| 173 |
+
author={Tan Yu and Qian Qiao and Le Shen and Ke Zhou and Jincheng Hu and Dian Sheng and Bo Hu and Haoming Qin and Jun Gao and Changhai Zhou and Shunshun Yin and Siyuan Liu},
|
| 174 |
+
year={2026},
|
| 175 |
+
eprint={2602.07449},
|
| 176 |
+
archivePrefix={arXiv},
|
| 177 |
+
primaryClass={cs.CV},
|
| 178 |
+
url={https://arxiv.org/abs/2602.07449},
|
| 179 |
+
}
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
## 🙇 Acknowledgement
|
| 183 |
+
- [Wan](https://github.com/Wan-Video/Wan2.1): the base model we built upon.
|
| 184 |
+
- [LTX-Video](https://github.com/Lightricks/LTX-Video): the VAE of our Lite-Model.
|
| 185 |
+
- [Self forcing](https://github.com/guandeh17/Self-Forcing): the codebase we built upon.
|
| 186 |
+
- [DMD](https://github.com/tianweiy/DMD2) and [Self forcing++](https://github.com/justincui03/Self-Forcing-Plus-Plus): the key distillation technique used by our method.
|
| 187 |
+
- [SoulX-FlashTalk](https://github.com/Soul-AILab/SoulX-FlashTalk/) is another model developed by our team, featuring 14B parameters and real-time capabilities.
|
| 188 |
+
> [!TIP]
|
| 189 |
+
> If you find our work useful, please also consider starring the original repositories of these foundational methods.
|
| 190 |
+
|
| 191 |
+
## 💡 Star History
|
| 192 |
+
<p align="center">
|
| 193 |
+
<a href="https://star-history.com/#Soul-AILab/SoulX-FlashHead&Date">
|
| 194 |
+
<img src="https://api.star-history.com/svg?repos=Soul-AILab/SoulX-FlashHead&type=Date" alt="Star History Chart" width="100%">
|
| 195 |
+
</a>
|
| 196 |
+
</p>
|
VAE_LTX/config.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "AutoencoderKLLTXVideo",
|
| 3 |
+
"_diffusers_version": "0.32.0.dev0",
|
| 4 |
+
"block_out_channels": [
|
| 5 |
+
128,
|
| 6 |
+
256,
|
| 7 |
+
512,
|
| 8 |
+
512
|
| 9 |
+
],
|
| 10 |
+
"decoder_causal": false,
|
| 11 |
+
"encoder_causal": true,
|
| 12 |
+
"in_channels": 3,
|
| 13 |
+
"latent_channels": 128,
|
| 14 |
+
"layers_per_block": [
|
| 15 |
+
4,
|
| 16 |
+
3,
|
| 17 |
+
3,
|
| 18 |
+
3,
|
| 19 |
+
4
|
| 20 |
+
],
|
| 21 |
+
"out_channels": 3,
|
| 22 |
+
"patch_size": 4,
|
| 23 |
+
"patch_size_t": 1,
|
| 24 |
+
"resnet_norm_eps": 1e-06,
|
| 25 |
+
"scaling_factor": 1.0,
|
| 26 |
+
"spatio_temporal_scaling": [
|
| 27 |
+
true,
|
| 28 |
+
true,
|
| 29 |
+
true,
|
| 30 |
+
false
|
| 31 |
+
]
|
| 32 |
+
}
|
VAE_LTX/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:265ca87cb5dff5e37f924286e957324e282fe7710a952a7dafc0df43883e2010
|
| 3 |
+
size 1676798532
|
VAE_Wan/Wan2.1_VAE.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38071ab59bd94681c686fa51d75a1968f64e470262043be31f7a094e442fd981
|
| 3 |
+
size 507609880
|
assets/chengdu.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95bab76c675fa176b1c2b1be54fe4ac5cb2ad4fb7868196be8a380b608e7b49d
|
| 3 |
+
size 2170506
|
assets/einstein.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c10118b169916df66ff4ac4258ae3b8e959f51b2ba52f6551e1442125655f899
|
| 3 |
+
size 5901287
|
assets/flashhead_logo.png
ADDED
|
Git LFS Details
|
assets/qitiandasheng.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:72a340f8b74936961c4525592e8f07280f1cc771efa3e0dbc84b731ece6e3198
|
| 3 |
+
size 1682357
|
assets/soul_event_link.png
ADDED
|
Git LFS Details
|
assets/soul_group.png
ADDED
|
assets/wechat_group.png
ADDED
|
Git LFS Details
|
config.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "wan",
|
| 3 |
+
"architectures": ["WanVideoModel"],
|
| 4 |
+
"transformers_version": "4.48.0"
|
| 5 |
+
}
|
model_index.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "WanModelAudioProject",
|
| 3 |
+
"_diffusers_version": "0.36.0"
|
| 4 |
+
}
|