Upload folder using huggingface_hub
Browse files
README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: gpl-3.0
|
| 3 |
+
---
|
| 4 |
+
In this work, we introduce Micro-World, an action-controlled interactive world model designed to generate high-quality, open-domain scenes. Built on top of the Wan2.1 family of models, we train both image-to-video (I2V) and text-to-video (T2V) variants to support a wide range of use cases. To foster open research and practical adoption in the community, we release the model weights, full training and inference code, as well as a curated dataset specifically tailored for controllable world modeling.
|
| 5 |
+
|
| 6 |
+
For action injection, we favor adaLN for its lightweight parameter footprint, and ControlNet for its strong empirical stability during training.
|
| 7 |
+
|
| 8 |
+
Note that the released T2V model is trained using the ControlNet architecture.
|
| 9 |
+
|
| 10 |
+
For more information, please refer to the code.
|
| 11 |
+
<div style="margin: 0; padding: 0; text-align: center;">
|
| 12 |
+
<img src="https://github.com/user-attachments/assets/680b87ac-0c95-4a27-b4fd-fcafb9fdf609" alt="t2v architecture" title="t2v architecture" class="t2v architecture">
|
| 13 |
+
<img src="https://github.com/user-attachments/assets/c9cd8d9e-9555-42d3-b884-04705d1e329c" alt="t2v architecture" title="t2v architecture" class="t2v architecture">
|
| 14 |
+
</div>
|
lora_diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a395da7ef7776bd567efcc4b85aa4918a577c1399bf2a6967d981acf50ab04e2
|
| 3 |
+
size 356705124
|
transformer/config.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "WanActionControlNetModel",
|
| 3 |
+
"_diffusers_version": "0.34.0",
|
| 4 |
+
"cross_attn_norm": true,
|
| 5 |
+
"dim": 1536,
|
| 6 |
+
"eps": 1e-06,
|
| 7 |
+
"ffn_dim": 8960,
|
| 8 |
+
"freq_dim": 256,
|
| 9 |
+
"in_dim": 16,
|
| 10 |
+
"keyboard_dim": 7,
|
| 11 |
+
"model_type": "t2v",
|
| 12 |
+
"mouse_dim": 2,
|
| 13 |
+
"num_heads": 12,
|
| 14 |
+
"num_layers": 30,
|
| 15 |
+
"out_dim": 16,
|
| 16 |
+
"patch_size": [
|
| 17 |
+
1,
|
| 18 |
+
2,
|
| 19 |
+
2
|
| 20 |
+
],
|
| 21 |
+
"qk_norm": true,
|
| 22 |
+
"text_dim": 4096,
|
| 23 |
+
"text_len": 512,
|
| 24 |
+
"action_in_dim": null,
|
| 25 |
+
"action_layers": null,
|
| 26 |
+
"window_size": [
|
| 27 |
+
-1,
|
| 28 |
+
-1
|
| 29 |
+
]
|
| 30 |
+
}
|
| 31 |
+
|
transformer/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46144735a50095b54751327934552cde9a3e091a624b19d29f12a1389bc4e2e7
|
| 3 |
+
size 4315208944
|