yugeng-amd committed on
Commit
5a799ca
·
verified ·
1 Parent(s): 75326cd

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: gpl-3.0
3
+ ---
4
+ In this work, we introduce Micro-World, an action-controlled interactive world model designed to generate high-quality, open-domain scenes. Building on the Wan2.1 family of models, we train both image-to-video (I2V) and text-to-video (T2V) variants to support a wide range of use cases. To foster open research and practical adoption in the community, we release the model weights, full training and inference code, as well as a curated dataset specifically tailored for controllable world modeling.
5
+
6
+ For action injection, we favor adaLN for its lightweight parameter footprint, and ControlNet for its strong empirical stability during training.
7
+
8
+ Note that the released T2V model is trained using the ControlNet architecture.
9
+
10
+ For more information, please refer to the code.
11
+ <div style="margin: 0; padding: 0; text-align: center;">
12
+ <img src="https://github.com/user-attachments/assets/680b87ac-0c95-4a27-b4fd-fcafb9fdf609" alt="t2v architecture" title="t2v architecture" class="t2v architecture">
13
+ <img src="https://github.com/user-attachments/assets/c9cd8d9e-9555-42d3-b884-04705d1e329c" alt="t2v architecture" title="t2v architecture" class="t2v architecture">
14
+ </div>
lora_diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a395da7ef7776bd567efcc4b85aa4918a577c1399bf2a6967d981acf50ab04e2
3
+ size 356705124
transformer/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "WanActionControlNetModel",
3
+ "_diffusers_version": "0.34.0",
4
+ "cross_attn_norm": true,
5
+ "dim": 1536,
6
+ "eps": 1e-06,
7
+ "ffn_dim": 8960,
8
+ "freq_dim": 256,
9
+ "in_dim": 16,
10
+ "keyboard_dim": 7,
11
+ "model_type": "t2v",
12
+ "mouse_dim": 2,
13
+ "num_heads": 12,
14
+ "num_layers": 30,
15
+ "out_dim": 16,
16
+ "patch_size": [
17
+ 1,
18
+ 2,
19
+ 2
20
+ ],
21
+ "qk_norm": true,
22
+ "text_dim": 4096,
23
+ "text_len": 512,
24
+ "action_in_dim": null,
25
+ "action_layers": null,
26
+ "window_size": [
27
+ -1,
28
+ -1
29
+ ]
30
+ }
31
+
transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46144735a50095b54751327934552cde9a3e091a624b19d29f12a1389bc4e2e7
3
+ size 4315208944