Upload folder using huggingface_hub
Browse files- pretrain_imgnet_B_qknorm.json +32 -0
- pretrain_imgnet_B_qknorm.safetensors +3 -0
- pretrain_imgnet_B_qknorm_nofinal.json +32 -0
- pretrain_imgnet_B_qknorm_nofinal.safetensors +3 -0
- pretrain_imgnet_L_1152d28l.json +29 -0
- pretrain_imgnet_L_1152d28l.safetensors +3 -0
- pretrain_imgnet_L_qknorm.json +32 -0
- pretrain_imgnet_L_qknorm.safetensors +3 -0
- pretrain_imgnet_S_qknorm.json +32 -0
- pretrain_imgnet_S_qknorm.safetensors +3 -0
- pretrain_imgnet_S_qknorm_nofinal.json +32 -0
- pretrain_imgnet_S_qknorm_nofinal.safetensors +3 -0
- pretrain_video_B_1024d16l.json +29 -0
- pretrain_video_B_1024d16l.safetensors +3 -0
- pretrain_video_L_1152d28l_80k.json +29 -0
- pretrain_video_L_1152d28l_80k.safetensors +3 -0
- pretrain_video_L_qknorm.json +32 -0
- pretrain_video_L_qknorm.safetensors +3 -0
- pretrain_video_S_qknorm.json +32 -0
- pretrain_video_S_qknorm.safetensors +3 -0
- pretrain_video_S_qknorm_nofinal.json +32 -0
- pretrain_video_S_qknorm_nofinal.safetensors +3 -0
pretrain_imgnet_B_qknorm.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 16,
|
| 7 |
+
"hidden_size": 1024,
|
| 8 |
+
"intermediate_size": 4096,
|
| 9 |
+
"num_heads": 16,
|
| 10 |
+
"decode_mode": "diffusion",
|
| 11 |
+
"qk_norm": true,
|
| 12 |
+
"attn_bias": false,
|
| 13 |
+
"use_final_norm": true
|
| 14 |
+
},
|
| 15 |
+
"encoder_hidden_size": 768,
|
| 16 |
+
"latent_h": 32,
|
| 17 |
+
"latent_w": 32,
|
| 18 |
+
"latent_mean": [
|
| 19 |
+
-0.69,
|
| 20 |
+
-0.48,
|
| 21 |
+
-0.6,
|
| 22 |
+
0.28
|
| 23 |
+
],
|
| 24 |
+
"latent_std": [
|
| 25 |
+
12.38,
|
| 26 |
+
11.22,
|
| 27 |
+
7.93,
|
| 28 |
+
21.22
|
| 29 |
+
],
|
| 30 |
+
"pretrain_mode": "imagenet",
|
| 31 |
+
"source_ckpt": "ttvidt-dit-pretrain/rbzlbc57/checkpoints/epoch=19-step=100000.ckpt"
|
| 32 |
+
}
|
pretrain_imgnet_B_qknorm.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23738d11fa3b93c3ff46b478ec7472ad3e8e2a517b6d7c5a6825448a3f04927d
|
| 3 |
+
size 1548691368
|
pretrain_imgnet_B_qknorm_nofinal.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 16,
|
| 7 |
+
"hidden_size": 1024,
|
| 8 |
+
"intermediate_size": 4096,
|
| 9 |
+
"num_heads": 16,
|
| 10 |
+
"decode_mode": "diffusion",
|
| 11 |
+
"qk_norm": true,
|
| 12 |
+
"attn_bias": false,
|
| 13 |
+
"use_final_norm": false
|
| 14 |
+
},
|
| 15 |
+
"encoder_hidden_size": 768,
|
| 16 |
+
"latent_h": 32,
|
| 17 |
+
"latent_w": 32,
|
| 18 |
+
"latent_mean": [
|
| 19 |
+
-0.69,
|
| 20 |
+
-0.48,
|
| 21 |
+
-0.6,
|
| 22 |
+
0.28
|
| 23 |
+
],
|
| 24 |
+
"latent_std": [
|
| 25 |
+
12.38,
|
| 26 |
+
11.22,
|
| 27 |
+
7.93,
|
| 28 |
+
21.22
|
| 29 |
+
],
|
| 30 |
+
"pretrain_mode": "imagenet",
|
| 31 |
+
"source_ckpt": "ttvidt-dit-pretrain/dkskempu/checkpoints/epoch=19-step=100000.ckpt"
|
| 32 |
+
}
|
pretrain_imgnet_B_qknorm_nofinal.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8912e682705591d7516a0ce8b2ee77d5e86c40edc1606566690c1a0f06b36256
|
| 3 |
+
size 1548687192
|
pretrain_imgnet_L_1152d28l.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 28,
|
| 7 |
+
"hidden_size": 1152,
|
| 8 |
+
"intermediate_size": 3456,
|
| 9 |
+
"num_heads": 16,
|
| 10 |
+
"decode_mode": "diffusion"
|
| 11 |
+
},
|
| 12 |
+
"encoder_hidden_size": 768,
|
| 13 |
+
"latent_h": 32,
|
| 14 |
+
"latent_w": 32,
|
| 15 |
+
"latent_mean": [
|
| 16 |
+
-0.69,
|
| 17 |
+
-0.48,
|
| 18 |
+
-0.6,
|
| 19 |
+
0.28
|
| 20 |
+
],
|
| 21 |
+
"latent_std": [
|
| 22 |
+
12.38,
|
| 23 |
+
11.22,
|
| 24 |
+
7.93,
|
| 25 |
+
21.22
|
| 26 |
+
],
|
| 27 |
+
"pretrain_mode": "imagenet",
|
| 28 |
+
"source_ckpt": "ttvidt-dit-pretrain/ibh0mukg/checkpoints/epoch=19-step=100000.ckpt"
|
| 29 |
+
}
|
pretrain_imgnet_L_1152d28l.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dbb10311cb4bfb7a165bf732de3d58f3493d87ac0980ccaa7e6d5d69a6c166d5
|
| 3 |
+
size 2985899600
|
pretrain_imgnet_L_qknorm.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 28,
|
| 7 |
+
"hidden_size": 1152,
|
| 8 |
+
"intermediate_size": 3456,
|
| 9 |
+
"num_heads": 16,
|
| 10 |
+
"decode_mode": "diffusion",
|
| 11 |
+
"qk_norm": true,
|
| 12 |
+
"attn_bias": false,
|
| 13 |
+
"use_final_norm": true
|
| 14 |
+
},
|
| 15 |
+
"encoder_hidden_size": 768,
|
| 16 |
+
"latent_h": 32,
|
| 17 |
+
"latent_w": 32,
|
| 18 |
+
"latent_mean": [
|
| 19 |
+
-0.69,
|
| 20 |
+
-0.48,
|
| 21 |
+
-0.6,
|
| 22 |
+
0.28
|
| 23 |
+
],
|
| 24 |
+
"latent_std": [
|
| 25 |
+
12.38,
|
| 26 |
+
11.22,
|
| 27 |
+
7.93,
|
| 28 |
+
21.22
|
| 29 |
+
],
|
| 30 |
+
"pretrain_mode": "imagenet",
|
| 31 |
+
"source_ckpt": "ttvidt-dit-pretrain/d14nc69r/checkpoints/epoch=19-step=100000.ckpt"
|
| 32 |
+
}
|
pretrain_imgnet_L_qknorm.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18b6630f9bd67bf1d3ea05ef732619b76ee57f1ff27c6c12d6dde2b140535151
|
| 3 |
+
size 2979568560
|
pretrain_imgnet_S_qknorm.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 12,
|
| 7 |
+
"hidden_size": 768,
|
| 8 |
+
"intermediate_size": 3072,
|
| 9 |
+
"num_heads": 12,
|
| 10 |
+
"decode_mode": "diffusion",
|
| 11 |
+
"qk_norm": true,
|
| 12 |
+
"attn_bias": false,
|
| 13 |
+
"use_final_norm": true
|
| 14 |
+
},
|
| 15 |
+
"encoder_hidden_size": 768,
|
| 16 |
+
"latent_h": 32,
|
| 17 |
+
"latent_w": 32,
|
| 18 |
+
"latent_mean": [
|
| 19 |
+
-0.69,
|
| 20 |
+
-0.48,
|
| 21 |
+
-0.6,
|
| 22 |
+
0.28
|
| 23 |
+
],
|
| 24 |
+
"latent_std": [
|
| 25 |
+
12.38,
|
| 26 |
+
11.22,
|
| 27 |
+
7.93,
|
| 28 |
+
21.22
|
| 29 |
+
],
|
| 30 |
+
"pretrain_mode": "imagenet",
|
| 31 |
+
"source_ckpt": "ttvidt-dit-pretrain/tsiyjsvh/checkpoints/epoch=19-step=100000.ckpt"
|
| 32 |
+
}
|
pretrain_imgnet_S_qknorm.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58d3b381d18602df0b0d97ea8557211de79c79e50d1e339c90f3bc483134f66f
|
| 3 |
+
size 654121648
|
pretrain_imgnet_S_qknorm_nofinal.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 12,
|
| 7 |
+
"hidden_size": 768,
|
| 8 |
+
"intermediate_size": 3072,
|
| 9 |
+
"num_heads": 12,
|
| 10 |
+
"decode_mode": "diffusion",
|
| 11 |
+
"qk_norm": true,
|
| 12 |
+
"attn_bias": false,
|
| 13 |
+
"use_final_norm": false
|
| 14 |
+
},
|
| 15 |
+
"encoder_hidden_size": 768,
|
| 16 |
+
"latent_h": 32,
|
| 17 |
+
"latent_w": 32,
|
| 18 |
+
"latent_mean": [
|
| 19 |
+
-0.69,
|
| 20 |
+
-0.48,
|
| 21 |
+
-0.6,
|
| 22 |
+
0.28
|
| 23 |
+
],
|
| 24 |
+
"latent_std": [
|
| 25 |
+
12.38,
|
| 26 |
+
11.22,
|
| 27 |
+
7.93,
|
| 28 |
+
21.22
|
| 29 |
+
],
|
| 30 |
+
"pretrain_mode": "imagenet",
|
| 31 |
+
"source_ckpt": "ttvidt-dit-pretrain/gwv1kxjd/checkpoints/epoch=19-step=100000.ckpt"
|
| 32 |
+
}
|
pretrain_imgnet_S_qknorm_nofinal.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f57d0647744439b4c48219ac433eb21ffcff4ef46d6b07d6a859be813c0f6a5c
|
| 3 |
+
size 654118496
|
pretrain_video_B_1024d16l.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 16,
|
| 7 |
+
"hidden_size": 1024,
|
| 8 |
+
"intermediate_size": 4096,
|
| 9 |
+
"num_heads": 16,
|
| 10 |
+
"decode_mode": "diffusion"
|
| 11 |
+
},
|
| 12 |
+
"encoder_hidden_size": 768,
|
| 13 |
+
"latent_h": 32,
|
| 14 |
+
"latent_w": 32,
|
| 15 |
+
"latent_mean": [
|
| 16 |
+
-0.69,
|
| 17 |
+
-0.48,
|
| 18 |
+
-0.6,
|
| 19 |
+
0.28
|
| 20 |
+
],
|
| 21 |
+
"latent_std": [
|
| 22 |
+
12.38,
|
| 23 |
+
11.22,
|
| 24 |
+
7.93,
|
| 25 |
+
21.22
|
| 26 |
+
],
|
| 27 |
+
"pretrain_mode": "video",
|
| 28 |
+
"source_ckpt": "ttvidt-dit-pretrain/7isfknx9/checkpoints/epoch=14-step=100000.ckpt"
|
| 29 |
+
}
|
pretrain_video_B_1024d16l.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47fe8dd97160a15d65bc8a0f295590b448c6895e21ef37b9e42ab84a879f31fb
|
| 3 |
+
size 1553397056
|
pretrain_video_L_1152d28l_80k.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 28,
|
| 7 |
+
"hidden_size": 1152,
|
| 8 |
+
"intermediate_size": 3456,
|
| 9 |
+
"num_heads": 16,
|
| 10 |
+
"decode_mode": "diffusion"
|
| 11 |
+
},
|
| 12 |
+
"encoder_hidden_size": 768,
|
| 13 |
+
"latent_h": 32,
|
| 14 |
+
"latent_w": 32,
|
| 15 |
+
"latent_mean": [
|
| 16 |
+
-0.69,
|
| 17 |
+
-0.48,
|
| 18 |
+
-0.6,
|
| 19 |
+
0.28
|
| 20 |
+
],
|
| 21 |
+
"latent_std": [
|
| 22 |
+
12.38,
|
| 23 |
+
11.22,
|
| 24 |
+
7.93,
|
| 25 |
+
21.22
|
| 26 |
+
],
|
| 27 |
+
"pretrain_mode": "video",
|
| 28 |
+
"source_ckpt": "ttvidt-dit-pretrain/2v3xqhf2/checkpoints/epoch=11-step=80000.ckpt"
|
| 29 |
+
}
|
pretrain_video_L_1152d28l_80k.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:258ef38f5f41c9a1d073f1934aa58582f12b824b13cc91303e1fee7787784e28
|
| 3 |
+
size 2985899600
|
pretrain_video_L_qknorm.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 28,
|
| 7 |
+
"hidden_size": 1152,
|
| 8 |
+
"intermediate_size": 3456,
|
| 9 |
+
"num_heads": 16,
|
| 10 |
+
"decode_mode": "diffusion",
|
| 11 |
+
"qk_norm": true,
|
| 12 |
+
"attn_bias": false,
|
| 13 |
+
"use_final_norm": true
|
| 14 |
+
},
|
| 15 |
+
"encoder_hidden_size": 768,
|
| 16 |
+
"latent_h": 32,
|
| 17 |
+
"latent_w": 32,
|
| 18 |
+
"latent_mean": [
|
| 19 |
+
-0.69,
|
| 20 |
+
-0.48,
|
| 21 |
+
-0.6,
|
| 22 |
+
0.28
|
| 23 |
+
],
|
| 24 |
+
"latent_std": [
|
| 25 |
+
12.38,
|
| 26 |
+
11.22,
|
| 27 |
+
7.93,
|
| 28 |
+
21.22
|
| 29 |
+
],
|
| 30 |
+
"pretrain_mode": "video",
|
| 31 |
+
"source_ckpt": "ttvidt-dit-pretrain/j7hkyq3z/checkpoints/epoch=14-step=100000.ckpt"
|
| 32 |
+
}
|
pretrain_video_L_qknorm.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49e3b7b1d430c64c767f9a50bd1b1dd351d0e9597533f8f7adc39392bd75ddf1
|
| 3 |
+
size 2979568560
|
pretrain_video_S_qknorm.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 12,
|
| 7 |
+
"hidden_size": 768,
|
| 8 |
+
"intermediate_size": 3072,
|
| 9 |
+
"num_heads": 12,
|
| 10 |
+
"decode_mode": "diffusion",
|
| 11 |
+
"qk_norm": true,
|
| 12 |
+
"attn_bias": false,
|
| 13 |
+
"use_final_norm": true
|
| 14 |
+
},
|
| 15 |
+
"encoder_hidden_size": 768,
|
| 16 |
+
"latent_h": 32,
|
| 17 |
+
"latent_w": 32,
|
| 18 |
+
"latent_mean": [
|
| 19 |
+
-0.69,
|
| 20 |
+
-0.48,
|
| 21 |
+
-0.6,
|
| 22 |
+
0.28
|
| 23 |
+
],
|
| 24 |
+
"latent_std": [
|
| 25 |
+
12.38,
|
| 26 |
+
11.22,
|
| 27 |
+
7.93,
|
| 28 |
+
21.22
|
| 29 |
+
],
|
| 30 |
+
"pretrain_mode": "video",
|
| 31 |
+
"source_ckpt": "ttvidt-dit-pretrain/baec0kfz/checkpoints/epoch=14-step=100000.ckpt"
|
| 32 |
+
}
|
pretrain_video_S_qknorm.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79067d2ecc5355f44bcaa6fb88baf383db787bf770ffdef77bb7875fda100a28
|
| 3 |
+
size 654121648
|
pretrain_video_S_qknorm_nofinal.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"decoder_config": {
|
| 3 |
+
"image_dim": 4,
|
| 4 |
+
"patch_size": 2,
|
| 5 |
+
"decoder_style": "dit",
|
| 6 |
+
"num_layers": 12,
|
| 7 |
+
"hidden_size": 768,
|
| 8 |
+
"intermediate_size": 3072,
|
| 9 |
+
"num_heads": 12,
|
| 10 |
+
"decode_mode": "diffusion",
|
| 11 |
+
"qk_norm": true,
|
| 12 |
+
"attn_bias": false,
|
| 13 |
+
"use_final_norm": false
|
| 14 |
+
},
|
| 15 |
+
"encoder_hidden_size": 768,
|
| 16 |
+
"latent_h": 32,
|
| 17 |
+
"latent_w": 32,
|
| 18 |
+
"latent_mean": [
|
| 19 |
+
-0.69,
|
| 20 |
+
-0.48,
|
| 21 |
+
-0.6,
|
| 22 |
+
0.28
|
| 23 |
+
],
|
| 24 |
+
"latent_std": [
|
| 25 |
+
12.38,
|
| 26 |
+
11.22,
|
| 27 |
+
7.93,
|
| 28 |
+
21.22
|
| 29 |
+
],
|
| 30 |
+
"pretrain_mode": "video",
|
| 31 |
+
"source_ckpt": "ttvidt-dit-pretrain/1c4ts6z2/checkpoints/epoch=14-step=100000.ckpt"
|
| 32 |
+
}
|
pretrain_video_S_qknorm_nofinal.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e63d0b9b2485e3857647e4f0ba81283609a507f7a2d69ff7b7ac583b27c24f6b
|
| 3 |
+
size 654118496
|