FlashStight commited on
Commit
602e3bc
·
verified ·
1 Parent(s): b75ad33

Upload 26 files

Browse files

Initial submission

checkpoint/.DS_Store ADDED
Binary file (6.15 kB). View file
 
checkpoint/denoiser/config.json ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.32.2",
4
+ "_name_or_path": "/data02/zhenchen/VTO_mobile/result/cat_final/checkpoint-8280",
5
+ "act_fn": "hardswish",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": 256,
9
+ "attention_head_dim": [
10
+ 4,
11
+ 8,
12
+ 14
13
+ ],
14
+ "attention_type": "default",
15
+ "attn_module": "Attention",
16
+ "attn_processor_type": "AttnProcessor2_0",
17
+ "block_out_channels": [
18
+ 256,
19
+ 512,
20
+ 896
21
+ ],
22
+ "center_input_sample": false,
23
+ "class_embed_type": null,
24
+ "class_embeddings_concat": false,
25
+ "context_embedding_caption_projection_dim": 2048,
26
+ "context_embedding_text_embedding_dim": 4096,
27
+ "conv_in_dw_bias": true,
28
+ "conv_in_kernel": 3,
29
+ "conv_in_module": "Conv2d",
30
+ "conv_in_pw_bias": false,
31
+ "conv_out_dw_bias": true,
32
+ "conv_out_kernel": 3,
33
+ "conv_out_module": "Conv2d",
34
+ "conv_out_pw_bias": false,
35
+ "cross_attention_dim": 4096,
36
+ "cross_attention_norm": null,
37
+ "down_block_ff_mult": 3,
38
+ "down_block_kv_heads": 1,
39
+ "down_block_qk_norm": "layer_norm",
40
+ "down_block_resnet_dw_bias": true,
41
+ "down_block_resnet_middle_expansion": 2,
42
+ "down_block_resnet_middle_expansion_type": "input",
43
+ "down_block_resnet_pw_bias": false,
44
+ "down_block_types": [
45
+ "CrossAttnDownBlock2D",
46
+ "CrossAttnDownBlock2D",
47
+ "CrossAttnDownBlock2D"
48
+ ],
49
+ "down_block_use_self_attention": [
50
+ false,
51
+ false,
52
+ true
53
+ ],
54
+ "downsample_conv_module": "SepConv2d",
55
+ "downsample_module": "Downsample2D",
56
+ "downsample_padding": 1,
57
+ "dropout": 0.0,
58
+ "dual_cross_attention": false,
59
+ "encoder_hid_dim": null,
60
+ "encoder_hid_dim_type": "ip_image_proj",
61
+ "encoder_type": "dinov2_base",
62
+ "flip_sin_to_cos": true,
63
+ "freq_shift": 0,
64
+ "height": 2048,
65
+ "in_channels": 32,
66
+ "layers_per_block": 2,
67
+ "mid_block_ff_mult": 3,
68
+ "mid_block_kv_heads": 1,
69
+ "mid_block_only_cross_attention": null,
70
+ "mid_block_qk_norm": "layer_norm",
71
+ "mid_block_resnet_dw_bias": true,
72
+ "mid_block_resnet_middle_expansion": 2,
73
+ "mid_block_resnet_middle_expansion_type": "input",
74
+ "mid_block_resnet_pw_bias": false,
75
+ "mid_block_scale_factor": 1,
76
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
77
+ "mid_block_use_additional_resnet": false,
78
+ "mid_block_use_self_attention": true,
79
+ "norm_eps": 1e-05,
80
+ "norm_num_groups": 32,
81
+ "num_attention_heads": null,
82
+ "num_class_embeds": null,
83
+ "only_cross_attention": false,
84
+ "out_channels": 16,
85
+ "pooled_projection_dim": 2048,
86
+ "projection_class_embeddings_input_dim": 2816,
87
+ "resnet_conv_module": "SepConv2d",
88
+ "resnet_module": "ResnetBlock2D",
89
+ "resnet_out_scale_factor": 1.0,
90
+ "resnet_skip_time_act": false,
91
+ "resnet_time_scale_shift": "default",
92
+ "reverse_transformer_layers_per_block": null,
93
+ "sample_size": 128,
94
+ "time_cond_proj_dim": null,
95
+ "time_embedding_act_fn": null,
96
+ "time_embedding_dim": null,
97
+ "time_embedding_module": "TimestepEmbedding",
98
+ "time_embedding_type": "positional",
99
+ "time_text_embedding_act_fn": "hardswish",
100
+ "time_text_embedding_mode": "default",
101
+ "time_text_embedding_module": "CombinedTimestepTextProjEmbeddings",
102
+ "time_text_embedding_pooled_projection_dim": 2048,
103
+ "time_text_embedding_time_embed_dim": 896,
104
+ "timestep_post_act": null,
105
+ "transformer2d_model_type": "Transformer2DModel",
106
+ "transformer_block_type": "BasicTransformerBlockTryOn",
107
+ "transformer_layers_per_block": [
108
+ 1,
109
+ 2,
110
+ 4
111
+ ],
112
+ "up_block_ff_mult": 3,
113
+ "up_block_kv_heads": 1,
114
+ "up_block_qk_norm": "layer_norm",
115
+ "up_block_receive_additional_residuals": [
116
+ false,
117
+ true,
118
+ true
119
+ ],
120
+ "up_block_resnet_dw_bias": true,
121
+ "up_block_resnet_middle_expansion": 2,
122
+ "up_block_resnet_middle_expansion_type": "input",
123
+ "up_block_resnet_pw_bias": false,
124
+ "up_block_types": [
125
+ "CrossAttnUpBlock2D",
126
+ "CrossAttnUpBlock2D",
127
+ "UpBlock2D"
128
+ ],
129
+ "up_block_use_self_attention": [
130
+ true,
131
+ false,
132
+ false
133
+ ],
134
+ "upcast_attention": null,
135
+ "upsample_conv_module": "SepConv2d",
136
+ "upsample_module": "Upsample2D",
137
+ "use_linear_projection": true,
138
+ "use_pooled_projection": false,
139
+ "use_rope": true,
140
+ "width": 768
141
+ }
checkpoint/denoiser/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34ee9331e414a8d89ebd8c4c485094947f1fa710ef34d40a7726a623a7f3eca2
3
+ size 840173776
checkpoint/denoiser_garment/config.json ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.32.2",
4
+ "act_fn": "hardswish",
5
+ "addition_embed_type": null,
6
+ "addition_embed_type_num_heads": 64,
7
+ "addition_time_embed_dim": 256,
8
+ "attention_head_dim": [
9
+ 4,
10
+ 8,
11
+ 14
12
+ ],
13
+ "attention_type": "default",
14
+ "attn_module": "Attention",
15
+ "attn_processor_type": "AttnProcessor2_0",
16
+ "block_out_channels": [
17
+ 256,
18
+ 512,
19
+ 896
20
+ ],
21
+ "center_input_sample": false,
22
+ "class_embed_type": null,
23
+ "class_embeddings_concat": false,
24
+ "context_embedding_caption_projection_dim": 2048,
25
+ "context_embedding_text_embedding_dim": 4096,
26
+ "conv_in_dw_bias": true,
27
+ "conv_in_kernel": 3,
28
+ "conv_in_module": "Conv2d",
29
+ "conv_in_pw_bias": false,
30
+ "conv_out_dw_bias": true,
31
+ "conv_out_kernel": 3,
32
+ "conv_out_module": "Conv2d",
33
+ "conv_out_pw_bias": false,
34
+ "cross_attention_dim": 4096,
35
+ "cross_attention_norm": null,
36
+ "down_block_ff_mult": 3,
37
+ "down_block_kv_heads": 1,
38
+ "down_block_qk_norm": "layer_norm",
39
+ "down_block_resnet_dw_bias": true,
40
+ "down_block_resnet_middle_expansion": 2,
41
+ "down_block_resnet_middle_expansion_type": "input",
42
+ "down_block_resnet_pw_bias": false,
43
+ "down_block_types": [
44
+ "CrossAttnDownBlock2D",
45
+ "CrossAttnDownBlock2D",
46
+ "CrossAttnDownBlock2D"
47
+ ],
48
+ "down_block_use_self_attention": [
49
+ false,
50
+ false,
51
+ true
52
+ ],
53
+ "downsample_conv_module": "SepConv2d",
54
+ "downsample_module": "Downsample2D",
55
+ "downsample_padding": 1,
56
+ "dropout": 0.0,
57
+ "dual_cross_attention": false,
58
+ "encoder_hid_dim": null,
59
+ "encoder_hid_dim_type": null,
60
+ "flip_sin_to_cos": true,
61
+ "freq_shift": 0,
62
+ "height": 1024,
63
+ "in_channels": 16,
64
+ "layers_per_block": 2,
65
+ "mid_block_ff_mult": 3,
66
+ "mid_block_kv_heads": 1,
67
+ "mid_block_only_cross_attention": null,
68
+ "mid_block_qk_norm": "layer_norm",
69
+ "mid_block_resnet_dw_bias": true,
70
+ "mid_block_resnet_middle_expansion": 2,
71
+ "mid_block_resnet_middle_expansion_type": "input",
72
+ "mid_block_resnet_pw_bias": false,
73
+ "mid_block_scale_factor": 1,
74
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
75
+ "mid_block_use_additional_resnet": false,
76
+ "mid_block_use_self_attention": true,
77
+ "norm_eps": 1e-05,
78
+ "norm_num_groups": 32,
79
+ "num_attention_heads": null,
80
+ "num_class_embeds": null,
81
+ "only_cross_attention": false,
82
+ "out_channels": 16,
83
+ "pooled_projection_dim": 2048,
84
+ "projection_class_embeddings_input_dim": 2816,
85
+ "resnet_conv_module": "SepConv2d",
86
+ "resnet_module": "ResnetBlock2D",
87
+ "resnet_out_scale_factor": 1.0,
88
+ "resnet_skip_time_act": false,
89
+ "resnet_time_scale_shift": "default",
90
+ "reverse_transformer_layers_per_block": null,
91
+ "sample_size": 128,
92
+ "time_cond_proj_dim": null,
93
+ "time_embedding_act_fn": null,
94
+ "time_embedding_dim": null,
95
+ "time_embedding_module": "TimestepEmbedding",
96
+ "time_embedding_type": "positional",
97
+ "time_text_embedding_act_fn": "hardswish",
98
+ "time_text_embedding_mode": "default",
99
+ "time_text_embedding_module": "CombinedTimestepTextProjEmbeddings",
100
+ "time_text_embedding_pooled_projection_dim": 2048,
101
+ "time_text_embedding_time_embed_dim": 896,
102
+ "timestep_post_act": null,
103
+ "transformer2d_model_type": "Transformer2DModel",
104
+ "transformer_block_type": "BasicTransformerBlockGarment",
105
+ "transformer_layers_per_block": [
106
+ 1,
107
+ 2,
108
+ 4
109
+ ],
110
+ "up_block_ff_mult": 3,
111
+ "up_block_kv_heads": 1,
112
+ "up_block_qk_norm": "layer_norm",
113
+ "up_block_receive_additional_residuals": [
114
+ false,
115
+ true,
116
+ true
117
+ ],
118
+ "up_block_resnet_dw_bias": true,
119
+ "up_block_resnet_middle_expansion": 2,
120
+ "up_block_resnet_middle_expansion_type": "input",
121
+ "up_block_resnet_pw_bias": false,
122
+ "up_block_types": [
123
+ "CrossAttnUpBlock2D",
124
+ "CrossAttnUpBlock2D",
125
+ "UpBlock2D"
126
+ ],
127
+ "up_block_use_self_attention": [
128
+ true,
129
+ false,
130
+ false
131
+ ],
132
+ "upcast_attention": null,
133
+ "upsample_conv_module": "SepConv2d",
134
+ "upsample_module": "Upsample2D",
135
+ "use_linear_projection": true,
136
+ "use_pooled_projection": false,
137
+ "use_rope": true,
138
+ "width": 768
139
+ }
checkpoint/denoiser_garment/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:902e5f6d47e9aef909497d5d7d400dd51cb39764fac63aca1e5b730927e53184
3
+ size 759200976
checkpoint/image_encoder/config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../dinov2_base",
3
+ "apply_layernorm": true,
4
+ "architectures": [
5
+ "Dinov2Model"
6
+ ],
7
+ "attention_probs_dropout_prob": 0.0,
8
+ "drop_path_rate": 0.0,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.0,
11
+ "hidden_size": 768,
12
+ "image_size": 518,
13
+ "initializer_range": 0.02,
14
+ "layer_norm_eps": 1e-06,
15
+ "layerscale_value": 1.0,
16
+ "mlp_ratio": 4,
17
+ "model_type": "dinov2",
18
+ "num_attention_heads": 12,
19
+ "num_channels": 3,
20
+ "num_hidden_layers": 12,
21
+ "out_features": [
22
+ "stage12"
23
+ ],
24
+ "out_indices": [
25
+ 12
26
+ ],
27
+ "patch_size": 14,
28
+ "qkv_bias": true,
29
+ "reshape_hidden_states": true,
30
+ "stage_names": [
31
+ "stem",
32
+ "stage1",
33
+ "stage2",
34
+ "stage3",
35
+ "stage4",
36
+ "stage5",
37
+ "stage6",
38
+ "stage7",
39
+ "stage8",
40
+ "stage9",
41
+ "stage10",
42
+ "stage11",
43
+ "stage12"
44
+ ],
45
+ "torch_dtype": "bfloat16",
46
+ "transformers_version": "4.42.0",
47
+ "use_swiglu_ffn": false
48
+ }
checkpoint/image_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a39e985dd10fd1ae72faa49c35ad211a17dc0457ebe7dcbe7b32b92da4007506
3
+ size 173185040
checkpoint/image_encoder/preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.485,
13
+ 0.456,
14
+ 0.406
15
+ ],
16
+ "image_processor_type": "BitImageProcessor",
17
+ "image_std": [
18
+ 0.229,
19
+ 0.224,
20
+ 0.225
21
+ ],
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "shortest_edge": 256
26
+ }
27
+ }
checkpoint/model_index.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "T2IMobilePipelineV1_3_NotLoadingT5_Decoder",
3
+ "_diffusers_version": "0.32.2",
4
+ "denoiser": [
5
+ "Mobile_VTON.models.unets.unet_2d_condition_tryon",
6
+ "UNet2DConditionModel"
7
+ ],
8
+ "denoiser_garment": [
9
+ "Mobile_VTON.models.unets.unet_2d_condition_garment",
10
+ "UNet2DConditionModel"
11
+ ],
12
+ "feature_extractor": [
13
+ null,
14
+ null
15
+ ],
16
+ "image_encoder": [
17
+ "transformers",
18
+ "Dinov2Model"
19
+ ],
20
+ "scheduler": [
21
+ "diffusers",
22
+ "FlowMatchEulerDiscreteScheduler"
23
+ ],
24
+ "text_encoder": [
25
+ "transformers",
26
+ "CLIPTextModelWithProjection"
27
+ ],
28
+ "text_encoder_2": [
29
+ "transformers",
30
+ "CLIPTextModelWithProjection"
31
+ ],
32
+ "tokenizer": [
33
+ "transformers",
34
+ "CLIPTokenizer"
35
+ ],
36
+ "tokenizer_2": [
37
+ "transformers",
38
+ "CLIPTokenizer"
39
+ ],
40
+ "vae": [
41
+ "diffusers",
42
+ "AutoencoderKL"
43
+ ],
44
+ "vae_decoder": [
45
+ null,
46
+ null
47
+ ]
48
+ }
checkpoint/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "FlowMatchEulerDiscreteScheduler",
3
+ "_diffusers_version": "0.32.2",
4
+ "base_image_seq_len": 256,
5
+ "base_shift": 0.5,
6
+ "invert_sigmas": false,
7
+ "max_image_seq_len": 4096,
8
+ "max_shift": 1.15,
9
+ "num_train_timesteps": 1000,
10
+ "shift": 3.0,
11
+ "shift_terminal": null,
12
+ "use_beta_sigmas": false,
13
+ "use_dynamic_shifting": false,
14
+ "use_exponential_sigmas": false,
15
+ "use_karras_sigmas": false
16
+ }
checkpoint/text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/data02/zhenchen/VTO_mobile/checkpoints/text_encoder",
3
+ "architectures": [
4
+ "CLIPTextModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "quick_gelu",
11
+ "hidden_size": 768,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 768,
22
+ "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.42.0",
24
+ "vocab_size": 49408
25
+ }
checkpoint/text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4ddf499861727e83713b525ba1e087202f248a7ebb2268b04d1de09de5b67c2
3
+ size 247324096
checkpoint/text_encoder_2/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/data02/zhenchen/VTO_mobile/checkpoints/text_encoder_2",
3
+ "architectures": [
4
+ "CLIPTextModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_size": 1280,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5120,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 20,
19
+ "num_hidden_layers": 32,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 1280,
22
+ "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.42.0",
24
+ "vocab_size": 49408
25
+ }
checkpoint/text_encoder_2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4ba596d8c78b5509aafd523f619e13aa129ae1a886a1e92766d7cddd9706f3c
3
+ size 1389382688
checkpoint/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "49406": {
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49407": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "bos_token": "<|startoftext|>",
22
+ "clean_up_tokenization_spaces": true,
23
+ "do_lower_case": true,
24
+ "eos_token": "<|endoftext|>",
25
+ "errors": "replace",
26
+ "model_max_length": 77,
27
+ "pad_token": "<|endoftext|>",
28
+ "tokenizer_class": "CLIPTokenizer",
29
+ "unk_token": "<|endoftext|>"
30
+ }
checkpoint/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint/tokenizer_2/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint/tokenizer_2/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "!",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint/tokenizer_2/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "!",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49406": {
13
+ "content": "<|startoftext|>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "49407": {
21
+ "content": "<|endoftext|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "bos_token": "<|startoftext|>",
30
+ "clean_up_tokenization_spaces": true,
31
+ "do_lower_case": true,
32
+ "eos_token": "<|endoftext|>",
33
+ "errors": "replace",
34
+ "model_max_length": 77,
35
+ "pad_token": "!",
36
+ "tokenizer_class": "CLIPTokenizer",
37
+ "unk_token": "<|endoftext|>"
38
+ }
checkpoint/tokenizer_2/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint/vae/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.32.2",
4
+ "_name_or_path": "/data02/zhenchen/VTO_mobile/pretrained_models/stable-diffusion-3.5-large",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "force_upcast": true,
19
+ "in_channels": 3,
20
+ "latent_channels": 16,
21
+ "latents_mean": null,
22
+ "latents_std": null,
23
+ "layers_per_block": 2,
24
+ "mid_block_add_attention": true,
25
+ "norm_num_groups": 32,
26
+ "out_channels": 3,
27
+ "sample_size": 1024,
28
+ "scaling_factor": 1.5305,
29
+ "shift_factor": 0.0609,
30
+ "up_block_types": [
31
+ "UpDecoderBlock2D",
32
+ "UpDecoderBlock2D",
33
+ "UpDecoderBlock2D",
34
+ "UpDecoderBlock2D"
35
+ ],
36
+ "use_post_quant_conv": false,
37
+ "use_quant_conv": false
38
+ }
checkpoint/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:965f71f229a5981c0a025db771d8868998f5e4e8412dc5585c687f638fa59679
3
+ size 335306212
checkpoint/vae_decoder/decoder.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "in_channels": 16,
3
+ "out_channels": 3,
4
+ "up_block_types": [
5
+ "DecoderUpBlock2D",
6
+ "DecoderUpBlock2D",
7
+ "DecoderUpBlock2D",
8
+ "DecoderUpBlock2D"
9
+ ],
10
+ "block_out_channels": [
11
+ 64,
12
+ 128,
13
+ 256,
14
+ 256
15
+ ],
16
+ "layers_per_block": 2,
17
+ "norm_num_groups": 32,
18
+ "act_fn": "hardswish",
19
+ "mid_block_add_attention": false,
20
+ "conv_in_module": "Conv2d",
21
+ "conv_in_dw_bias": true,
22
+ "conv_in_pw_bias": false,
23
+ "conv_out_module": "Conv2d",
24
+ "conv_out_dw_bias": true,
25
+ "conv_out_pw_bias": false,
26
+ "use_mid_block": false,
27
+ "mid_block_type": "DecoderUNetMidBlock2D",
28
+ "mid_block_use_additional_resnet": true,
29
+ "resnet_middle_expansion": null,
30
+ "resnet_module": "DecoderResnetBlock2D",
31
+ "resnet_conv_module": "DecoderSepConv2d",
32
+ "attn_module": "Attention",
33
+ "attn_processor_type": "AttnProcessor2_0",
34
+ "kv_heads": 1,
35
+ "qk_norm": "layer_norm",
36
+ "layers_per_blocks": [
37
+ 3,
38
+ 3,
39
+ 2,
40
+ 1
41
+ ],
42
+ "backward_output_channels": true,
43
+ "upsample_module": "Upsample2D",
44
+ "upsample_conv_module": "Conv2d",
45
+ "resnet_dw_bias": true,
46
+ "resnet_pw_bias": true
47
+ }
checkpoint/vae_decoder/decoder.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13ab9dec8e181b6d0e2efa3ecf466018b009bf5251c798f7aafc0b258106eb17
3
+ size 7644444