Safetensors
xiwenyoumu committed on
Commit
612bf87
·
verified ·
1 Parent(s): 0ba5e1d

Upload folder using huggingface_hub

Browse files
mhla_videogen/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "wan_t2v",
3
+ "model_name": "Wan_T2V_1300M",
4
+ "image_size": 256,
5
+ "video_width": 800,
6
+ "video_height": 480,
7
+ "num_frames": 81,
8
+ "patch_size": [
9
+ 1,
10
+ 2,
11
+ 2
12
+ ],
13
+ "dim": 1536,
14
+ "ffn_dim": 8960,
15
+ "freq_dim": 256,
16
+ "num_heads": 12,
17
+ "num_layers": 30,
18
+ "window_size": [
19
+ -1,
20
+ -1
21
+ ],
22
+ "qk_norm": true,
23
+ "cross_attn_norm": true,
24
+ "eps": 1e-06,
25
+ "self_attn_type": "mhla",
26
+ "rope_after": true,
27
+ "rms_output": true,
28
+ "norm_output": false,
29
+ "mhla_adjust": true,
30
+ "without_rope": false,
31
+ "is_gated": false,
32
+ "is_lepe": false,
33
+ "ffn_type": "mlp",
34
+ "linear_attn_idx": [
35
+ 1,
36
+ 2,
37
+ 4,
38
+ 5,
39
+ 7,
40
+ 8,
41
+ 10,
42
+ 11,
43
+ 13,
44
+ 14,
45
+ 16,
46
+ 17,
47
+ 19,
48
+ 20,
49
+ 22,
50
+ 23,
51
+ 25,
52
+ 26,
53
+ 28,
54
+ 29
55
+ ],
56
+ "training_epoch": 2,
57
+ "training_step": 47000
58
+ }
mhla_videogen/config.yaml ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ data_dir:
3
+ toy_data: toy_data
4
+ caption_proportion:
5
+ prompt: 1
6
+ external_caption_suffixes: []
7
+ external_clipscore_suffixes: []
8
+ caption_selection_type: clipscore
9
+ clip_thr_temperature: 0.1
10
+ clip_thr: 25.0
11
+ del_img_clip_thr: 22.0
12
+ sort_dataset: false
13
+ load_text_feat: false
14
+ load_vae_feat: true
15
+ aspect_ratio_type: ASPECT_RATIO_VIDEO_256_MS
16
+ transform: default_train_video
17
+ type: SanaZipDatasetWithCache
18
+ image_size: 256
19
+ hq_only: false
20
+ valid_num: 0
21
+ data: null
22
+ num_frames: 81
23
+ extra: null
24
+ external_data_filter:
25
+ toy_data:
26
+ _image_quality:
27
+ min: 0.5
28
+ max: 1.0
29
+
30
+ motion_score_file_thres: {}
31
+ motion_score_cal_type: average
32
+ target_fps: 16
33
+ resample_fps: true
34
+ shuffle_dataset: false
35
+ vae_cache_dir:
36
+ json_cache_dir: null
37
+ load_first_frame: false
38
+ model:
39
+ model: Wan_T2V_1300M
40
+ from_pretrained: null
41
+ load_model_ckpt:
42
+ init_patch_embedding: false
43
+ image_size: 256
44
+ video_width: 800
45
+ video_height: 480
46
+ num_frames: 81
47
+ patch_size:
48
+ - 1
49
+ - 2
50
+ - 2
51
+ dim: 1536
52
+ ffn_dim: 8960
53
+ freq_dim: 256
54
+ num_heads: 12
55
+ num_layers: 30
56
+ window_size:
57
+ - -1
58
+ - -1
59
+ qk_norm: true
60
+ cross_attn_norm: true
61
+ eps: 1.0e-06
62
+ mixed_precision: bf16
63
+ fp32_attention: true
64
+ load_from: null
65
+ resume_from:
66
+ checkpoint: latest
67
+ load_ema: false
68
+ resume_optimizer: true
69
+ resume_lr_scheduler: true
70
+ aspect_ratio_type: ASPECT_RATIO_VIDEO_480
71
+ multi_scale: false
72
+ class_dropout_prob: 0.1
73
+ guidance_type: classifier-free
74
+ mask: null
75
+ image_latent_mode: video_zero
76
+ linear_attn_idx:
77
+ - 1
78
+ - 2
79
+ - 4
80
+ - 5
81
+ - 7
82
+ - 8
83
+ - 10
84
+ - 11
85
+ - 13
86
+ - 14
87
+ - 16
88
+ - 17
89
+ - 19
90
+ - 20
91
+ - 22
92
+ - 23
93
+ - 25
94
+ - 26
95
+ - 28
96
+ - 29
97
+ self_attn_type: mhla
98
+ rope_after: true
99
+ rms_output: true
100
+ norm_output: false
101
+ mhla_adjust: true
102
+ without_rope: false
103
+ is_gated: false
104
+ is_lepe: false
105
+ block_layout: null
106
+ power: 1.0
107
+ ffn_type: mlp
108
+ attn_mask: null
109
+ diagonal_block_size: 1
110
+ vae:
111
+ vae_type: WanVAE
112
+ vae_latent_dim: 16
113
+ vae_pretrained:
114
+ vae_stride:
115
+ - 4
116
+ - 8
117
+ - 8
118
+ vae_downsample_rate: 8
119
+ weight_dtype: bf16
120
+ extra: null
121
+ cache_dir: null
122
+ if_cache: false
123
+ text_encoder:
124
+ t5_model: umt5_xxl
125
+ t5_dtype: bfloat16
126
+ text_len: 512
127
+ t5_checkpoint: null
128
+ t5_tokenizer: google/umt5-xxl
129
+ extra: null
130
+ caption_channels: 4096
131
+ scheduler:
132
+ train_sampling_steps: 1000
133
+ predict_flow_v: true
134
+ noise_schedule: linear_flow
135
+ pred_sigma: false
136
+ learn_sigma: true
137
+ vis_sampler: flow_dpm-solver
138
+ flow_shift: 3.0
139
+ inference_flow_shift: null
140
+ weighting_scheme: logit_normal
141
+ weighting_scheme_discriminator: logit_normal_trigflow
142
+ add_noise_timesteps:
143
+ - 1.5708
144
+ logit_mean: 0.0
145
+ logit_std: 1.0
146
+ logit_mean_discriminator: 0.0
147
+ logit_std_discriminator: 1.0
148
+ mode_scale: 1.29
149
+ sigma_data: 1.0
150
+ p_low: null
151
+ p_high: null
152
+ timestep_norm_scale_factor: 1.0
153
+ pretrain_timestep_norm_scale_factor: 1.0
154
+ discrete_norm_timestep: false
155
+ extra: null
156
+ train:
157
+ num_workers: 10
158
+ seed: 1
159
+ train_batch_size: 4
160
+ train_batch_size_image: 32
161
+ early_stop_hours: 100
162
+ num_epochs: 100
163
+ gradient_accumulation_steps: 1
164
+ grad_checkpointing: true
165
+ gradient_clip: 0.1
166
+ gc_step: 1
167
+ optimizer:
168
+ betas:
169
+ - 0.9
170
+ - 0.999
171
+ eps: 1.0e-10
172
+ lr: 2.0e-05
173
+ type: AdamW
174
+ weight_decay: 0.0
175
+ optimizer_D:
176
+ eps: 1.0e-10
177
+ lr: 0.0001
178
+ type: AdamW
179
+ weight_decay: 0.03
180
+ load_from_optimizer: false
181
+ load_from_lr_scheduler: false
182
+ resume_lr_scheduler: true
183
+ lr_schedule: constant
184
+ lr_schedule_args:
185
+ num_warmup_steps: 1000
186
+ auto_lr: null
187
+ ema_rate: 0.9999
188
+ eval_batch_size: 16
189
+ use_fsdp: false
190
+ use_flash_attn: false
191
+ eval_sampling_steps: 500
192
+ lora_rank: 4
193
+ log_interval: 20
194
+ mask_type: 'null'
195
+ mask_loss_coef: 0.0
196
+ load_mask_index: false
197
+ snr_loss: false
198
+ real_prompt_ratio: 1.0
199
+ save_image_epochs: 1
200
+ save_model_epochs: 5
201
+ save_model_steps: 500
202
+ visualize: true
203
+ null_embed_root: output/pretrained_models/
204
+ valid_prompt_embed_root: output/tmp_embed/
205
+ validation_prompts:
206
+ - soft lighting and warm colors infuse the image, creating a magical and serene
207
+ effect. the view captures the serene guru man levitating above the golden
208
+ sands, his long, flowing beard and simple robes gently swaying. his eyes are
209
+ closed, and a peaceful smile graces his calm face. a gentle glow surrounds
210
+ him, enhancing his aura of tranquility. behind him, the majestic pyramids
211
+ of egypt loom, bathed in the warm light of the setting sun. the softly glowing
212
+ sands and the pyramid silhouettes create a composition rich with spirituality
213
+ and enlightenment, exuding an atmosphere of profound calmness.
214
+ - a soft-focus view captures a serene garden, filled with cherry blossom trees
215
+ in bloom. at the center stands a beautiful japanese woman, portrayed in exquisite
216
+ detail, wearing a traditional kimono with intricate floral patterns in soft
217
+ pastel colors. her long, dark hair cascades elegantly down her back, enhancing
218
+ her gentle, serene expression. pink petals drift down in a light breeze, adding
219
+ to the garden's ethereal ambiance. sunlight filters through the leafy canopy,
220
+ casting dappled shadows that dance around her, subtly highlighting her serene
221
+ posture and the details of her kimono. the atmosphere is tranquil and picturesque,
222
+ enveloped in a sense of timeless beauty.
223
+ - the angle is mid-range, focusing on the well-dressed asian male friend sitting
224
+ comfortably in a modern and stylish cafe. the setting exudes warmth and elegance,
225
+ with soft music playing in the background to create an inviting atmosphere.
226
+ the man holds a small gift box in his hand, his face illuminated by a confident
227
+ smile as he looks around, his expression full of anticipation. his attire,
228
+ a blend of classic and contemporary style, complements the chic surroundings,
229
+ enhancing his poised demeanor. the ambient lighting accentuates his features,
230
+ making the scene lively and intimate.
231
+ - the setting is a training facility, brightly lit with mirrored walls and sprung
232
+ wooden floors. the scene starts with the camera panning slowly over the room,
233
+ capturing the boundless determination in action as a group of korean girls
234
+ rigorously practice their moves. each one is focused, their expressions marked
235
+ by the intensity of a long and arduous journey. they are seen rehearsing complex
236
+ choreography, with synchronized steps and practiced precision. alongside the
237
+ strenuous physical training, snippets of their vocal lessons are interwoven,
238
+ illustrating the multifaceted preparation involved. the atmosphere is one
239
+ of dedication and discipline, reflected in their commitment to rigorous exercise
240
+ and strict dietary habits. these scenes provide a glimpse into the grueling
241
+ yet passionate pursuit of their dreams.
242
+ - astronaut is riding a horse on the moon, wearing a space suit and helmet. the
243
+ horse is galloping across the lunar surface, leaving behind a trail of moon
244
+ dust. in the background, earth is visible in the black sky, a beautiful blue
245
+ and green marble. the astronaut is holding a flag with a logo on it, waving
246
+ it proudly as they ride. the scene is surreal and whimsical, capturing the
247
+ imagination of space exploration and adventure.
248
+ local_save_vis: true
249
+ deterministic_validation: true
250
+ online_metric: false
251
+ eval_metric_step: 2000
252
+ online_metric_dir: metric_helper
253
+ work_dir: output/debug
254
+ skip_step: 0
255
+ loss_type: huber
256
+ huber_c: 0.001
257
+ num_ddim_timesteps: 50
258
+ w_max: 15.0
259
+ w_min: 3.0
260
+ ema_decay: 0.95
261
+ debug_nan: false
262
+ ema_update: false
263
+ weight_loss: true
264
+ tangent_warmup_steps: 10000
265
+ scm_cfg_scale:
266
+ - 1.0
267
+ cfg_interval: null
268
+ scm_logvar_loss: true
269
+ norm_invariant_to_spatial_dim: true
270
+ norm_same_as_512_scale: false
271
+ g_norm_constant: 0.1
272
+ g_norm_r: 1.0
273
+ show_gradient: false
274
+ lr_scale: null
275
+ adv_lambda: 1.0
276
+ scm_loss: true
277
+ scm_lambda: 1.0
278
+ loss_scale: 1.0
279
+ r1_penalty: false
280
+ r1_penalty_weight: 1.0e-05
281
+ diff_timesteps_D: true
282
+ suffix_checkpoints: disc
283
+ misaligned_pairs_D: false
284
+ discriminator_loss: cross entropy
285
+ largest_timestep: 1.5708
286
+ train_largest_timestep: false
287
+ largest_timestep_prob: 0.5
288
+ reconstruct_loss: false
289
+ reconstruct_loss_type: huber
290
+ vis_grad: false
291
+ extra: null
292
+ offload_vae: false
293
+ offload_text_encoder: false
294
+ deepspeed_stage: null
295
+ sp_degree: 1
296
+ fsdp_config: null
297
+ fsdp_inference: false
298
+ train_la_only: false
299
+ work_dir: output/debug
300
+ resume_from: latest
301
+ load_from: null
302
+ debug: false
303
+ caching: false
304
+ report_to: wandb
305
+ tracker_project_name: wan-video
306
+ name: debug
307
+ loss_report_name: loss
308
+ task: t2v
309
+ image_encoder:
310
+ image_encoder_type: null
311
+ image_encoder_pretrained: null
312
+ image_encoder_tokenizer: null
313
+ weight_dtype: float32
314
+ extra: null
315
+ distill: null
316
+ lora: null
317
+ cfg_scale: 3.0
mhla_videogen/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c5628e7d98e79ec5af71031446fc605be791b5bf5b2ceb53dc4b03cde8f6775
3
+ size 5677997904