File size: 2,165 Bytes
295978e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Top-level object reference: names the `Generator` symbol under the
# `humo.generate` module path.
# NOTE(review): presumably the config loader imports `path` and instantiates
# `name` with the rest of this file as its config — confirm against the loader.
__object__:
  path: humo.generate
  name: Generator

# Model definition, weight locations, and runtime options grouped under `dit`.
dit:
  model:
    # NOTE(review): presumably the loader reads the file named by
    # `__inherit__` first and lets the sibling keys below override it —
    # confirm against the config loader.
    __inherit__: humo/configs/models/Wan_14B_I2V.yaml
    # Class reference for the model: `WanModel` in `model_humo`.
    __object__:
      path: humo.models.wan_modules.model_humo
      name: WanModel
    insert_audio: true
  # Two tensor files, one per supported resolution/length preset.
  # NOTE(review): judging by the file names these are precomputed "zero" VAE
  # latents for 129 and 161 frames — confirm how the consumer picks one.
  zero_vae_path: ./weights/HuMo/zero_vae_129frame.pt
  zero_vae_720p_path: ./weights/HuMo/zero_vae_720p_161frame.pt
  checkpoint_dir: ./weights/HuMo/HuMo-17B
  compile: false
  init_with_meta_device: true
  gradient_checkpoint: true
  fsdp:
    # The leading underscore is part of the value; do not "fix" it.
    sharding_strategy: _HYBRID_SHARD_ZERO2
  # NOTE(review): `generation.sequence_parallel` in this file is 8 while
  # `sp_size` here is 1 — confirm which key the consumer actually reads.
  sp_size: 1
  dtype: bfloat16

# VAE checkpoint and runtime options.
vae:
  checkpoint: ./weights/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
  # NOTE(review): presumably (temporal, height, width) downsampling factors —
  # confirm the axis order against the consumer.
  vae_stride: [4, 8, 8]
  scaling_factor: 0.9152
  compile: false
  grouping: true
  use_sample: false
  dtype: bfloat16

# Text-encoder settings: T5 checkpoint, tokenizer directory, and sharding.
text:
  t5_checkpoint: ./weights/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
  t5_tokenizer: ./weights/Wan2.1-T2V-1.3B/google/umt5-xxl
  # NOTE(review): a dropout rate in an inference config — presumably unused
  # at generation time; confirm.
  dropout: 0.1
  dtype: bfloat16
  fsdp:
    enabled: true
    sharding_strategy: HYBRID_SHARD

# Diffusion settings: noise schedule, sampler, and timestep distributions
# for training and for sampling.
diffusion:
  schedule:
    type: lerp
    # Float, not int; NOTE(review): presumably the upper bound of the
    # timestep range — confirm against the schedule implementation.
    T: 1000.0
  sampler:
    type: euler
    prediction_type: v_lerp
  timesteps:
    # Timestep distribution used for training (logit-normal, loc/scale below).
    training:
      type: logitnormal
      loc: 0.0
      scale: 1.0
    # Timestep selection used when sampling: 50 steps with a shift of 5.0.
    sampling:
      type: uniform_trailing
      steps: 50
      shift: 5.0

# Paths to the audio-side models.
audio:
  # ONNX model file under the audio_separator directory.
  vocal_separator: ./weights/HuMo/audio_separator/Kim_Vocal_2.onnx
  # NOTE(review): the key says "wav2vec" but the path points at a
  # whisper-large-v3 directory; the key name looks historical — confirm with
  # the consumer before renaming anything.
  wav2vec_model: ./weights/whisper-large-v3

# Per-run generation settings: conditioning mode, output size, and guidance.
generation:
  # Accepted values listed by the original author: TA, TIA.
  mode: 'TIA'
  extract_audio_feat: true
  seed: 666666
  frames: 97
  fps: 25
  # Height/width are set as a pair: 480 x 832 (current) or 720 x 1280.
  height: 480  # alternatives: 720, 480
  width: 832  # alternatives: 1280, 832
  batch_size: 1
  # NOTE(review): `dit.sp_size` in this file is 1 while this is 8 — confirm
  # which of the two keys the consumer reads.
  sequence_parallel: 8
  output:
    dir: ./output
  # positive_prompt: ./examples/test_case.json
  # Negative prompt, in Chinese; it lists artifacts to avoid (e.g. overexposure,
  # blurry details, subtitles, poorly drawn hands, deformed limbs). It is a
  # runtime string — keep it byte-for-byte.
  sample_neg_prompt: '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
  # NOTE(review): presumably guidance scales for the audio (`scale_a`) and
  # text (`scale_t`) conditions — confirm against the sampler.
  scale_a: 5.5
  scale_t: 5.0
  # NOTE(review): presumably a timestep threshold (the schedule's T is 1000.0)
  # at which the guidance behaviour switches — confirm.
  step_change: 980