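# NOTE(review): the header lines below look like page residue from a file
# viewer rather than configuration; they are kept as comments so that the
# document parses as YAML.
# HuMo_local
#
# Paused
#
# File size: 2,165 Bytes
#
# 295978e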

# Root object reference: names `Generator` under the path `humo.generate`.
# NOTE(review): presumably the config loader resolves `path` as a module and
# `name` as the class to build from this file — confirm against the loader.
__object__:
  path: humo.generate
  name: Generator

# `dit` section: model definition, weight locations and runtime options.
dit:
  model:
    # Pulls in another YAML file as the base for this mapping.
    # NOTE(review): presumably sibling keys here override inherited ones —
    # confirm the loader's merge semantics.
    __inherit__: humo/configs/models/Wan_14B_I2V.yaml
    # Object reference for the model: `WanModel` under
    # `humo.models.wan_modules.model_humo`.
    __object__:
      path: humo.models.wan_modules.model_humo
      name: WanModel
    insert_audio: True
  # Two `.pt` files; the file names suggest a 129-frame variant and a
  # 720p / 161-frame variant — TODO confirm what consumes each.
  zero_vae_path: ./weights/HuMo/zero_vae_129frame.pt
  zero_vae_720p_path: ./weights/HuMo/zero_vae_720p_161frame.pt
  # Directory (not a single file) holding the model checkpoint.
  checkpoint_dir: ./weights/HuMo/HuMo-17B
  compile: False
  init_with_meta_device: True
  gradient_checkpoint: True
  fsdp:
    # NOTE(review): the leading underscore looks deliberate — it matches the
    # torch FSDP enum member `ShardingStrategy._HYBRID_SHARD_ZERO2`; confirm
    # the loader maps this string onto that enum.
    sharding_strategy: _HYBRID_SHARD_ZERO2
  # NOTE(review): this is 1 while `generation.sequence_parallel` is 8 —
  # verify which of the two the runtime actually honours.
  sp_size: 1
  dtype: bfloat16

# `vae` section: checkpoint path and VAE options.
vae:
  # Note: the checkpoint lives under the Wan2.1-T2V-1.3B weights directory.
  checkpoint: ./weights/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
  # Three-element list. Presumably (time, height, width) stride factors —
  # TODO confirm axis order against the consumer.
  vae_stride: [ 4, 8, 8 ]
  scaling_factor: 0.9152
  compile: False
  grouping: True
  use_sample: False
  dtype: bfloat16

# `text` section: T5 checkpoint/tokenizer locations and related options.
text:
  # Both paths point into the Wan2.1-T2V-1.3B weights directory.
  t5_checkpoint: ./weights/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
  t5_tokenizer: ./weights/Wan2.1-T2V-1.3B/google/umt5-xxl
  dropout: 0.1
  dtype: bfloat16
  # Unlike `dit.fsdp`, this mapping carries an explicit `enabled` flag and
  # uses a different sharding strategy name.
  fsdp:
    enabled: True
    sharding_strategy: HYBRID_SHARD

# `diffusion` section: schedule, sampler, and timestep settings.
diffusion:
  schedule:
    type: lerp
    # Written as a float (1000.0), not an integer.
    T: 1000.0
  sampler:
    type: euler
    prediction_type: v_lerp
  # Separate timestep settings for training and for sampling.
  timesteps:
    training:
      type: logitnormal
      loc: 0.0
      scale: 1.0
    sampling:
      type: uniform_trailing
      # Presumably the number of sampling steps — TODO confirm.
      steps: 50
      shift: 5.0

# `audio` section: paths to two audio-related models.
audio:
  # ONNX model file under the HuMo audio_separator directory.
  vocal_separator: ./weights/HuMo/audio_separator/Kim_Vocal_2.onnx
  # NOTE(review): the key says `wav2vec` but the path points at a
  # `whisper-large-v3` directory — confirm the key name is just historical.
  wav2vec_model: ./weights/whisper-large-v3

# `generation` section: per-run settings (mode, seed, video size, output
# location, negative prompt, and scale values).
generation:
  # The inline comment lists the two values seen for this key: TA and TIA.
  mode: "TIA"  # TA, TIA
  extract_audio_feat: True
  seed: 666666
  frames: 97
  fps: 25
  # The inline comments on height/width list the alternative values;
  # presumably they pair up as 1280x720 and 832x480 — TODO confirm.
  height: 480 # 720 480
  width: 832 # 1280 832
  batch_size: 1
  # NOTE(review): this is 8 while `dit.sp_size` is 1 — verify which of the
  # two the runtime actually honours.
  sequence_parallel: 8
  output:
    dir: ./output
  # positive_prompt: ./examples/test_case.json
  # Negative prompt, kept in Chinese as runtime text. Rough English gloss:
  # garish colours, overexposed, static, blurry details, subtitles, style,
  # artwork, painting, frame, still, overall grey, worst quality, low quality,
  # JPEG compression artifacts, ugly, incomplete, extra fingers, poorly drawn
  # hands, poorly drawn faces, deformed, disfigured, malformed limbs, fused
  # fingers, motionless frame, cluttered background, three legs, many people
  # in the background, walking backwards.
  sample_neg_prompt: '色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走'
  # Presumably guidance scales for audio (`scale_a`) and text (`scale_t`) —
  # TODO confirm against the generator code.
  scale_a: 5.5
  scale_t: 5.0
  # NOTE(review): with `diffusion.schedule.T` at 1000.0 this looks like a
  # timestep threshold — confirm its meaning and units.
  step_change: 980