# Source page metadata (not part of the configuration):
#   uploaded by alexnasa, commit message "Upload 54 files", commit 295978e (verified).
# Top-level object descriptor: `path` and `name` together reference
# humo.generate / Generator.
# NOTE(review): presumably resolved by the config loader into the class to
# instantiate - confirm against the loader.
__object__:
  path: humo.generate
  name: Generator
# Settings for the `dit` component.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; confirm that `insert_audio` belongs to `model` and that the
# keys from `zero_vae_path` onward belong directly to `dit`.
dit:
  model:
    # Base model settings are pulled in from another YAML file; the keys
    # below sit alongside that reference.
    __inherit__: humo/configs/models/Wan_14B_I2V.yaml
    __object__:
      path: humo.models.wan_modules.model_humo
      name: WanModel
    insert_audio: True
  # Paths are relative (./weights/...); presumably resolved against the
  # working directory at launch - TODO confirm.
  zero_vae_path: ./weights/HuMo/zero_vae_129frame.pt
  zero_vae_720p_path: ./weights/HuMo/zero_vae_720p_161frame.pt
  checkpoint_dir: ./weights/HuMo/HuMo-17B
  compile: False
  init_with_meta_device: True
  gradient_checkpoint: True
  fsdp:
    sharding_strategy: _HYBRID_SHARD_ZERO2
  sp_size: 1
  dtype: bfloat16
# Settings for the `vae` component: checkpoint location plus numeric and
# boolean options.
vae:
  checkpoint: ./weights/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
  # Three-element stride; presumably (time, height, width) - TODO confirm.
  vae_stride: [ 4, 8, 8 ]
  scaling_factor: 0.9152
  compile: False
  grouping: True
  use_sample: False
  dtype: bfloat16
# Settings for the `text` component: T5 checkpoint and tokenizer locations,
# dropout, dtype, and its own FSDP options.
text:
  t5_checkpoint: ./weights/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
  t5_tokenizer: ./weights/Wan2.1-T2V-1.3B/google/umt5-xxl
  dropout: 0.1
  dtype: bfloat16
  fsdp:
    enabled: True
    sharding_strategy: HYBRID_SHARD
# Diffusion settings, grouped into schedule, sampler and timesteps.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; each `type` key is assumed to belong to the sub-section
# header immediately preceding it - confirm against the consumer.
diffusion:
  schedule:
    type: lerp
    T: 1000.0
  sampler:
    type: euler
    prediction_type: v_lerp
  timesteps:
    # Separate timestep settings for training and for sampling.
    training:
      type: logitnormal
      loc: 0.0
      scale: 1.0
    sampling:
      type: uniform_trailing
      steps: 50
      shift: 5.0
# Audio settings: locations of the vocal-separator ONNX file and the audio
# model weights.
audio:
  vocal_separator: ./weights/HuMo/audio_separator/Kim_Vocal_2.onnx
  # NOTE(review): the key is named `wav2vec_model` but the path points at
  # whisper-large-v3 weights; the key name is kept as-is because consumers
  # read it by this name.
  wav2vec_model: ./weights/whisper-large-v3
# Generation settings: mode, seed, clip size, output location, negative
# prompt and scale values.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; only `dir` is assumed to live under `output`, and the keys
# from `sample_neg_prompt` onward are assumed to belong directly to
# `generation` - confirm against the consumer.
generation:
  mode: "TIA"  # TA, TIA
  extract_audio_feat: True
  seed: 666666
  frames: 97
  fps: 25
  height: 480  # 720 480
  width: 832  # 1280 832
  batch_size: 1
  sequence_parallel: 8
  output:
    dir: ./output
  # positive_prompt: ./examples/test_case.json
  # Negative prompt text (Chinese); it is runtime data, so it is kept verbatim.
  sample_neg_prompt: '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
  # Scale values; presumably guidance scales for audio (a) and text (t) - TODO confirm.
  scale_a: 5.5
  scale_t: 5.0
  step_change: 980