# HuMo-17B generation configuration (scraped page chrome removed; original file size: 2,165 bytes)
# Generator entry point: instantiated object driving inference.
__object__:
  path: humo.generate
  name: Generator

# Diffusion transformer (DiT) backbone and its distributed-training settings.
dit:
  model:
    # Inherit base Wan 14B I2V architecture, override with the HuMo variant.
    __inherit__: humo/configs/models/Wan_14B_I2V.yaml
    __object__:
      path: humo.models.wan_modules.model_humo
      name: WanModel
    insert_audio: true
    zero_vae_path: ./weights/HuMo/zero_vae_129frame.pt
    zero_vae_720p_path: ./weights/HuMo/zero_vae_720p_161frame.pt
  checkpoint_dir: ./weights/HuMo/HuMo-17B
  compile: false
  init_with_meta_device: true
  gradient_checkpoint: true
  fsdp:
    sharding_strategy: _HYBRID_SHARD_ZERO2
  sp_size: 1
  dtype: bfloat16

# Video VAE used for latent encoding/decoding.
vae:
  checkpoint: ./weights/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
  vae_stride: [4, 8, 8]
  scaling_factor: 0.9152
  compile: false
  grouping: true
  use_sample: false
  dtype: bfloat16

# Text encoder (umT5-XXL) settings.
text:
  t5_checkpoint: ./weights/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
  t5_tokenizer: ./weights/Wan2.1-T2V-1.3B/google/umt5-xxl
  dropout: 0.1
  dtype: bfloat16
  fsdp:
    enabled: true
    sharding_strategy: HYBRID_SHARD

# Diffusion schedule, sampler, and timestep distributions.
diffusion:
  schedule:
    type: lerp
    T: 1000.0
  sampler:
    type: euler
    prediction_type: v_lerp
  timesteps:
    training:
      type: logitnormal
      loc: 0.0
      scale: 1.0
    sampling:
      type: uniform_trailing
      steps: 50
      shift: 5.0

# Audio preprocessing models (vocal separation + feature extraction).
audio:
  vocal_separator: ./weights/HuMo/audio_separator/Kim_Vocal_2.onnx
  wav2vec_model: ./weights/whisper-large-v3

# Inference-time generation settings.
generation:
  mode: "TIA"  # TA, TIA
  extract_audio_feat: true
  seed: 666666
  frames: 97
  fps: 25
  height: 480  # 720 480
  width: 832  # 1280 832
  batch_size: 1
  sequence_parallel: 8
  output:
    dir: ./output
  # positive_prompt: ./examples/test_case.json
  sample_neg_prompt: '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
  scale_a: 5.5
  scale_t: 5.0
  step_change: 980