---
# Inference configuration.
#
# NOTE(review): the nesting in this file was reconstructed from key order and
# from key names that repeat (source_image, driving_audio, model_path) and
# therefore cannot all live at the top level. Confirm the hierarchy against
# the code that loads this file.

# Default input assets (file paths).
source_image: ./default.png
driving_audio: default.wav

weight_dtype: fp16

# Data settings, grouped per input/output kind.
data:
  n_motion_frames: 2
  n_sample_frames: 16
  source_image:
    width: 512
    height: 512
  driving_audio:
    sample_rate: 16000
  export_video:
    fps: 25

inference_steps: 40
cfg_scale: 3.5

# Locations of pretrained weights, all under ./pretrained_models.
audio_ckpt_dir: ./pretrained_models/hallo

base_model_path: ./pretrained_models/stable-diffusion-v1-5

motion_module_path: ./pretrained_models/motion_module/mm_sd_v15_v2.ckpt

face_analysis:
  model_path: ./pretrained_models/face_analysis

wav2vec:
  model_path: ./pretrained_models/wav2vec/wav2vec2-base-960h
  features: all

audio_separator:
  model_path: ./pretrained_models/audio_separator/Kim_Vocal_2.onnx

vae:
  model_path: ./pretrained_models/sd-vae-ft-mse

# Directory used for saved/cached files.
save_path: ./.cache

face_expand_ratio: 1.1
pose_weight: 1.1
face_weight: 1.1
lip_weight: 1.1

unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  use_audio_module: true
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  # NOTE(review): the attention/temporal keys are assumed to belong to
  # motion_module_kwargs, and audio_attention_dim / stack_enable_* to
  # unet_additional_kwargs -- verify against the UNet constructor.
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  audio_attention_dim: 768
  stack_enable_blocks_name:
    - "up"
    - "down"
    - "mid"
  stack_enable_blocks_depth: [0, 1, 2, 3]

enable_zero_snr: true

# NOTE(review): presumably forwarded as keyword arguments to the noise
# scheduler selected by `sampler` below -- confirm in the loading code.
noise_scheduler_kwargs:
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  clip_sample: false
  steps_offset: 1
  # The remaining keys look tied to enable_zero_snr above -- TODO confirm.
  prediction_type: "v_prediction"
  rescale_betas_zero_snr: true
  timestep_spacing: "trailing"

sampler: DDIM