| model_config: |
| model_name: HunyuanVideo-Foley-XXL |
| model_type: 1d |
| model_precision: bf16 |
| model_kwargs: |
| depth_triple_blocks: 18 |
| depth_single_blocks: 36 |
| hidden_size: 1536 |
| num_heads: 12 |
| mlp_ratio: 4 |
| mlp_act_type: "gelu_tanh" |
| qkv_bias: True |
| qk_norm: True |
| qk_norm_type: "rms" |
| attn_mode: "torch" |
| embedder_type: "default" |
| interleaved_audio_visual_rope: True |
| enable_learnable_empty_visual_feat: True |
| sync_modulation: False |
| add_sync_feat_to_audio: True |
| cross_attention: True |
| use_attention_mask: False |
| condition_projection: "linear" |
| sync_feat_dim: 768 |
| condition_dim: 768 |
| clip_dim: 768 |
| audio_vae_latent_dim: 128 |
| audio_frame_rate: 50 |
| patch_size: 1 |
| rope_dim_list: null |
| rope_theta: 10000 |
| text_length: 77 |
| clip_length: 64 |
| sync_length: 192 |
| use_mmaudio_singleblock: True |
| depth_triple_ssl_encoder: null |
| depth_single_ssl_encoder: 8 |
| use_repa_with_audiossl: True |
|
|
| diffusion_config: |
| denoise_type: "flow" |
| flow_path_type: "linear" |
| flow_predict_type: "velocity" |
| flow_reverse: True |
| flow_solver: "euler" |
| sample_flow_shift: 1.0 |
| sample_use_flux_shift: False |
| flux_base_shift: 0.5 |
| flux_max_shift: 1.15 |
|
|