{
  "transformer": {
    "_class_name": "AVTransformer3DModel",
    "activation_fn": "gelu-approximate",
    "attention_bias": true,
    "attention_head_dim": 128,
    "attention_type": "default",
    "caption_channels": 3840,
    "cross_attention_dim": 4096,
    "double_self_attention": false,
    "dropout": 0.0,
    "in_channels": 128,
    "norm_elementwise_affine": false,
    "norm_eps": 1e-06,
    "norm_num_groups": 32,
    "num_attention_heads": 32,
    "num_embeds_ada_norm": 1000,
    "num_layers": 48,
    "num_vector_embeds": null,
    "only_cross_attention": false,
    "cross_attention_norm": true,
    "out_channels": 128,
    "upcast_attention": false,
    "use_linear_projection": false,
    "qk_norm": "rms_norm",
    "standardization_norm": "rms_norm",
    "positional_embedding_type": "rope",
    "positional_embedding_theta": 10000.0,
    "positional_embedding_max_pos": [
      20,
      2048,
      2048
    ],
    "timestep_scale_multiplier": 1000,
    "av_ca_timestep_scale_multiplier": 1000.0,
    "causal_temporal_positioning": true,
    "audio_num_attention_heads": 32,
    "audio_attention_head_dim": 64,
    "use_audio_video_cross_attention": true,
    "share_ff": false,
    "audio_out_channels": 128,
    "audio_cross_attention_dim": 2048,
    "audio_positional_embedding_max_pos": [
      20
    ],
    "av_cross_ada_norm": true,
    "use_embeddings_connector": true,
    "connector_attention_head_dim": 128,
    "connector_num_attention_heads": 32,
    "connector_num_layers": 8,
    "connector_positional_embedding_max_pos": [
      4096
    ],
    "connector_num_learnable_registers": 128,
    "connector_norm_output": true,
    "use_middle_indices_grid": true,
    "apply_gated_attention": true,
    "connector_apply_gated_attention": true,
    "caption_projection_first_linear": false,
    "caption_projection_second_linear": false,
    "caption_proj_input_norm": false,
    "connector_learnable_registers_std": 1,
    "caption_proj_before_connector": true,
    "audio_connector_attention_head_dim": 64,
    "audio_connector_num_attention_heads": 32,
    "cross_attention_adaln": true,
    "text_encoder_norm_type": "per_token_rms",
    "rope_type": "split",
    "frequencies_precision": "float64"
  },
  "vae": {
    "_class_name": "CausalVideoAutoencoder",
    "dims": 3,
    "in_channels": 3,
    "out_channels": 3,
    "latent_channels": 128,
    "encoder_blocks": [
      [
        "res_x",
        {
          "num_layers": 4
        }
      ],
      [
        "compress_space_res",
        {
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 6
        }
      ],
      [
        "compress_time_res",
        {
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 4
        }
      ],
      [
        "compress_all_res",
        {
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 2
        }
      ],
      [
        "compress_all_res",
        {
          "multiplier": 1
        }
      ],
      [
        "res_x",
        {
          "num_layers": 2
        }
      ]
    ],
    "decoder_blocks": [
      [
        "res_x",
        {
          "num_layers": 4
        }
      ],
      [
        "compress_space",
        {
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 6
        }
      ],
      [
        "compress_time",
        {
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 4
        }
      ],
      [
        "compress_all",
        {
          "multiplier": 1
        }
      ],
      [
        "res_x",
        {
          "num_layers": 2
        }
      ],
      [
        "compress_all",
        {
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 2
        }
      ]
    ],
    "scaling_factor": 1.0,
    "norm_layer": "pixel_norm",
    "patch_size": 4,
    "latent_log_var": "uniform",
    "use_quant_conv": false,
    "causal_decoder": false,
    "timestep_conditioning": false,
    "normalize_latent_channels": false,
    "encoder_base_channels": 128,
    "decoder_base_channels": 128,
    "spatial_padding_mode": "zeros"
  },
  "scheduler": {
    "_class_name": "RectifiedFlowScheduler",
    "_diffusers_version": "0.25.1",
    "num_train_timesteps": 1000,
    "shifting": null,
    "base_resolution": null,
    "sampler": "LinearQuadratic"
  },
  "audio_vae": {
    "model": {
      "params": {
        "ddconfig": {
          "double_z": true,
          "mel_bins": 64,
          "z_channels": 8,
          "resolution": 256,
          "downsample_time": false,
          "in_channels": 2,
          "out_ch": 2,
          "ch": 128,
          "ch_mult": [
            1,
            2,
            4
          ],
          "num_res_blocks": 2,
          "attn_resolutions": [],
          "dropout": 0.0,
          "mid_block_add_attention": false,
          "norm_type": "pixel",
          "causality_axis": "height"
        },
        "sampling_rate": 16000
      }
    },
    "preprocessing": {
      "audio": {
        "sampling_rate": 16000,
        "max_wav_value": 32768.0,
        "duration": 5.12,
        "stereo": true,
        "causal_padding": 3
      },
      "stft": {
        "filter_length": 1024,
        "hop_length": 160,
        "win_length": 1024,
        "causal": true
      },
      "mel": {
        "n_mel_channels": 64,
        "mel_fmin": 0,
        "mel_fmax": 8000
      }
    }
  },
  "vocoder": {
    "vocoder": {
      "upsample_initial_channel": 1536,
      "resblock": "AMP1",
      "upsample_rates": [
        5,
        2,
        2,
        2,
        2,
        2
      ],
      "resblock_kernel_sizes": [
        3,
        7,
        11
      ],
      "upsample_kernel_sizes": [
        11,
        4,
        4,
        4,
        4,
        4
      ],
      "resblock_dilation_sizes": [
        [
          1,
          3,
          5
        ],
        [
          1,
          3,
          5
        ],
        [
          1,
          3,
          5
        ]
      ],
      "stereo": true,
      "use_tanh_at_final": false,
      "activation": "snakebeta",
      "use_bias_at_final": false
    },
    "bwe": {
      "upsample_initial_channel": 512,
      "resblock": "AMP1",
      "upsample_rates": [
        6,
        5,
        2,
        2,
        2
      ],
      "resblock_kernel_sizes": [
        3,
        7,
        11
      ],
      "upsample_kernel_sizes": [
        12,
        11,
        4,
        4,
        4
      ],
      "resblock_dilation_sizes": [
        [
          1,
          3,
          5
        ],
        [
          1,
          3,
          5
        ],
        [
          1,
          3,
          5
        ]
      ],
      "stereo": true,
      "use_tanh_at_final": false,
      "activation": "snakebeta",
      "use_bias_at_final": false,
      "apply_final_activation": false,
      "input_sampling_rate": 16000,
      "output_sampling_rate": 48000,
      "hop_length": 80,
      "n_fft": 512,
      "win_size": 512,
      "num_mels": 64
    }
  }
}