{
  "architectures": [
    "AudioVAE"
  ],
  "dec_kwargs": {
    "backbone": {
      "_attn_implementation": "flash_attention_2",
      "attention_dropout": 0.0,
      "attn_implementation": null,
      "bos_token_id": 151643,
      "eos_token_id": 151645,
      "hidden_act": "silu",
      "hidden_size": 896,
      "initializer_range": 0.02,
      "intermediate_size": 4864,
      "is_causal": true,
      "max_position_embeddings": 32768,
      "max_window_layers": 0,
      "model_type": "qwen2",
      "num_attention_heads": 14,
      "num_hidden_layers": 24,
      "num_key_value_heads": 2,
      "rms_norm_eps": 1e-06,
      "rope_theta": 1000000.0,
      "sliding_window": 32,
      "tie_word_embeddings": true,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.43.1",
      "use_cache": false,
      "use_sliding_window": true,
      "vocab_size": 1
    },
    "latent_dim": 64,
    "output_dim": 320
  },
  "enc_kwargs": {
    "backbone": {
      "_attn_implementation": "flash_attention_2",
      "attention_dropout": 0.0,
      "attn_implementation": null,
      "bos_token_id": 151643,
      "eos_token_id": 151645,
      "hidden_act": "silu",
      "hidden_size": 896,
      "initializer_range": 0.02,
      "intermediate_size": 4864,
      "is_causal": true,
      "max_position_embeddings": 32768,
      "max_window_layers": 0,
      "model_type": "qwen2",
      "num_attention_heads": 14,
      "num_hidden_layers": 24,
      "num_key_value_heads": 2,
      "rms_norm_eps": 1e-06,
      "rope_theta": 1000000.0,
      "sliding_window": 32,
      "tie_word_embeddings": true,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.43.1",
      "use_cache": false,
      "use_sliding_window": true,
      "vocab_size": 1
    },
    "hop_size": 320,
    "input_dim": 320,
    "latent_dim": 64
  },
  "hifi_gan_disc_kwargs": {
    "channel_increasing_factor": 4,
    "channels": 16,
    "max_downsample_channels": 512,
    "periods": [
      2,
      3,
      5,
      7,
      11
    ]
  },
  "init_method": "kaiming",
  "lambda_adv": 1.0,
  "lambda_disc": 1.0,
  "lambda_feat_match_loss": 1.0,
  "lambda_mel_loss": 1.0,
  "lambda_semantic": 2.0,
  "patch_size": -1,
  "semantic_module_kwargs": {
    "causal": true,
    "whisper_encoder": {
      "n_ctx": 1500,
      "n_head": 20,
      "n_layer": 32,
      "n_mels": 128,
      "n_state": 1280
    }
  },
  "spec_disc_kwargs": {
    "channels": 32,
    "downsample_scales": [
      2,
      2,
      2
    ],
    "in_channels": 1,
    "kernel_sizes": [
      5,
      3
    ],
    "max_downsample_channels": 512,
    "out_channels": 1,
    "stft_params": {
      "fft_sizes": [
        78,
        126,
        206,
        334,
        542,
        876,
        1418,
        2296
      ],
      "hop_sizes": [
        39,
        63,
        103,
        167,
        271,
        438,
        709,
        1148
      ],
      "win_lengths": [
        78,
        126,
        206,
        334,
        542,
        876,
        1418,
        2296
      ],
      "window": "hann_window"
    },
    "use_weight_norm": true
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.52.4"
}