{ "architectures": [ "AudioVAE" ], "dec_kwargs": { "backbone": { "_attn_implementation": "flash_attention_2", "attention_dropout": 0.0, "attn_implementation": null, "bos_token_id": 151643, "eos_token_id": 151645, "hidden_act": "silu", "hidden_size": 896, "initializer_range": 0.02, "intermediate_size": 4864, "is_causal": true, "max_position_embeddings": 32768, "max_window_layers": 0, "model_type": "qwen2", "num_attention_heads": 14, "num_hidden_layers": 24, "num_key_value_heads": 2, "rms_norm_eps": 1e-06, "rope_theta": 1000000.0, "sliding_window": 32, "tie_word_embeddings": true, "torch_dtype": "bfloat16", "transformers_version": "4.43.1", "use_cache": false, "use_sliding_window": true, "vocab_size": 1 }, "latent_dim": 64, "output_dim": 320 }, "enc_kwargs": { "backbone": { "_attn_implementation": "flash_attention_2", "attention_dropout": 0.0, "attn_implementation": null, "bos_token_id": 151643, "eos_token_id": 151645, "hidden_act": "silu", "hidden_size": 896, "initializer_range": 0.02, "intermediate_size": 4864, "is_causal": true, "max_position_embeddings": 32768, "max_window_layers": 0, "model_type": "qwen2", "num_attention_heads": 14, "num_hidden_layers": 24, "num_key_value_heads": 2, "rms_norm_eps": 1e-06, "rope_theta": 1000000.0, "sliding_window": 32, "tie_word_embeddings": true, "torch_dtype": "bfloat16", "transformers_version": "4.43.1", "use_cache": false, "use_sliding_window": true, "vocab_size": 1 }, "hop_size": 320, "input_dim": 320, "latent_dim": 64 }, "hifi_gan_disc_kwargs": { "channel_increasing_factor": 4, "channels": 16, "max_downsample_channels": 512, "periods": [ 2, 3, 5, 7, 11 ] }, "init_method": "kaiming", "lambda_adv": 1.0, "lambda_disc": 1.0, "lambda_feat_match_loss": 1.0, "lambda_mel_loss": 1.0, "lambda_semantic": 2.0, "patch_size": -1, "semantic_module_kwargs": { "causal": true, "whisper_encoder": { "n_ctx": 1500, "n_head": 20, "n_layer": 32, "n_mels": 128, "n_state": 1280 } }, "spec_disc_kwargs": { "channels": 32, "downsample_scales": [ 2, 2, 2 ], "in_channels": 1, "kernel_sizes": [ 5, 3 ], "max_downsample_channels": 512, "out_channels": 1, "stft_params": { "fft_sizes": [ 78, 126, 206, 334, 542, 876, 1418, 2296 ], "hop_sizes": [ 39, 63, 103, 167, 271, 438, 709, 1148 ], "win_lengths": [ 78, 126, 206, 334, 542, 876, 1418, 2296 ], "window": "hann_window" }, "use_weight_norm": true }, "torch_dtype": "bfloat16", "transformers_version": "4.52.4" }