ZeyuXie
/

SemanticVocoder

semantic_vocoder

Model card · Files and versions

SemanticVocoder / config.json

ZeyuXie's picture

Upload model

e2da711 verified 6 days ago

history blame contribute delete

3.04 kB

{
  "model_type": "semantic_vocoder",
  "auto_map": {
    "AutoConfig": "model.SemanticVocoderConfig",
    "AutoModel": "model.SemanticVocoder"
  },
  "model_config": {
    "autoencoder": {
      "_target_": "models.autoencoder.waveform.semanticVocoder.semanticVocoder.SemanticVocoder",
      "encoder_name": "none",
      "n_timesteps": 200,
      "sample_rate": 24000,
      "clamp_pred": true,
      "downsampling_ratio": 960,
      "encoder_sampling_rate": 16000,
      "vocoder": {
        "_target_": "models.autoencoder.waveform.semanticVocoder.flow2gan.models.generator.MaeAudioGenerator",
        "latent_dim": 768,
        "hop_length": 960,
        "n_ffts": [
          512,
          256,
          128
        ],
        "hop_lengths": [
          320,
          160,
          80
        ],
        "channels": [
          768,
          512,
          384
        ],
        "time_embed_channels": 512,
        "hidden_factor": 3,
        "conv_kernel_sizes": [
          7,
          7,
          7
        ],
        "num_layers": [
          8,
          8,
          8
        ],
        "use_cond_encoder": true,
        "cond_enc_channels": 512,
        "cond_enc_hidden_factor": 3,
        "cond_enc_conv_kernel_size": 7,
        "cond_enc_num_layers": 4,
        "residual_scale": 1.0,
        "init_noise_scale": 0.1,
        "pred_x1": true,
        "branch_reduction": "mean",
        "spec_scaling_loss": true,
        "loss_n_filters": 256,
        "loss_n_fft": 1024,
        "loss_hop_length": 256,
        "loss_power": 0.5,
        "loss_eps": 1e-07,
        "loss_scale_min": 0.01,
        "loss_scale_max": 100.0,
        "branch_dropout": 0.05,
        "max_add_noise_scale": 0.0
      }
    },
    "backbone": {
      "_target_": "models.dit.mask_dit.UDiT",
      "img_size": 250,
      "patch_size": 1,
      "in_chans": 768,
      "out_chans": 768,
      "input_type": "1d",
      "embed_dim": 1024,
      "depth": 24,
      "num_heads": 16,
      "mlp_ratio": 4.0,
      "qkv_bias": false,
      "qk_scale": null,
      "qk_norm": "layernorm",
      "norm_layer": "layernorm",
      "act_layer": "geglu",
      "context_norm": true,
      "use_checkpoint": true,
      "time_fusion": "ada_sola_bias",
      "ada_sola_rank": 32,
      "ada_sola_alpha": 32,
      "cls_dim": null,
      "context_dim": 1024,
      "context_fusion": "cross",
      "context_max_length": null,
      "context_pe_method": "none",
      "pe_method": "none",
      "rope_mode": "shared",
      "use_conv": true,
      "skip": true,
      "skip_norm": true
    },
    "cfg_drop_ratio": 0.2,
    "sample_strategy": "uniform",
    "_target_": "models.flow_matching.SingleTaskCrossAttentionAudioFlowMatching",
    "content_encoder": {
      "_target_": "models.content_encoder.content_encoder.ContentEncoder",
      "embed_dim": 1024,
      "text_encoder": {
        "_target_": "models.content_encoder.text_encoder.T5TextEncoder",
        "model_name": "google/flan-t5-large",
        "embed_dim": 1024
      }
    }
  }
}