xxx123456
/

SimWhisper_Codec

audio-compression

Model card Files Files and versions

SimWhisper_Codec / SimWhisperCodec.yaml

xxx123456's picture

Upload SimWhisperCodec.yaml

8aad9db verified about 1 month ago

history blame contribute delete

1.49 kB

	generator_params:
	input_sample_rate: 16000
	output_sample_rate: 16000
	mel_hop_length: 160
	encoder_downsample_rate: 1280
	decoder_upsample_rate: 1280

	feature_extractor:
	chunk_length: 30
	feature_size: 80
	sampling_rate: 16000
	hop_length: 160
	n_fft: 400
	n_samples: 480000
	nb_max_frames: 3000
	padding_side: "right"
	padding_value: 0.0
	return_attention_mask: false

	acoustic_encoder:
	num_mel_bins: 80
	sampling_rate: 16000
	hop_length: 160
	stride_size: 2
	kernel_size: 3
	d_model: 768
	scale_embedding: false
	max_audio_seconds: 30
	encoder_layers: 12
	encoder_attention_heads: 12
	encoder_ffn_dim: 3072
	is_acoustic: true
	freeze: true

	# 下采样
	downsample:
	in_dim: 768
	latent_dim: 32
	stack_factor: 4
	hidden_dim: 512

	# GroupFSQ量化器
	quantizer:
	num_groups: 8
	num_levels_per_group: [8, 7, 6, 6]
	eps: 0.001

	# 上采样
	upsample:
	latent_dim: 32
	out_dim: 768
	stack_factor: 4
	hidden_dim: 512

	acoustic_decoder:
	num_mel_bins: 80
	sampling_rate: 16000
	hop_length: 160
	stride_size: 2
	kernel_size: 3
	d_model: 768
	scale_embedding: false
	max_audio_seconds: 30
	decoder_layers: 12
	decoder_attention_heads: 12
	decoder_ffn_dim: 3072
	activation_function: "gelu"

	vocos:
	input_channels: 80
	dim: 512
	intermediate_dim: 4096
	num_layers: 24
	n_fft: 640
	hop_size: 160
	padding: "same"