niobures
/

Dolphin

Model card Files and versions

Dolphin / model /config.json

niobures's picture

Dolphin (code, models, paper)

4305b2e verified 2 months ago

history blame contribute delete

2.81 kB

	{
	"model_type": "dolphin",
	"task": "audio_visual_speech_separation",
	"framework": "pytorch",
	"license": "apache-2.0",
	"tags": [
	"audio",
	"speech-separation",
	"audio-visual",
	"pytorch",
	"dolphin"
	],
	"architectures": [
	"Dolphin"
	],
	"auto_map": {
	"AutoModel": "dolphin.Dolphin"
	},
	"num_stages": 4,
	"sample_rate": 16000,
	"vpre_channels": 3872,
	"vmid_channels": 512,
	"vin_channels": 64,
	"vout_channels": 64,
	"module_audio_enc": {
	"in_channels": 1,
	"out_channels": 256,
	"kernel_size": 16,
	"stride": 4,
	"groups": 1,
	"bias": false
	},
	"module_feature_projector": {
	"num_channels": 256,
	"in_channels": 256,
	"out_channels": 128,
	"kernel_size": 1,
	"bias": false
	},
	"module_separator": {
	"num_stages": 4,
	"relative_positional_encoding": {
	"in_channels": 128,
	"num_heads": 8,
	"maxlen": 2000,
	"embed_v": false
	},
	"enc_stage": {
	"global_blocks": {
	"in_channels": 128,
	"num_mha_heads": 8,
	"dropout_rate": 0.05
	},
	"local_blocks": {
	"in_channels": 128,
	"kernel_size": 65,
	"dropout_rate": 0.05
	},
	"down_conv_layer": {
	"in_channels": 128,
	"samp_kernel_size": 5
	}
	},
	"simple_fusion": {
	"out_channels": 128
	},
	"dec_stage": {
	"global_blocks": {
	"in_channels": 128,
	"num_mha_heads": 8,
	"dropout_rate": 0.05
	},
	"local_blocks": {
	"in_channels": 128,
	"kernel_size": 65,
	"dropout_rate": 0.05
	},
	"spk_attention": {
	"in_channels": 128,
	"num_mha_heads": 8,
	"dropout_rate": 0.05
	}
	}
	},
	"module_output_layer": {
	"in_channels": 256,
	"out_channels": 128
	},
	"module_audio_dec": {
	"in_channels": 256,
	"out_channels": 1,
	"kernel_size": 16,
	"stride": 4,
	"bias": false
	},
	"video_encoder_params": {
	"layers": [
	"residual",
	"compress_space",
	"consecutive_residual",
	"compress_space",
	"consecutive_residual",
	"linear_attend_space",
	"compress_space",
	"consecutive_residual",
	"attend_space"
	],
	"image_size": 88,
	"in_channel": 1,
	"init_channel": 4,
	"max_dim": 32,
	"input_conv_kernel_size": [
	7,
	7,
	7
	],
	"output_conv_kernel_size": [
	3,
	3,
	3
	],
	"residual_conv_kernel_size": 3,
	"pad_mode": "constant",
	"attn_dim_head": 32,
	"attn_heads": 8,
	"attn_dropout": 0.0,
	"flash_attn": true,
	"linear_attn_dim_head": 8,
	"linear_attn_heads": 16,
	"num_quantizers": 1,
	"codebook_size": 256,
	"codebook_dim": 64,
	"commitment_cost": 1.0,
	"distill_cost": 1.0
	}
	}