{
  "model_type": "dolphin",
  "task": "audio_visual_speech_separation",
  "framework": "pytorch",
  "license": "apache-2.0",
  "tags": [
    "audio",
    "speech-separation",
    "audio-visual",
    "pytorch",
    "dolphin"
  ],
  "architectures": [
    "Dolphin"
  ],
  "auto_map": {
    "AutoModel": "dolphin.Dolphin"
  },
  "num_stages": 4,
  "sample_rate": 16000,
  "vpre_channels": 3872,
  "vmid_channels": 512,
  "vin_channels": 64,
  "vout_channels": 64,
  "module_audio_enc": {
    "in_channels": 1,
    "out_channels": 256,
    "kernel_size": 16,
    "stride": 4,
    "groups": 1,
    "bias": false
  },
  "module_feature_projector": {
    "num_channels": 256,
    "in_channels": 256,
    "out_channels": 128,
    "kernel_size": 1,
    "bias": false
  },
  "module_separator": {
    "num_stages": 4,
    "relative_positional_encoding": {
      "in_channels": 128,
      "num_heads": 8,
      "maxlen": 2000,
      "embed_v": false
    },
    "enc_stage": {
      "global_blocks": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      },
      "local_blocks": {
        "in_channels": 128,
        "kernel_size": 65,
        "dropout_rate": 0.05
      },
      "down_conv_layer": {
        "in_channels": 128,
        "samp_kernel_size": 5
      }
    },
    "simple_fusion": {
      "out_channels": 128
    },
    "dec_stage": {
      "global_blocks": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      },
      "local_blocks": {
        "in_channels": 128,
        "kernel_size": 65,
        "dropout_rate": 0.05
      },
      "spk_attention": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      }
    }
  },
  "module_output_layer": {
    "in_channels": 256,
    "out_channels": 128
  },
  "module_audio_dec": {
    "in_channels": 256,
    "out_channels": 1,
    "kernel_size": 16,
    "stride": 4,
    "bias": false
  },
  "video_encoder_params": {
    "layers": [
      "residual",
      "compress_space",
      "consecutive_residual",
      "compress_space",
      "consecutive_residual",
      "linear_attend_space",
      "compress_space",
      "consecutive_residual",
      "attend_space"
    ],
    "image_size": 88,
    "in_channel": 1,
    "init_channel": 4,
    "max_dim": 32,
    "input_conv_kernel_size": [
      7,
      7,
      7
    ],
    "output_conv_kernel_size": [
      3,
      3,
      3
    ],
    "residual_conv_kernel_size": 3,
    "pad_mode": "constant",
    "attn_dim_head": 32,
    "attn_heads": 8,
    "attn_dropout": 0.0,
    "flash_attn": true,
    "linear_attn_dim_head": 8,
    "linear_attn_heads": 16,
    "num_quantizers": 1,
    "codebook_size": 256,
    "codebook_dim": 64,
    "commitment_cost": 1.0,
    "distill_cost": 1.0
  }
}