File size: 2,805 Bytes
4305b2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
{
  "model_type": "dolphin",
  "task": "audio_visual_speech_separation",
  "framework": "pytorch",
  "license": "apache-2.0",
  "tags": [
    "audio",
    "speech-separation",
    "audio-visual",
    "pytorch",
    "dolphin"
  ],
  "architectures": [
    "Dolphin"
  ],
  "auto_map": {
    "AutoModel": "dolphin.Dolphin"
  },
  "num_stages": 4,
  "sample_rate": 16000,
  "vpre_channels": 3872,
  "vmid_channels": 512,
  "vin_channels": 64,
  "vout_channels": 64,
  "module_audio_enc": {
    "in_channels": 1,
    "out_channels": 256,
    "kernel_size": 16,
    "stride": 4,
    "groups": 1,
    "bias": false
  },
  "module_feature_projector": {
    "num_channels": 256,
    "in_channels": 256,
    "out_channels": 128,
    "kernel_size": 1,
    "bias": false
  },
  "module_separator": {
    "num_stages": 4,
    "relative_positional_encoding": {
      "in_channels": 128,
      "num_heads": 8,
      "maxlen": 2000,
      "embed_v": false
    },
    "enc_stage": {
      "global_blocks": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      },
      "local_blocks": {
        "in_channels": 128,
        "kernel_size": 65,
        "dropout_rate": 0.05
      },
      "down_conv_layer": {
        "in_channels": 128,
        "samp_kernel_size": 5
      }
    },
    "simple_fusion": {
      "out_channels": 128
    },
    "dec_stage": {
      "global_blocks": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      },
      "local_blocks": {
        "in_channels": 128,
        "kernel_size": 65,
        "dropout_rate": 0.05
      },
      "spk_attention": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      }
    }
  },
  "module_output_layer": {
    "in_channels": 256,
    "out_channels": 128
  },
  "module_audio_dec": {
    "in_channels": 256,
    "out_channels": 1,
    "kernel_size": 16,
    "stride": 4,
    "bias": false
  },
  "video_encoder_params": {
    "layers": [
      "residual",
      "compress_space",
      "consecutive_residual",
      "compress_space",
      "consecutive_residual",
      "linear_attend_space",
      "compress_space",
      "consecutive_residual",
      "attend_space"
    ],
    "image_size": 88,
    "in_channel": 1,
    "init_channel": 4,
    "max_dim": 32,
    "input_conv_kernel_size": [
      7,
      7,
      7
    ],
    "output_conv_kernel_size": [
      3,
      3,
      3
    ],
    "residual_conv_kernel_size": 3,
    "pad_mode": "constant",
    "attn_dim_head": 32,
    "attn_heads": 8,
    "attn_dropout": 0.0,
    "flash_attn": true,
    "linear_attn_dim_head": 8,
    "linear_attn_heads": 16,
    "num_quantizers": 1,
    "codebook_size": 256,
    "codebook_dim": 64,
    "commitment_cost": 1.0,
    "distill_cost": 1.0
  }
}