JusperLee committed on
Commit
08cf5a5
·
verified ·
1 Parent(s): 0f50241

Add model configuration

Browse files
Files changed (1) hide show
  1. config.json +86 -86
config.json CHANGED
@@ -1,103 +1,94 @@
1
  {
2
- "architectures": [
3
- "Dolphin"
4
- ],
5
- "auto_map": {
6
- "AutoModel": "dolphin.Dolphin"
7
- },
8
  "framework": "pytorch",
9
  "license": "apache-2.0",
 
 
 
 
 
 
 
10
  "model_config": {
11
- "module_audio_dec": {
12
- "bias": false,
13
- "in_channels": 256,
14
- "kernel_size": 16,
15
- "out_channels": 1,
16
- "stride": 4
17
- },
18
  "module_audio_enc": {
19
- "bias": false,
20
- "groups": 1,
21
  "in_channels": 1,
22
- "kernel_size": 16,
23
  "out_channels": 256,
24
- "stride": 4
 
 
 
25
  },
26
  "module_feature_projector": {
27
- "bias": false,
28
- "in_channels": 256,
29
- "kernel_size": 1,
30
  "num_channels": 256,
31
- "out_channels": 128
32
- },
33
- "module_output_layer": {
34
  "in_channels": 256,
35
- "out_channels": 128
 
 
36
  },
37
  "module_separator": {
38
- "dec_stage": {
 
 
 
 
 
 
 
39
  "global_blocks": {
40
- "dropout_rate": 0.05,
41
  "in_channels": 128,
42
- "num_mha_heads": 8
 
43
  },
44
  "local_blocks": {
45
- "dropout_rate": 0.05,
46
  "in_channels": 128,
47
- "kernel_size": 65
 
48
  },
49
- "spk_attention": {
50
- "dropout_rate": 0.05,
51
- "in_channels": 128,
52
- "num_mha_heads": 8
53
- }
54
- },
55
- "enc_stage": {
56
  "down_conv_layer": {
57
  "in_channels": 128,
58
  "samp_kernel_size": 5
59
- },
 
 
 
 
 
60
  "global_blocks": {
61
- "dropout_rate": 0.05,
62
  "in_channels": 128,
63
- "num_mha_heads": 8
 
64
  },
65
  "local_blocks": {
66
- "dropout_rate": 0.05,
67
  "in_channels": 128,
68
- "kernel_size": 65
 
 
 
 
 
 
69
  }
70
- },
71
- "num_stages": 4,
72
- "relative_positional_encoding": {
73
- "embed_v": false,
74
- "in_channels": 128,
75
- "maxlen": 2000,
76
- "num_heads": 8
77
- },
78
- "simple_fusion": {
79
- "out_channels": 128
80
  }
81
  },
82
- "num_stages": 4,
83
- "sample_rate": 16000,
 
 
 
 
 
 
 
 
 
84
  "video_encoder_params": {
85
- "attn_dim_head": 32,
86
- "attn_dropout": 0.0,
87
- "attn_heads": 8,
88
- "codebook_dim": 64,
89
- "codebook_size": 256,
90
- "commitment_cost": 1.0,
91
- "distill_cost": 1.0,
92
- "flash_attn": true,
93
- "image_size": 88,
94
- "in_channel": 1,
95
- "init_channel": 4,
96
- "input_conv_kernel_size": [
97
- 7,
98
- 7,
99
- 7
100
- ],
101
  "layers": [
102
  "residual",
103
  "compress_space",
@@ -109,30 +100,39 @@
109
  "consecutive_residual",
110
  "attend_space"
111
  ],
112
- "linear_attn_dim_head": 8,
113
- "linear_attn_heads": 16,
 
114
  "max_dim": 32,
115
- "num_quantizers": 1,
 
 
 
 
116
  "output_conv_kernel_size": [
117
  3,
118
  3,
119
  3
120
  ],
 
121
  "pad_mode": "constant",
122
- "residual_conv_kernel_size": 3
123
- },
124
- "vin_channels": 64,
125
- "vmid_channels": 512,
126
- "vout_channels": 64,
127
- "vpre_channels": 3872
 
 
 
 
 
 
128
  },
129
- "model_type": "dolphin",
130
- "tags": [
131
- "audio",
132
- "speech-separation",
133
- "audio-visual",
134
- "pytorch",
135
- "dolphin"
136
  ],
137
- "task": "audio_visual_speech_separation"
 
 
138
  }
 
1
  {
2
+ "model_type": "dolphin",
3
+ "task": "audio_visual_speech_separation",
 
 
 
 
4
  "framework": "pytorch",
5
  "license": "apache-2.0",
6
+ "tags": [
7
+ "audio",
8
+ "speech-separation",
9
+ "audio-visual",
10
+ "pytorch",
11
+ "dolphin"
12
+ ],
13
  "model_config": {
14
+ "num_stages": 4,
15
+ "sample_rate": 16000,
16
+ "vpre_channels": 3872,
17
+ "vmid_channels": 512,
18
+ "vin_channels": 64,
19
+ "vout_channels": 64,
 
20
  "module_audio_enc": {
 
 
21
  "in_channels": 1,
 
22
  "out_channels": 256,
23
+ "kernel_size": 16,
24
+ "stride": 4,
25
+ "groups": 1,
26
+ "bias": false
27
  },
28
  "module_feature_projector": {
 
 
 
29
  "num_channels": 256,
 
 
 
30
  "in_channels": 256,
31
+ "out_channels": 128,
32
+ "kernel_size": 1,
33
+ "bias": false
34
  },
35
  "module_separator": {
36
+ "num_stages": 4,
37
+ "relative_positional_encoding": {
38
+ "in_channels": 128,
39
+ "num_heads": 8,
40
+ "maxlen": 2000,
41
+ "embed_v": false
42
+ },
43
+ "enc_stage": {
44
  "global_blocks": {
 
45
  "in_channels": 128,
46
+ "num_mha_heads": 8,
47
+ "dropout_rate": 0.05
48
  },
49
  "local_blocks": {
 
50
  "in_channels": 128,
51
+ "kernel_size": 65,
52
+ "dropout_rate": 0.05
53
  },
 
 
 
 
 
 
 
54
  "down_conv_layer": {
55
  "in_channels": 128,
56
  "samp_kernel_size": 5
57
+ }
58
+ },
59
+ "simple_fusion": {
60
+ "out_channels": 128
61
+ },
62
+ "dec_stage": {
63
  "global_blocks": {
 
64
  "in_channels": 128,
65
+ "num_mha_heads": 8,
66
+ "dropout_rate": 0.05
67
  },
68
  "local_blocks": {
 
69
  "in_channels": 128,
70
+ "kernel_size": 65,
71
+ "dropout_rate": 0.05
72
+ },
73
+ "spk_attention": {
74
+ "in_channels": 128,
75
+ "num_mha_heads": 8,
76
+ "dropout_rate": 0.05
77
  }
 
 
 
 
 
 
 
 
 
 
78
  }
79
  },
80
+ "module_output_layer": {
81
+ "in_channels": 256,
82
+ "out_channels": 128
83
+ },
84
+ "module_audio_dec": {
85
+ "in_channels": 256,
86
+ "out_channels": 1,
87
+ "kernel_size": 16,
88
+ "stride": 4,
89
+ "bias": false
90
+ },
91
  "video_encoder_params": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  "layers": [
93
  "residual",
94
  "compress_space",
 
100
  "consecutive_residual",
101
  "attend_space"
102
  ],
103
+ "image_size": 88,
104
+ "in_channel": 1,
105
+ "init_channel": 4,
106
  "max_dim": 32,
107
+ "input_conv_kernel_size": [
108
+ 7,
109
+ 7,
110
+ 7
111
+ ],
112
  "output_conv_kernel_size": [
113
  3,
114
  3,
115
  3
116
  ],
117
+ "residual_conv_kernel_size": 3,
118
  "pad_mode": "constant",
119
+ "attn_dim_head": 32,
120
+ "attn_heads": 8,
121
+ "attn_dropout": 0.0,
122
+ "flash_attn": true,
123
+ "linear_attn_dim_head": 8,
124
+ "linear_attn_heads": 16,
125
+ "num_quantizers": 1,
126
+ "codebook_size": 256,
127
+ "codebook_dim": 64,
128
+ "commitment_cost": 1.0,
129
+ "distill_cost": 1.0
130
+ }
131
  },
132
+ "architectures": [
133
+ "Dolphin"
 
 
 
 
 
134
  ],
135
+ "auto_map": {
136
+ "AutoModel": "dolphin.Dolphin"
137
+ }
138
  }