File size: 10,951 Bytes
7417a6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
Model:
TextConditionedSeparator(
  (clap): ClapModel(
    (text_model): ClapTextModel(
      (embeddings): ClapTextEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): ClapTextEncoder(
        (layer): ModuleList(
          (0-11): 12 x ClapTextLayer(
            (attention): ClapTextAttention(
              (self): ClapTextSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): ClapTextSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): ClapTextIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): ClapTextOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): ClapTextPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
    (text_projection): ClapProjectionLayer(
      (linear1): Linear(in_features=768, out_features=512, bias=True)
      (activation): ReLU()
      (linear2): Linear(in_features=512, out_features=512, bias=True)
    )
    (audio_model): ClapAudioModel(
      (audio_encoder): ClapAudioEncoder(
        (patch_embed): ClapAudioPatchEmbed(
          (proj): Conv2d(1, 96, kernel_size=(4, 4), stride=(4, 4))
          (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
        )
        (layers): ModuleList(
          (0): ClapAudioStage(
            (blocks): ModuleList(
              (0-1): 2 x ClapAudioLayer(
                (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
                (attention): ClapAudioAttention(
                  (self): ClapAudioSelfAttention(
                    (query): Linear(in_features=96, out_features=96, bias=True)
                    (key): Linear(in_features=96, out_features=96, bias=True)
                    (value): Linear(in_features=96, out_features=96, bias=True)
                    (dropout): Dropout(p=0.0, inplace=False)
                  )
                  (output): ClapAudioSelfOutput(
                    (dense): Linear(in_features=96, out_features=96, bias=True)
                    (dropout): Dropout(p=0.0, inplace=False)
                  )
                )
                (drop_path): Identity()
                (layernorm_after): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
                (intermediate): ClapAudioIntermediate(
                  (dense): Linear(in_features=96, out_features=384, bias=True)
                  (intermediate_act_fn): GELUActivation()
                )
                (output): ClapAudioOutput(
                  (dense): Linear(in_features=384, out_features=96, bias=True)
                  (dropout): Dropout(p=0.1, inplace=False)
                )
              )
            )
            (downsample): ClapAudioPatchMerging(
              (reduction): Linear(in_features=384, out_features=192, bias=False)
              (norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
            )
          )
          (1): ClapAudioStage(
            (blocks): ModuleList(
              (0-1): 2 x ClapAudioLayer(
                (layernorm_before): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                (attention): ClapAudioAttention(
                  (self): ClapAudioSelfAttention(
                    (query): Linear(in_features=192, out_features=192, bias=True)
                    (key): Linear(in_features=192, out_features=192, bias=True)
                    (value): Linear(in_features=192, out_features=192, bias=True)
                    (dropout): Dropout(p=0.0, inplace=False)
                  )
                  (output): ClapAudioSelfOutput(
                    (dense): Linear(in_features=192, out_features=192, bias=True)
                    (dropout): Dropout(p=0.0, inplace=False)
                  )
                )
                (drop_path): Identity()
                (layernorm_after): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
                (intermediate): ClapAudioIntermediate(
                  (dense): Linear(in_features=192, out_features=768, bias=True)
                  (intermediate_act_fn): GELUActivation()
                )
                (output): ClapAudioOutput(
                  (dense): Linear(in_features=768, out_features=192, bias=True)
                  (dropout): Dropout(p=0.1, inplace=False)
                )
              )
            )
            (downsample): ClapAudioPatchMerging(
              (reduction): Linear(in_features=768, out_features=384, bias=False)
              (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            )
          )
          (2): ClapAudioStage(
            (blocks): ModuleList(
              (0-5): 6 x ClapAudioLayer(
                (layernorm_before): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
                (attention): ClapAudioAttention(
                  (self): ClapAudioSelfAttention(
                    (query): Linear(in_features=384, out_features=384, bias=True)
                    (key): Linear(in_features=384, out_features=384, bias=True)
                    (value): Linear(in_features=384, out_features=384, bias=True)
                    (dropout): Dropout(p=0.0, inplace=False)
                  )
                  (output): ClapAudioSelfOutput(
                    (dense): Linear(in_features=384, out_features=384, bias=True)
                    (dropout): Dropout(p=0.0, inplace=False)
                  )
                )
                (drop_path): Identity()
                (layernorm_after): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
                (intermediate): ClapAudioIntermediate(
                  (dense): Linear(in_features=384, out_features=1536, bias=True)
                  (intermediate_act_fn): GELUActivation()
                )
                (output): ClapAudioOutput(
                  (dense): Linear(in_features=1536, out_features=384, bias=True)
                  (dropout): Dropout(p=0.1, inplace=False)
                )
              )
            )
            (downsample): ClapAudioPatchMerging(
              (reduction): Linear(in_features=1536, out_features=768, bias=False)
              (norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
            )
          )
          (3): ClapAudioStage(
            (blocks): ModuleList(
              (0-1): 2 x ClapAudioLayer(
                (layernorm_before): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (attention): ClapAudioAttention(
                  (self): ClapAudioSelfAttention(
                    (query): Linear(in_features=768, out_features=768, bias=True)
                    (key): Linear(in_features=768, out_features=768, bias=True)
                    (value): Linear(in_features=768, out_features=768, bias=True)
                    (dropout): Dropout(p=0.0, inplace=False)
                  )
                  (output): ClapAudioSelfOutput(
                    (dense): Linear(in_features=768, out_features=768, bias=True)
                    (dropout): Dropout(p=0.0, inplace=False)
                  )
                )
                (drop_path): Identity()
                (layernorm_after): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (intermediate): ClapAudioIntermediate(
                  (dense): Linear(in_features=768, out_features=3072, bias=True)
                  (intermediate_act_fn): GELUActivation()
                )
                (output): ClapAudioOutput(
                  (dense): Linear(in_features=3072, out_features=768, bias=True)
                  (dropout): Dropout(p=0.1, inplace=False)
                )
              )
            )
          )
        )
        (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (avgpool): AdaptiveAvgPool1d(output_size=1)
      )
    )
    (audio_projection): ClapProjectionLayer(
      (linear1): Linear(in_features=768, out_features=512, bias=True)
      (activation): ReLU()
      (linear2): Linear(in_features=512, out_features=512, bias=True)
    )
  )
  (z_encoder): PatchConv1d(
    (conv): Conv1d(1, 256, kernel_size=(16,), stride=(8,))
  )
  (text_proj): Linear(in_features=512, out_features=256, bias=True)
  (z_proj): Linear(in_features=256, out_features=256, bias=True)
  (cross): CrossAttention(
    (attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (ff): MLP(
      (fc1): Linear(in_features=256, out_features=1024, bias=True)
      (fc2): Linear(in_features=1024, out_features=256, bias=True)
      (act): GELU(approximate='none')
    )
    (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (spec_decoder): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): GELU(approximate='none')
    (2): Linear(in_features=256, out_features=2049, bias=True)
  )
)
output waveform shape: torch.Size([2, 1, 48000])
output spectrogram shape: torch.Size([2, 12001, 2049])