TalTechNLP
/

voxlingua107-xls-r-300m-wav2vec

@@ -71,10 +71,10 @@ class EncoderWav2vecClassifier(Pretrained):
         wavs = wavs.float()
         # Feature extraction and normalization
-        feats = self.modules.wav2vec2(wavs)
         feats = feats.transpose(1, 2)
-        pooling = self.modules.attentive(feats, wav_lens) # channels = 1024
         outputs = pooling.transpose(1, 2)
         return outputs
@@ -105,7 +105,7 @@ class EncoderWav2vecClassifier(Pretrained):
             (label encoder should be provided).
         """
         outputs = self.encode_batch(wavs, wav_lens)
-        outputs = self.modules.classifier(outputs)
         out_prob = self.hparams.softmax(outputs)
         score, index = torch.max(out_prob, dim=-1)
         text_lab = self.hparams.label_encoder.decode_torch(index)
@@ -136,24 +136,21 @@ class EncoderWav2vecClassifier(Pretrained):
             (label encoder should be provided).
         """
         waveform = self.load_audio(path)
         # Fake a batch:
         batch = waveform.unsqueeze(0)
         rel_length = torch.tensor([1.0])
         outputs = self.encode_batch(batch, rel_length)
-        outputs = self.modules.classifier(outputs)
-        # print("classify_outputs_0", outputs.shape)
         out_prob = self.hparams.softmax(outputs)
-        # print("classify_out_1_softmax", out_prob)
         score, index = torch.max(out_prob, dim=-1)
         text_lab = self.hparams.label_encoder.decode_torch(index)
-        # print("classify_score_2", score)
-        # print("classify_index_3", index)
-        # print("classify_textlab_4", text_lab)
         return out_prob, score, index, text_lab
     def forward(self, wavs, wav_lens=None, normalize=False):
         return self.encode_batch(
             wavs=wavs, wav_lens=wav_lens, normalize=normalize
-        )

         wavs = wavs.float()
         # Feature extraction and normalization
+        feats = self.mods.wav2vec2(wavs)
         feats = feats.transpose(1, 2)
+        pooling = self.mods.attentive(feats, wav_lens) # channels = 1024
         outputs = pooling.transpose(1, 2)
         return outputs
             (label encoder should be provided).
         """
         outputs = self.encode_batch(wavs, wav_lens)
+        outputs = self.mods.classifier(outputs)
         out_prob = self.hparams.softmax(outputs)
         score, index = torch.max(out_prob, dim=-1)
         text_lab = self.hparams.label_encoder.decode_torch(index)
             (label encoder should be provided).
         """
         waveform = self.load_audio(path)
         # Fake a batch:
         batch = waveform.unsqueeze(0)
         rel_length = torch.tensor([1.0])
         outputs = self.encode_batch(batch, rel_length)
+        outputs = self.mods.classifier(outputs)
         out_prob = self.hparams.softmax(outputs)
         score, index = torch.max(out_prob, dim=-1)
         text_lab = self.hparams.label_encoder.decode_torch(index)
         return out_prob, score, index, text_lab
     def forward(self, wavs, wav_lens=None, normalize=False):
         return self.encode_batch(
             wavs=wavs, wav_lens=wav_lens, normalize=normalize
+        )