labhamlet
/

gramt-binaural-frame

@@ -48,6 +48,7 @@ class BinauralFeatureExtractor(SequenceFeatureExtractor):
     def _extract_fbank_features(
         self,
         waveform: np.ndarray,
     ) -> np.ndarray:
         """
         Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
@@ -65,7 +66,9 @@ class BinauralFeatureExtractor(SequenceFeatureExtractor):
             )
         waveform = torch.tensor(waveform.clone().detach())
-        waveform = self._normalize_audio(waveform)
         # If waveform has two channels, but the channel information is not the first dimension, transpose.
         if (waveform.ndim == 2) and (waveform.shape[0] > 100):
             waveform = waveform.transpose(1, 0)
@@ -106,6 +109,7 @@ class BinauralFeatureExtractor(SequenceFeatureExtractor):
         raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
         sampling_rate: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         **kwargs,
     ) -> BatchFeature:
         """
@@ -136,7 +140,7 @@ class BinauralFeatureExtractor(SequenceFeatureExtractor):
                 )
         # extract fbank features and pad/truncate to max_length
-        features = [self._extract_fbank_features(waveform) for waveform in raw_speech]
         features = torch.nn.utils.rnn.pad_sequence(features, batch_first=True)
         inputs = BatchFeature({"input_values": features})
         return inputs

     def _extract_fbank_features(
         self,
         waveform: np.ndarray,
+        normalize : bool,
     ) -> np.ndarray:
         """
         Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
             )
         waveform = torch.tensor(waveform.clone().detach())
+        melspec.to(waveform.device)
+        if normalize:
+            waveform = self._normalize_audio(waveform)
         # If waveform has two channels, but the channel information is not the first dimension, transpose.
         if (waveform.ndim == 2) and (waveform.shape[0] > 100):
             waveform = waveform.transpose(1, 0)
         raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
         sampling_rate: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
+        normalize : bool = True,
         **kwargs,
     ) -> BatchFeature:
         """
                 )
         # extract fbank features and pad/truncate to max_length
+        features = [self._extract_fbank_features(waveform, normalize) for waveform in raw_speech]
         features = torch.nn.utils.rnn.pad_sequence(features, batch_first=True)
         inputs = BatchFeature({"input_values": features})
         return inputs