MIT-SLS
/

USAD-Small

@@ -1,10 +1,14 @@
 from dataclasses import make_dataclass
 import torch
 import torchaudio
 from torch import nn
-from .usad_modules import ConformerEncoder
 MAX_MEL_LENGTH = 3000  # 30 seconds
@@ -15,41 +19,77 @@ def wav_to_fbank(
     mel_dim: int = 128,
     norm_mean: float = -4.268,
     norm_std: float = 4.569,
-) -> torch.Tensor:
     """Convert waveform to fbank features.
     Args:
         wavs (torch.Tensor): (B, T_wav) waveform tensor.
         mel_dim (int, optional): mel dimension. Defaults to 128.
-        norm_mean (float, optional):
-            mean for normalization. Defaults to -4.268.
-        norm_std (float, optional):
-            std for normalization. Defaults to 4.569.
     Returns:
-        torch.Tensor: (B, T_mel, mel_dim) fbank features.
     """
     # ref: https://github.com/cwx-worst-one/EAT/tree/main/feature_extract
-    dtype = wavs.dtype
-    wavs = wavs.to(torch.float32)
-    wavs = wavs - wavs.mean(dim=-1, keepdim=True)
-    feats = [
-        torchaudio.compliance.kaldi.fbank(
-            wavs[i : i + 1],
             htk_compat=True,
-            sample_frequency=16000,
             use_energy=False,
             window_type="hanning",
             num_mel_bins=mel_dim,
             dither=0.0,
             frame_shift=10,
-        ).to(dtype=dtype)
-        for i in range(wavs.shape[0])
-    ]
-    mels = torch.stack(feats, dim=0)
-    mels = (mels - norm_mean) / (norm_std * 2)
     return mels
@@ -64,8 +104,6 @@ class UsadModel(nn.Module):
         self.cfg = cfg
         self.encoder = ConformerEncoder(cfg)
         self.max_mel_length = MAX_MEL_LENGTH
-        # NOTE: The max_mel_length is set to 3000,
-        # which corresponds to 30 seconds of audio at 100 Hz frame rate.
     @property
     def sample_rate(self) -> int:
@@ -73,7 +111,7 @@ class UsadModel(nn.Module):
     @property
     def encoder_frame_rate(self) -> int:
-        return 50  # Hz
     @property
     def mel_dim(self) -> int:
@@ -100,9 +138,12 @@ class UsadModel(nn.Module):
         """Get the device on which the model is located."""
         return next(self.parameters()).device
     def set_audio_chunk_size(self, seconds: float = 30.0) -> None:
         """Set the maximum chunk size for feature extraction.
         Args:
             seconds (float, optional): Chunk size in seconds. Defaults to 30.0.
         """
@@ -111,86 +152,202 @@ class UsadModel(nn.Module):
         ), f"Chunk size must be greater than 0.1s, got {seconds} seconds."
         self.max_mel_length = int(seconds * 100)  # 100 Hz frame rate
-    def load_audio(self, audio_path: str) -> torch.Tensor:
         """Load audio file and return waveform tensor.
         Args:
             audio_path (str): Path to the audio file.
         Returns:
             torch.Tensor: Waveform tensor of shape (wav_len,).
         """
         waveform, sr = torchaudio.load(audio_path)
         if sr != self.sample_rate:
-            waveform = torchaudio.functional.resample(waveform, sr, self.sample_rate)
         if waveform.shape[0] > 1:
             # If stereo, convert to mono by averaging channels
             waveform = waveform.mean(dim=0, keepdim=True)
         waveform = waveform.squeeze(0)  # Remove channel dimension if mono
-        return waveform.to(self.device)  # Ensure tensor is on the same device
     def forward(
         self,
         wavs: torch.Tensor,
         norm_mean: float = -4.268,
         norm_std: float = 4.569,
     ) -> dict:
-        """Forward pass for the model.
         Args:
-            wavs (torch.Tensor):
-                Input waveform tensor of shape (batch_size, wav_len).
-            norm_mean (float, optional):
-                Mean for normalization. Defaults to -4.268.
-            norm_std (float, optional):
-                Standard deviation for normalization. Defaults to 4.569.
         Returns:
-            dict: A dictionary containing the model's outputs.
         """
-        # wavs: (batch_size, wav_len)
-        mel = wav_to_fbank(wavs, norm_mean=norm_mean, norm_std=norm_std)
-        mel = mel[:, : mel.shape[1] - mel.shape[1] % 2]
         if mel.shape[1] <= self.max_mel_length:
-            x, x_len, layer_results = self.encoder(mel, return_hidden=True)
             result = {
                 "x": x,
                 "mel": mel,
                 "hidden_states": layer_results["hidden_states"],
                 "ffn": layer_results["ffn_1"],
             }
             return result
         result = {
             "x": [],
             "mel": mel,
-            "hidden_states": [[] for _ in range(self.cfg.num_layers)],
-            "ffn": [[] for _ in range(self.cfg.num_layers)],
         }
         for i in range(0, mel.shape[1], self.max_mel_length):
             if mel.shape[1] - i < 10:
                 break
             x, x_len, layer_results = self.encoder(
-                mel[:, i : i + self.max_mel_length], return_hidden=True
             )
             result["x"].append(x)
-            for j in range(self.cfg.num_layers):
-                result["hidden_states"][j].append(layer_results["hidden_states"][j])
                 result["ffn"][j].append(layer_results["ffn_1"][j])
         result["x"] = torch.cat(result["x"], dim=1)
-        for j in range(self.cfg.num_layers):
-            result["hidden_states"][j] = torch.cat(result["hidden_states"][j], dim=1)
             result["ffn"][j] = torch.cat(result["ffn"][j], dim=1)
-        # result["x"]: model final output (batch_size, seq_len)
-        # result["mel"]: mel fbank (batch_size, seq_len * 2, mel_dim)
-        # result["hidden_states"]: List of (batch_size, seq_len, encoder_dim)
-        # result["ffn"]: List of (batch_size, seq_len, encoder_dim)
         return result
     @classmethod

+import os
 from dataclasses import make_dataclass
+from typing import List, Optional, Tuple, Union
 import torch
 import torchaudio
 from torch import nn
+from torch.nn.utils.rnn import pad_sequence
+from torchaudio.compliance.kaldi import fbank
+from .usad_modules import ConformerEncoder, lengths_to_padding_mask
 MAX_MEL_LENGTH = 3000  # 30 seconds
     mel_dim: int = 128,
     norm_mean: float = -4.268,
     norm_std: float = 4.569,
+    wav_lengths: Optional[torch.Tensor] = None,
+    sample_rate: int = 16000,
+    return_lengths: bool = False,
+) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
     """Convert waveform to fbank features.
     Args:
         wavs (torch.Tensor): (B, T_wav) waveform tensor.
         mel_dim (int, optional): mel dimension. Defaults to 128.
+        norm_mean (float, optional): mean for normalization. Defaults to -4.268.
+        norm_std (float, optional): std for normalization. Defaults to 4.569.
+        wav_lengths (torch.Tensor, optional): (B,) valid waveform lengths before padding.
+        sample_rate (int, optional): waveform sample rate. Defaults to 16000.
+        return_lengths (bool, optional): return exact fbank lengths. Defaults to False.
     Returns:
+        torch.Tensor: (B, T_mel, mel_dim) fbank features, zero-padded to the longest
+        item in the batch. If return_lengths is True, a (B,) tensor holding each item's
+        feature length before padding is returned as well.
     """
     # ref: https://github.com/cwx-worst-one/EAT/tree/main/feature_extract
+    feature_dtype = wavs.dtype if wavs.is_floating_point() else torch.float32
+    wavs_float = wavs.to(torch.float32)
+    if wav_lengths is None:
+        wav_lengths = torch.full(
+            (wavs.shape[0],),
+            wavs.shape[1],
+            dtype=torch.long,
+            device=wavs.device,
+        )
+    else:
+        wav_lengths = wav_lengths.to(device=wavs.device, dtype=torch.long)
+        if wav_lengths.dim() != 1 or wav_lengths.shape[0] != wavs.shape[0]:
+            raise ValueError(
+                "wav_lengths must be a 1-D tensor with batch size elements."
+            )
+        if torch.any(wav_lengths <= 0).item():
+            raise ValueError("All wav_lengths values must be positive.")
+        if torch.any(wav_lengths > wavs.shape[1]).item():
+            raise ValueError(
+                "wav_lengths cannot exceed the padded waveform length."
+            )
+    feats = []
+    feat_lengths = []
+    for i, wav_length in enumerate(wav_lengths.detach().cpu().tolist()):
+        # Trim padding before centering so batched padding cannot affect valid audio.
+        wav = wavs_float[i, :wav_length]
+        wav = wav - wav.mean(dim=-1, keepdim=True)
+        feat = fbank(
+            wav.unsqueeze(0),
             htk_compat=True,
+            sample_frequency=sample_rate,
             use_energy=False,
             window_type="hanning",
             num_mel_bins=mel_dim,
             dither=0.0,
             frame_shift=10,
+        )
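+        # Drop the last frame when the frame count is odd so every item has an
+        # even number of frames (kept for compatibility).
+        feat = feat[: feat.shape[0] - feat.shape[0] % 2, :]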
+        feat = (feat - norm_mean) / (norm_std * 2)
+        feats.append(feat.to(dtype=feature_dtype))
+        feat_lengths.append(feat.shape[0])
+    mels = pad_sequence(feats, batch_first=True, padding_value=0.0)
+    mel_lengths = torch.tensor(
+        feat_lengths, dtype=torch.long, device=wavs.device
+    )
+    if return_lengths:
+        return mels, mel_lengths
     return mels
         self.cfg = cfg
         self.encoder = ConformerEncoder(cfg)
         self.max_mel_length = MAX_MEL_LENGTH
     @property
     def sample_rate(self) -> int:
     @property
     def encoder_frame_rate(self) -> int:
+        return round(100 / self.cfg.conv_subsample_rate)  # Hz
     @property
     def mel_dim(self) -> int:
         """Get the device on which the model is located."""
         return next(self.parameters()).device
+    @property
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
     def set_audio_chunk_size(self, seconds: float = 30.0) -> None:
         """Set the maximum chunk size for feature extraction.
         Args:
             seconds (float, optional): Chunk size in seconds. Defaults to 30.0.
         """
         ), f"Chunk size must be greater than 0.1s, got {seconds} seconds."
         self.max_mel_length = int(seconds * 100)  # 100 Hz frame rate
+    def load_audio(
+        self, audio_path: str, move_to_device: bool = True
+    ) -> torch.Tensor:
         """Load audio file and return waveform tensor.
         Args:
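             audio_path (str): Path to the audio file.
+            move_to_device (bool, optional): If True, move the waveform to the
+                model's device before returning it. Defaults to True.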
         Returns:
             torch.Tensor: Waveform tensor of shape (wav_len,).
         """
         waveform, sr = torchaudio.load(audio_path)
         if sr != self.sample_rate:
+            waveform = torchaudio.functional.resample(
+                waveform, sr, self.sample_rate
+            )
         if waveform.shape[0] > 1:
             # If stereo, convert to mono by averaging channels
             waveform = waveform.mean(dim=0, keepdim=True)
         waveform = waveform.squeeze(0)  # Remove channel dimension if mono
+        if move_to_device:
+            return waveform.to(
+                self.device
+            )  # Ensure tensor is on the same device
+        return waveform
+    def load_audio_batch(
+        self, audio_paths: List[str]
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        wav_list = []
+        wav_lengths = []
+        for path in audio_paths:
+            wav = self.load_audio(path, move_to_device=False)
+            wav_list.append(wav)
+            wav_lengths.append(wav.shape[0])
+        wavs = pad_sequence(wav_list, batch_first=True).to(self.device)
+        wav_lengths = torch.tensor(
+            wav_lengths, dtype=torch.long, device=self.device
+        )
+        return wavs, wav_lengths
     def forward(
         self,
         wavs: torch.Tensor,
+        wav_lengths: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        target_layer: Optional[int] = None,
         norm_mean: float = -4.268,
         norm_std: float = 4.569,
     ) -> dict:
+        """Compute encoder outputs for a batch of waveforms.
         Args:
+            wavs (torch.Tensor): (B, T_wav) waveform tensor.
+            wav_lengths (torch.Tensor, optional): (B,) lengths of each waveform. Defaults to None.
+            padding_mask (torch.Tensor, optional): (B, T_wav) padding mask for the waveforms,
+                where True marks padded samples. It is used to infer the valid lengths only
+                when wav_lengths is not provided.
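+            target_layer (int, optional): 1-based layer index, between 1 and the number of
+                encoder layers, forwarded to the encoder. On the chunked path, per-layer
+                outputs are gathered for layers 1..target_layer. Defaults to None (all layers).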
+            norm_mean (float, optional): Mean for normalization. Defaults to -4.268.
+            norm_std (float, optional): Std for normalization. Defaults to 4.569.
         Returns:
+            dict: A dictionary containing the following keys:
+                - "x": (B, T_out, encoder_dim) output of the encoder
+                - "x_lengths": (B,) valid output lengths after encoder subsampling
+                - "x_padding_mask": (B, T_out) output padding mask, where padding is True
+                - "mel": (B, T_mel, mel_dim) input mel features
+                - "mel_lengths": (B,) valid mel lengths before encoder subsampling
+                - "hidden_states": list of (B, T_out, encoder_dim) hidden states of each layer
+                - "ffn": list of (B, T_out, encoder_dim) output of the feed-forward network of each layer
         """
+        # Check types
+        assert isinstance(wavs, torch.Tensor), "wavs must be a torch.Tensor"
+        assert wavs.dim() == 2, "wavs must be of shape (batch_size, seq_len)"
+        if wav_lengths is not None:
+            assert isinstance(
+                wav_lengths, torch.Tensor
+            ), "wav_lengths must be a torch.Tensor"
+            assert (
+                wav_lengths.dim() == 1
+            ), "wav_lengths must be of shape (batch_size,)"
+            assert (
+                wav_lengths.shape[0] == wavs.shape[0]
+            ), "wav_lengths must have the same batch size as wavs"
+        if padding_mask is not None:
+            assert isinstance(
+                padding_mask, torch.Tensor
+            ), "padding_mask must be a torch.Tensor"
+            assert (
+                padding_mask.dim() == 2
+            ), "padding_mask must be of shape (batch_size, seq_len)"
+            assert (
+                padding_mask.shape[0] == wavs.shape[0]
+            ), "padding_mask must have the same batch size as wavs"
+            assert (
+                padding_mask.shape[1] == wavs.shape[1]
+            ), "padding_mask must have the same seq_len as wavs"
+            if wav_lengths is None:
+                wav_lengths = (~padding_mask.to(torch.bool)).sum(dim=1)
+        if target_layer is not None:
+            assert isinstance(
+                target_layer, int
+            ), "target_layer must be an int or None"
+            assert (
+                1 <= target_layer <= self.cfg.num_layers
+            ), f"target_layer must be between 1 and {self.cfg.num_layers}"
+        mel, mel_lengths = wav_to_fbank(
+            wavs,
+            wav_lengths=wav_lengths,
+            mel_dim=self.mel_dim,
+            norm_mean=norm_mean,
+            norm_std=norm_std,
+            sample_rate=self.sample_rate,
+            return_lengths=True,
+        )
+        dtype = self.dtype
+        if mel.dtype != dtype:
+            mel = mel.to(dtype)
+        num_layers = min(
+            self.cfg.num_layers,
+            target_layer if target_layer is not None else self.cfg.num_layers,
+        )
         if mel.shape[1] <= self.max_mel_length:
+            # If the mel length is less than or equal to max_mel_length, we can process it in one go
+            x, x_len, layer_results = self.encoder(
+                inputs=mel,
+                input_lengths=mel_lengths,
+                return_hidden=True,
+                target_layer=target_layer,
+            )
             result = {
                 "x": x,
+                "x_lengths": x_len,
+                "x_padding_mask": lengths_to_padding_mask(
+                    x_len, max_len=x.size(1)
+                ),
                 "mel": mel,
+                "mel_lengths": mel_lengths,
                 "hidden_states": layer_results["hidden_states"],
                 "ffn": layer_results["ffn_1"],
             }
             return result
+        # If the mel length is greater than max_mel_length, we need to process it in chunks
         result = {
             "x": [],
+            "x_lengths": [],
             "mel": mel,
+            "mel_lengths": mel_lengths,
+            "hidden_states": [[] for _ in range(num_layers)],
+            "ffn": [[] for _ in range(num_layers)],
         }
         for i in range(0, mel.shape[1], self.max_mel_length):
             if mel.shape[1] - i < 10:
                 break
+            _mel = mel[:, i : i + self.max_mel_length]
+            _mel_lengths = None
+            if mel_lengths is not None:
+                _mel_lengths = torch.clamp(
+                    mel_lengths - i, min=0, max=self.max_mel_length
+                )
             x, x_len, layer_results = self.encoder(
+                inputs=_mel,
+                input_lengths=_mel_lengths,
+                return_hidden=True,
+                target_layer=target_layer,
             )
             result["x"].append(x)
+            result["x_lengths"].append(x_len)
+            for j in range(num_layers):
+                result["hidden_states"][j].append(
+                    layer_results["hidden_states"][j]
+                )
                 result["ffn"][j].append(layer_results["ffn_1"][j])
         result["x"] = torch.cat(result["x"], dim=1)
+        result["x_lengths"] = torch.stack(result["x_lengths"], dim=0).sum(
+            dim=0
+        )
+        result["x_padding_mask"] = lengths_to_padding_mask(
+            result["x_lengths"], max_len=result["x"].size(1)
+        )
+        for j in range(num_layers):
+            result["hidden_states"][j] = torch.cat(
+                result["hidden_states"][j], dim=1
+            )
             result["ffn"][j] = torch.cat(result["ffn"][j], dim=1)
         return result
     @classmethod