Upload CoLMbo model weights and code
Browse files- config.json +17 -0
- encoder/__pycache__/attentive_pooling.cpython-310.pyc +0 -0
- encoder/__pycache__/attentive_pooling.cpython-38.pyc +0 -0
- encoder/__pycache__/encoder.cpython-310.pyc +0 -0
- encoder/__pycache__/encoder.cpython-38.pyc +0 -0
- encoder/__pycache__/encoder.cpython-39.pyc +0 -0
- encoder/__pycache__/mha.cpython-310.pyc +0 -0
- encoder/__pycache__/mha.cpython-38.pyc +0 -0
- encoder/__pycache__/self_attn.cpython-310.pyc +0 -0
- encoder/__pycache__/self_attn.cpython-38.pyc +0 -0
- encoder/attentive_pooling.py +33 -0
- encoder/encoder.py +35 -0
- encoder/mha.py +62 -0
- encoder/self_attn.py +81 -0
- load_data/__pycache__/combineddataset.cpython-38.pyc +0 -0
- load_data/__pycache__/data_collactor.cpython-310.pyc +0 -0
- load_data/__pycache__/data_collactor.cpython-38.pyc +0 -0
- load_data/__pycache__/dataset.cpython-38.pyc +0 -0
- load_data/__pycache__/extract_fbanks.cpython-310.pyc +0 -0
- load_data/__pycache__/extract_fbanks.cpython-38.pyc +0 -0
- load_data/__pycache__/prepare_dataloader.cpython-310.pyc +0 -0
- load_data/__pycache__/prepare_dataloader.cpython-38.pyc +0 -0
- load_data/__pycache__/tears.cpython-38.pyc +0 -0
- load_data/__pycache__/timit.cpython-38.pyc +0 -0
- load_data/__pycache__/voxceleb.cpython-38.pyc +0 -0
- load_data/combineddataset.py +29 -0
- load_data/data_collactor.py +74 -0
- load_data/dataset.py +109 -0
- load_data/extract_fbanks.py +55 -0
- load_data/prepare_dataloader.py +22 -0
- load_data/tears.py +232 -0
- load_data/timit.py +102 -0
- load_data/voxceleb.py +63 -0
- mapper.py +245 -0
- pytorch_model.bin +3 -0
- wrapper.py +305 -0
config.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "colmbo",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"CoLMboModel"
|
| 5 |
+
],
|
| 6 |
+
"auto_map": {
|
| 7 |
+
"AutoConfig": "modeling_colmbo.CoLMboConfig",
|
| 8 |
+
"AutoModel": "modeling_colmbo.CoLMboModel"
|
| 9 |
+
},
|
| 10 |
+
"n_mels": 80,
|
| 11 |
+
"embedding_dim": 192,
|
| 12 |
+
"channel": 1024,
|
| 13 |
+
"prefix_length": 10,
|
| 14 |
+
"gpt_model_name": "gpt2",
|
| 15 |
+
"sample_rate": 16000,
|
| 16 |
+
"torch_dtype": "float32"
|
| 17 |
+
}
|
encoder/__pycache__/attentive_pooling.cpython-310.pyc
ADDED
|
Binary file (1.54 kB). View file
|
|
|
encoder/__pycache__/attentive_pooling.cpython-38.pyc
ADDED
|
Binary file (1.53 kB). View file
|
|
|
encoder/__pycache__/encoder.cpython-310.pyc
ADDED
|
Binary file (1.64 kB). View file
|
|
|
encoder/__pycache__/encoder.cpython-38.pyc
ADDED
|
Binary file (1.66 kB). View file
|
|
|
encoder/__pycache__/encoder.cpython-39.pyc
ADDED
|
Binary file (1.63 kB). View file
|
|
|
encoder/__pycache__/mha.cpython-310.pyc
ADDED
|
Binary file (2.21 kB). View file
|
|
|
encoder/__pycache__/mha.cpython-38.pyc
ADDED
|
Binary file (2.22 kB). View file
|
|
|
encoder/__pycache__/self_attn.cpython-310.pyc
ADDED
|
Binary file (3.71 kB). View file
|
|
|
encoder/__pycache__/self_attn.cpython-38.pyc
ADDED
|
Binary file (3.74 kB). View file
|
|
|
encoder/attentive_pooling.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
class SelfAttentionPooling(nn.Module):
    """
    Self-attention pooling over frame-level representations.

    Produces an attention-weighted mean and the matching attention-weighted
    standard deviation of the input sequence.

    Original Paper: Self-Attention Encoding and Pooling for Speaker Recognition
    https://arxiv.org/pdf/2008.01077v1.pdf
    """
    def __init__(self, input_dim):
        super(SelfAttentionPooling, self).__init__()
        # Projects each frame to a single (unnormalized) attention logit.
        self.W = nn.Linear(input_dim, 1)

    def forward(self, batch_rep, att_mask):
        """
        Args:
            batch_rep: (N, T, H) — N: batch size, T: sequence length, H: hidden dim.
            att_mask: additive attention mask; only the first column
                (att_mask[:, :, 0]) is used, one value per frame. Padded
                positions are expected to carry a large negative value so
                they vanish after the softmax — TODO confirm against caller.

        Returns:
            utter_rep: (N, H) attention-weighted mean.
            attn_out_std: (N, H) attention-weighted standard deviation.
        """
        softmax = nn.functional.softmax
        att_logits = self.W(batch_rep).squeeze(-1)
        # Reduce the (N, T, T) mask to one additive value per frame.
        att_mask = att_mask[:, :, 0]
        att_logits = att_mask + att_logits
        att_w = softmax(att_logits, dim=-1).unsqueeze(-1)
        # Weighted mean over the time axis.
        utter_rep = torch.sum(batch_rep * att_w, dim=1)
        # Weighted standard deviation around the weighted mean.
        attn_out_std = torch.sqrt(torch.sum(att_w * (batch_rep - utter_rep.unsqueeze(1))**2, dim=1))

        return utter_rep, attn_out_std
|
encoder/encoder.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from speechbrain.lobes.models.ECAPA_TDNN import ECAPA_TDNN
|
| 3 |
+
|
| 4 |
+
class Model(torch.nn.Module):
    """Thin wrapper around SpeechBrain's ECAPA-TDNN speaker encoder.

    Maps a batch of log-mel features to fixed-size speaker embeddings.
    """

    def __init__(self, n_mels=80, embedding_dim=192, channel=512):
        super(Model, self).__init__()
        # ECAPA-TDNN takes five channel widths; the last is 3x the base width.
        layer_channels = [channel] * 4 + [channel * 3]
        self.model = ECAPA_TDNN(
            input_size=n_mels,
            lin_neurons=embedding_dim,
            channels=layer_channels,
        )

    def forward(self, x):
        """x: (B, 1, T, n_mels) -> (B, embedding_dim)."""
        features = x.squeeze(1)
        embedding = self.model(features)
        return embedding.squeeze(1)
|
| 16 |
+
|
| 17 |
+
if __name__ == '__main__':
    # Smoke test: load pretrained ECAPA weights into the wrapper and run a dummy batch.
    model = Model(n_mels=80, embedding_dim=192, channel=1024)

    # Load the pretrained model checkpoint on CPU so this also works on
    # GPU-less hosts (the original torch.load would fail if the checkpoint
    # was saved on CUDA and no GPU is present).
    checkpoint = torch.load(
        "/ocean/projects/cis220031p/abdulhan/AVIS_baseline/ECAPA/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt",
        map_location="cpu",
    )

    # The checkpoint holds the bare ECAPA_TDNN state dict; prefix keys with
    # 'model.' to match this wrapper's attribute name.
    new_state_dict = {f"model.{k}": v for k, v in checkpoint.items()}
    model.load_state_dict(new_state_dict)

    model.eval()

    # Test with dummy input (B, 1, T, n_mels)
    dummy_input = torch.randn(1, 1, 300, 80)
    output = model(dummy_input)
    print(output.shape)
|
encoder/mha.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
class MultiHeadAttention(nn.Module):
    """Multi-head attention with an optional phoneme-probability bias.

    When both ``prob_phn`` and a positive ``lambda_val`` are supplied, the
    attention scores are penalized by ``lambda_val * prob_phn`` (broadcast
    across heads) before the softmax.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Ensure that the model dimension (d_model) is divisible by the number of heads
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model        # model dimension
        self.num_heads = num_heads    # number of attention heads
        self.d_k = d_model // num_heads  # per-head key/query/value width

        # Linear projections for queries, keys, values and output.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, prob_phn=None, mask=None, lambda_val=None):
        """Scaled dot-product attention over pre-split heads.

        Q, K, V: (N, heads, T, d_k). Returns (output, original mask).
        """
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        # Bug fix: the declared defaults prob_phn=None / lambda_val=None used
        # to crash (None.unsqueeze / None > 0). Only apply the phoneme bias
        # when both are actually provided and lambda_val is positive.
        if prob_phn is not None and lambda_val is not None and lambda_val > 0:
            # (N, T, 1) -> (N, 1, T, 1), then a broadcast view across heads
            # (expand does not copy memory).
            prob_phn = prob_phn.unsqueeze(1).expand(-1, self.num_heads, -1, -1)
            attn_scores = attn_scores - lambda_val * prob_phn.transpose(-2, -1)
        attn_mask = mask
        if mask is not None:
            # Broadcast the mask across heads and block masked positions.
            mask = mask.unsqueeze(1)
            mask = mask.expand(-1, self.num_heads, -1, -1)
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        attn_probs = attn_probs.float()
        output = torch.matmul(attn_probs, V)
        return output, attn_mask

    def split_heads(self, x):
        # (N, T, d_model) -> (N, heads, T, d_k)
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        # (N, heads, T, d_k) -> (N, T, d_model)
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, prob_phn=None, mask=None, lambda_val=None):
        """Returns (attended output (N, T, d_model), the mask passed in)."""
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Perform scaled dot-product attention
        attn_output, attn_mask = self.scaled_dot_product_attention(Q, K, V, prob_phn, mask, lambda_val)

        # Combine heads and apply output transformation
        output = self.W_o(self.combine_heads(attn_output))
        return output, attn_mask
|
encoder/self_attn.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from encoder.mha import MultiHeadAttention
|
| 4 |
+
from encoder.attentive_pooling import SelfAttentionPooling
|
| 5 |
+
|
| 6 |
+
class FlippedReLU(nn.Module):
    """Mirror image of ReLU: passes negatives through, zeroes everything else.

    Elementwise equivalent to min(x, 0).
    """

    def __init__(self):
        super(FlippedReLU, self).__init__()

    def forward(self, x):
        # clamp from above at zero: negatives survive, non-negatives become 0.
        return torch.clamp(x, max=0)
|
| 12 |
+
|
| 13 |
+
class PositionWiseFeedForward(nn.Module):
    """Two-layer position-wise MLP (d_model -> d_ff -> d_model) with ReLU."""

    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Expand, apply the nonlinearity, then project back down.
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
|
| 22 |
+
|
| 23 |
+
class EncoderLayer(nn.Module):
    """Single post-norm transformer encoder layer.

    Self-attention followed by a position-wise feed-forward network, each
    wrapped in a residual connection with dropout and LayerNorm.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, prob_phn=None, mask=None, lambda_val=None):
        # Attention sub-layer: residual + dropout + LayerNorm.
        attn_output, attn_mask = self.self_attn(
            x, x, x, prob_phn=prob_phn, mask=mask, lambda_val=lambda_val
        )
        x = self.norm1(x + self.dropout(attn_output))
        # Feed-forward sub-layer: residual + dropout + LayerNorm.
        x = self.norm2(x + self.dropout(self.feed_forward(x)))
        return x, attn_mask
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class TransformerSelfAttention(nn.Module):
    def __init__(self, input_dim, num_heads, dim_feedforward, number_Of_spks, dropout=0.0):
        """EncoderBlock.

        Args:
            input_dim: Dimensionality of the input
            num_heads: Number of heads to use in the attention block
            dim_feedforward: Dimensionality of the hidden layer in the MLP
            number_Of_spks: Number of speaker classes for the classifier head
            dropout: Dropout probability to use in the dropout layers
        """
        super().__init__()
        # Attention layer
        self.self_mha_attn = EncoderLayer(input_dim, num_heads, dim_feedforward*8,dropout)
        self.attn_pooling = SelfAttentionPooling(input_dim)
        # Two parallel projections over the pooled [mean; std] statistics.
        self.emb1 = nn.Linear(input_dim*2, dim_feedforward*8)
        self.emb2 = nn.Linear(input_dim*2, dim_feedforward*8)
        # emb2 is initialized as an exact copy of emb1's parameters; the two
        # branches differ only in activation (ReLU vs FlippedReLU) at init.
        self.emb2.weight.data = self.emb1.weight.data.clone()
        self.emb2.bias.data = self.emb1.bias.data.clone()
        self.bn = nn.BatchNorm1d(dim_feedforward*8)
        self.act = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(dim_feedforward*8, number_Of_spks)
        self.flipped_relu = FlippedReLU()


    def forward(self, x, prob_phn=None, mask=None, lambda_val=None):
        # Attention part
        attn_out, attn_mask = self.self_mha_attn(x,prob_phn=prob_phn, mask=mask, lambda_val=lambda_val)
        # NOTE(review): attn_mask must not be None here (squeeze would fail);
        # callers are expected to always pass a mask — confirm.
        attn_mask= attn_mask.squeeze(1)
        attn_out_mean,attn_out_std = self.attn_pooling(attn_out,attn_mask)
        # Statistics pooling: concatenate weighted mean and std -> (N, 2H).
        attn_concat = torch.cat((attn_out_mean, attn_out_std),dim=1).to(dtype=torch.float32)

        # Positive branch: keeps the non-negative part of the activation.
        emb1 = self.emb1(attn_concat).to(dtype=torch.float32)
        emb1 = self.act(emb1)

        # Negative branch: keeps the non-positive part.
        emb2 = self.emb2(attn_concat).to(dtype=torch.float32)
        emb2 = self.flipped_relu(emb2)

        # Sum recombines both halves of the activation range before batch norm.
        emb = emb1 + emb2
        emb = self.bn(emb)
        x = self.classifier(emb)
        # Returns (speaker logits, pooled embedding).
        return x,emb
|
load_data/__pycache__/combineddataset.cpython-38.pyc
ADDED
|
Binary file (1.41 kB). View file
|
|
|
load_data/__pycache__/data_collactor.cpython-310.pyc
ADDED
|
Binary file (4.31 kB). View file
|
|
|
load_data/__pycache__/data_collactor.cpython-38.pyc
ADDED
|
Binary file (4.32 kB). View file
|
|
|
load_data/__pycache__/dataset.cpython-38.pyc
ADDED
|
Binary file (2.85 kB). View file
|
|
|
load_data/__pycache__/extract_fbanks.cpython-310.pyc
ADDED
|
Binary file (2.39 kB). View file
|
|
|
load_data/__pycache__/extract_fbanks.cpython-38.pyc
ADDED
|
Binary file (2.44 kB). View file
|
|
|
load_data/__pycache__/prepare_dataloader.cpython-310.pyc
ADDED
|
Binary file (855 Bytes). View file
|
|
|
load_data/__pycache__/prepare_dataloader.cpython-38.pyc
ADDED
|
Binary file (851 Bytes). View file
|
|
|
load_data/__pycache__/tears.cpython-38.pyc
ADDED
|
Binary file (6.77 kB). View file
|
|
|
load_data/__pycache__/timit.cpython-38.pyc
ADDED
|
Binary file (3.15 kB). View file
|
|
|
load_data/__pycache__/voxceleb.cpython-38.pyc
ADDED
|
Binary file (1.82 kB). View file
|
|
|
load_data/combineddataset.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import random
|
| 3 |
+
from torch.utils.data import Dataset, DataLoader
|
| 4 |
+
|
| 5 |
+
class CombinedDataset(Dataset):
    """Mixes two datasets, drawing each sample from one of them at random.

    Args:
        dataset1 (Dataset): first dataset (e.g., TIMITDataset).
        dataset2 (Dataset): second dataset (e.g., EARS).
        switch_prob (float): probability of drawing from dataset1 (default 0.5).
    """

    def __init__(self, dataset1, dataset2, switch_prob=0.5):
        self.dataset1 = dataset1
        self.dataset2 = dataset2
        self.len1 = len(dataset1)
        self.len2 = len(dataset2)
        self.switch_prob = switch_prob

    def __len__(self):
        # Report the longer dataset's length so all of it can be reached.
        return max(self.len1, self.len2)

    def __getitem__(self, idx):
        # Coin flip picks the source; the index wraps around the shorter set.
        use_first = random.random() < self.switch_prob
        if use_first:
            source, length = self.dataset1, self.len1
        else:
            source, length = self.dataset2, self.len2
        return source[idx % length]
|
load_data/data_collactor.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
from transformers import AutoFeatureExtractor
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from preprocessing.ast_processor import ast
from util_stats.local_stats import local_extract_phn_frame_probs
from util_stats.global_stats import global_extract_phn_frame_probs
import numpy as np
import pickle
import torch.nn.functional as F

from load_data.extract_fbanks import Mel_Spectrogram

# Shared mel-spectrogram front end used by DataCollatorWithPadding below.
extractor = Mel_Spectrogram()

# NOTE(review): these pickles are loaded at import time from the current
# working directory — importing this module fails if the files are missing;
# consider lazy loading or configurable paths.
with open('new_lbl2ind.pkl', 'rb') as f:
    lbl2ind = pickle.load(f)
with open('new_spk.pkl', 'rb') as f:
    unique_speaker_ids = pickle.load(f)
# change the labels
# Number of speaker classes; sizes the one-hot labels in the collator.
number_Of_spks = len(unique_speaker_ids)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass
class DataCollatorWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None
    flag_global_local: Optional[str] = None
    dic_train_phn_frequency: Optional [dict] = None
    dic_train_frame_frequency: Optional [dict] = None
    lbl2ind: Optional [dict] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        batch={}
        batch['input_values']= [features[idx]['audio_tensor'].squeeze(0) for idx in range(len(features))]
        batch["prompt"] = [features[idx]["prompt"] for idx in range(len(features))]
        batch["answer"] = [features[idx]["answer"] for idx in range(len(features))]
        batch["filename"] = [features[idx]["filename"] for idx in range(len(features))]
        # batch["no_hot_encode"] = torch.tensor([lbl2ind[features[idx]['sid']] for idx in range(len(features))])
        # NOTE(review): speaker labels are hard-coded to class 0 (the lbl2ind
        # lookup above is commented out), so every sample gets the same
        # one-hot label — confirm this is intentional.
        batch["no_hot_encode"] = torch.tensor([0 for idx in range(len(features))])
        # if batch["no_hot_encode"].numel():
        batch["labels"]= F.one_hot(batch["no_hot_encode"], number_Of_spks)
        # torch.stack requires all clips in the batch to have equal length
        # (the datasets crop/pad to a fixed duration upstream).
        batch['input_values'] = extractor(torch.stack(batch['input_values']))
        return batch
|
load_data/dataset.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from glob import glob
|
| 3 |
+
import torchaudio
|
| 4 |
+
from torch.utils.data import Dataset
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from PIL import Image
|
| 7 |
+
import pickle
|
| 8 |
+
from copy import deepcopy
|
| 9 |
+
from glob import glob
|
| 10 |
+
import random
|
| 11 |
+
from sklearn.model_selection import train_test_split
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
import numpy as np
|
| 15 |
+
import librosa
|
| 16 |
+
import torch
|
| 17 |
+
import soundfile as sf
|
| 18 |
+
import pandas as pd
|
| 19 |
+
import random
|
| 20 |
+
|
| 21 |
+
class EARS(Dataset):
    """
    EARS dataset for 10sec or less that 10sec segments.
    Returns:
        audio: torch.Tensor in (1,16000) or (1, <16000), audio waveform
        sid: str (p103), speaker id
        metadict: dict, metadata
        caption: str, caption
        alignment: list
    """
    def __init__(self, root, data_path, meta_path,utterance_path, prompts_path, sample_rate, train_mapper=False, split="train"):
        super().__init__()
        self.root = root

        # Segment list: entries with filename / start / end (sample offsets).
        with open(f"{data_path}", "r") as f:
            self.data = json.load(f)

        # Per-speaker metadata (loaded but not read in this class).
        with open(f"{meta_path}", "r") as f:
            self.meta = json.load(f)

        # Utterance transcriptions (loaded but not read in this class).
        with open(f"{utterance_path}", "r") as f:
            self.utterance = json.load(f)

        # Per-speaker (prompt, answer) pairs keyed by speaker id.
        with open(f"{prompts_path}", "r") as f:
            self.prompts = json.load(f)

        self.new_data = []
        if train_mapper:
            # Expand each audio segment into 10 randomly sampled Q/A pairs
            # for its speaker (the leading path component of the filename).
            for d in self.data:
                file_name = d["filename"]
                sid = file_name.split("/")[0]
                temp = random.sample(self.prompts[sid], 10)
                for qa in temp:
                    self.new_data.append({"filename": file_name,
                                          "start": d["start"],
                                          "end": d["end"],
                                          "prompt": qa[0],
                                          "answer": qa[1]})
        else:
            self.new_data = self.data
        if split == "train":
            random.shuffle(self.new_data)

        self.sample_rate = sample_rate

    def __len__(self):
        return len(self.new_data)

    def __getitem__(self, idx):
        entry = self.new_data[idx]
        filename = entry["filename"]
        # Speaker id is the first path component of the filename.
        sid = filename.split("/")[0]
        audio_path = os.path.join(self.root, filename)

        # Load audio
        audio, sample_rate = torchaudio.load(audio_path)
        start_sample, end_sample = entry["start"], entry["end"]

        # Resample if needed
        # NOTE(review): start/end offsets appear to be in the ORIGINAL file's
        # sample units but are applied AFTER resampling — if on-disk rate
        # differs from self.sample_rate the crop window is misplaced; confirm
        # all source files are already at self.sample_rate.
        if sample_rate != self.sample_rate:
            audio = torchaudio.transforms.Resample(sample_rate, self.sample_rate)(audio)

        # Compute duration in samples
        total_samples = end_sample - start_sample
        num_samples_3s = 3 * self.sample_rate  # 3 seconds worth of samples

        # Select a random 3s window within the available range
        if total_samples >= num_samples_3s:
            start_offset = random.randint(start_sample, end_sample - num_samples_3s)
            end_offset = start_offset + num_samples_3s
            audio = audio[:, start_offset:end_offset]
        else:
            # If less than 3s, take full segment and pad
            pad_size = num_samples_3s - total_samples
            audio = audio[:, start_sample:end_sample]
            audio = torch.nn.functional.pad(audio, (0, pad_size))

        # Normalize to zero mean / unit variance (epsilon guards silence).
        mean = torch.mean(audio)
        std = torch.std(audio)
        audio = (audio - mean) / (std + 1e-8)

        return {
            "audio_tensor": audio,
            "filename": filename,
            "sid": sid,
            "prompt": entry.get("prompt", None),
            "answer": entry.get("answer", None),
        }
|
load_data/extract_fbanks.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
|
| 6 |
+
class PreEmphasis(torch.nn.Module):
    """First-order pre-emphasis filter: y[t] = x[t] - coef * x[t-1].

    Implemented as a 1-D convolution. The kernel is stored flipped because
    PyTorch's conv1d computes cross-correlation, not true convolution.
    """

    def __init__(self, coef: float = 0.97):
        super(PreEmphasis, self).__init__()
        self.coef = coef
        # Flipped FIR kernel, shaped (out_ch=1, in_ch=1, width=2).
        kernel = torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
        self.register_buffer('flipped_filter', kernel)

    def forward(self, inputs: torch.tensor) -> torch.tensor:
        assert len(
            inputs.size()) == 2, 'The number of dimensions of inputs tensor must be 2!'
        # Reflect-pad one sample on the left so output length equals input length.
        padded = F.pad(inputs.unsqueeze(1), (1, 0), 'reflect')
        return F.conv1d(padded, self.flipped_filter).squeeze(1)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class Mel_Spectrogram(nn.Module):
    """Log-mel feature extractor.

    Pipeline: pre-emphasis -> STFT magnitude -> log -> mel filterbank ->
    instance norm; returns features shaped (B, 1, T, n_mels).

    NOTE(review): torch.log is applied to the magnitude spectrogram BEFORE
    the mel projection (log-then-mel, not the conventional mel-then-log).
    Presumably this matches how the shipped checkpoints were trained —
    confirm before "fixing" the order.
    """
    def __init__(self, sample_rate=16000, n_fft=512, win_length=400, hop=160, n_mels=80, coef=0.97, requires_grad=False):
        super(Mel_Spectrogram, self).__init__()
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.win_length = win_length
        self.hop = hop

        self.pre_emphasis = PreEmphasis(coef)
        # Mel filterbank (n_mels, n_fft//2 + 1) from librosa.
        mel_basis = librosa.filters.mel(
            sr=sample_rate, n_fft=n_fft, n_mels=n_mels)
        # Stored as a Parameter so it can optionally be fine-tuned;
        # requires_grad=False (the default) keeps it fixed.
        self.mel_basis = nn.Parameter(
            torch.FloatTensor(mel_basis), requires_grad=requires_grad)
        self.instance_norm = nn.InstanceNorm1d(num_features=n_mels)
        window = torch.hamming_window(self.win_length)
        self.window = nn.Parameter(
            torch.FloatTensor(window), requires_grad=False)

    def forward(self, x):
        # x: (B, num_samples) raw waveforms.
        x = self.pre_emphasis(x)
        x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop,
                       window=self.window, win_length=self.win_length, return_complex=True)
        x = torch.abs(x)
        # Floor before the log to avoid log(0).
        x += 1e-9
        x = torch.log(x)
        x = torch.matmul(self.mel_basis, x)
        x = self.instance_norm(x)
        # (B, n_mels, T) -> (B, T, n_mels) -> (B, 1, T, n_mels)
        x = x.permute(0, 2, 1)
        x = x.unsqueeze(1)
        return x
|
load_data/prepare_dataloader.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch.utils.data import Dataset, DataLoader
|
| 2 |
+
from torch.utils.data.distributed import DistributedSampler
|
| 3 |
+
from preprocessing.ast_processor import ast
|
| 4 |
+
from load_data.data_collactor import DataCollatorWithPadding
|
| 5 |
+
|
| 6 |
+
def prepare_dataloader(dataset: Dataset, batch_size: int, valid_train_flag: str):
    """Build a distributed DataLoader for the given split.

    Args:
        dataset: the dataset to wrap.
        batch_size: per-process batch size.
        valid_train_flag: one of "train", "valid" or "test".

    Returns:
        A DataLoader using a DistributedSampler and the padding collator.

    Raises:
        ValueError: if valid_train_flag is not a recognized split name.
    """
    # All three splits use an identical collator (the original had three
    # duplicated branches). Validate the flag explicitly so a typo fails
    # loudly instead of raising an unbound-name NameError further down.
    if valid_train_flag not in ("train", "valid", "test"):
        raise ValueError(f"Unknown valid_train_flag: {valid_train_flag!r}")
    data_collator = DataCollatorWithPadding(padding=True)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=False,  # DistributedSampler owns shuffling/sharding
        sampler=DistributedSampler(dataset),
        collate_fn=data_collator,
    )
|
load_data/tears.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch.utils.data import Dataset
|
| 3 |
+
import json
|
| 4 |
+
import torchaudio
|
| 5 |
+
import os
|
| 6 |
+
from typing import Optional, Dict, Any, List, Tuple
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import warnings
|
| 9 |
+
import random
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from collections import defaultdict
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class TEARSDataset(Dataset):
    """
    TEARS dataset yielding a fixed-length audio crop plus a randomly chosen
    (prompt, response) pair and the speaker metadata from a JSON manifest.

    Args:
        json_path (str): Path to the JSON file containing TEARS data.
        tears_root (str): Root directory containing TEARS audio files.
        sample_rate (int, optional): Target sample rate for audio. Defaults to 16000.
        duration (float, optional): Target duration in seconds. Defaults to 3.0.
        normalize_audio (bool, optional): Whether to normalize audio. Defaults to True.
        augment (bool, optional): Whether to apply random augmentations. Defaults to True.

    Each item is a dict containing:
        - 'audio_tensor': torch.Tensor of shape (1, int(duration * sample_rate))
        - 'sid': str, speaker identifier
        - 'metadata': dict containing speaker metadata
        - 'prompt': str or None, randomly selected prompt
        - 'answer': str or None, corresponding response (newlines stripped)
        - 'filename': str, path to the audio file
    """

    def __init__(
        self,
        json_path: str,
        tears_root: str,
        sample_rate: int = 16000,
        duration: float = 3.0,
        normalize_audio: bool = True,
        augment: bool = True
    ):
        super().__init__()

        # Load the whole JSON manifest once up front.
        with open(json_path, 'r') as f:
            self.data = json.load(f)

        self.tears_root = Path(tears_root)
        self.sample_rate = sample_rate
        self.duration = duration
        self.normalize_audio = normalize_audio
        # Number of samples in one target-length clip.
        self.target_samples = int(duration * sample_rate)
        self.augment = augment

    def __len__(self) -> int:
        return len(self.data)

    def augment_audio(self, waveform, sample_rate):
        """Apply 1..N randomly chosen augmentations to `waveform`.

        Fixes vs. the original implementation:
        - the choice list now only contains augmentations that actually have a
          branch ('spec_aug' was listed but never implemented, while 'reverb'
          was implemented but unreachable; 'frequency_mask'/'time_mask'
          referenced an undefined name `T` and a duplicate 'pitch_shift'
          branch was dead code);
        - 'pitch_shift' previously built its sox effect via a nested
          comprehension that discarded part of its own randomness and omitted
          the required trailing 'rate' effect;
        - the hard-coded 16000 is replaced by the `sample_rate` argument.
        """
        augmentation_choices = ['time_stretch', 'pitch_shift', 'add_noise', 'reverb']
        random.shuffle(augmentation_choices)

        # Apply a random-sized prefix of the shuffled augmentation list.
        for aug in augmentation_choices[:random.randint(1, len(augmentation_choices))]:
            if aug == 'time_stretch':
                rate = random.uniform(0.8, 1.25)
                # 'speed' changes rate and pitch together; 'rate' resamples back.
                effect = [['speed', str(rate)], ['rate', str(sample_rate)]]
                waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
                    waveform, sample_rate, effects=effect
                )

            elif aug == 'pitch_shift':
                # sox 'pitch' takes cents: 100 cents per semitone.
                steps = random.choice([-2, -1, 1, 2])
                effect = [['pitch', str(steps * 100)], ['rate', str(sample_rate)]]
                waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
                    waveform, sample_rate, effect
                )

            elif aug == 'add_noise':
                # Low-amplitude white noise.
                noise = torch.randn_like(waveform) * random.uniform(0.001, 0.015)
                waveform = waveform + noise

            elif aug == 'reverb':
                effect = [['reverb', '-w', str(random.randint(10, 50))]]
                waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
                    waveform, sample_rate, effect
                )

        return waveform

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        sample = self.data[idx]
        audio_path = str(self.tears_root / sample['audio_path'])

        try:
            audio, sr = torchaudio.load(audio_path)

            # Resample to the configured rate if necessary.
            if sr != self.sample_rate:
                audio = torchaudio.transforms.Resample(sr, self.sample_rate)(audio)

            if self.augment:
                audio = self.augment_audio(audio, self.sample_rate)

            # Per-utterance mean/variance normalization.
            if self.normalize_audio:
                mean = torch.mean(audio)
                std = torch.std(audio)
                audio = (audio - mean) / (std + 1e-8)

            # Random crop to the target length, or right-pad with zeros.
            num_samples = audio.shape[1]
            if num_samples >= self.target_samples:
                start_sample = random.randint(0, num_samples - self.target_samples)
                audio = audio[:, start_sample:start_sample + self.target_samples]
            else:
                pad_size = self.target_samples - num_samples
                audio = torch.nn.functional.pad(audio, (0, pad_size))

        except Exception as e:
            warnings.warn(f"Error loading audio file {audio_path}: {str(e)}")
            # Fall back to a silent clip so one bad file cannot kill a run.
            audio = torch.zeros(1, self.target_samples)

        # Pick one aligned (prompt, response) pair at random.
        prompts = sample.get('prompts', [])
        responses = sample.get('responses', [])

        if prompts and responses and len(prompts) == len(responses):
            rand_idx = random.randint(0, len(prompts) - 1)
            prompt = prompts[rand_idx]
            response = responses[rand_idx].replace("\n", " ").strip()
        else:
            prompt = None
            response = None

        return {
            'audio_tensor': audio,
            'sid': sample['speaker']['id'],
            'metadata': sample['speaker'],
            'prompt': prompt,
            'answer': response,
            'filename': str(audio_path)
        }

    @staticmethod
    def redistribute_speakers(
        json_paths: Dict[str, str],
        split_ratios: Dict[str, float],
        seed: int = 42
    ) -> Dict[str, List[Dict]]:
        """
        Redistribute speakers across splits according to given ratios.

        Args:
            json_paths: Dict mapping split names to json file paths.
            split_ratios: Dict mapping split names to desired ratios (should sum to 1).
            seed: Random seed for reproducibility.

        Returns:
            Dict mapping split names to lists of samples; all samples of a
            given speaker land in the same split.
        """
        random.seed(seed)

        # Group every sample by its speaker id across all input splits.
        speaker_samples = defaultdict(list)
        for split, path in json_paths.items():
            with open(path, 'r') as f:
                data = json.load(f)
            for sample in data:
                speaker_samples[sample['speaker']['id']].append(sample)

        all_speakers = list(speaker_samples.keys())
        random.shuffle(all_speakers)

        # Number of speakers per split, truncating fractional counts.
        total_speakers = len(all_speakers)
        split_speakers = {
            split: int(ratio * total_speakers)
            for split, ratio in split_ratios.items()
        }

        # int() truncation can leave speakers unassigned; give them to the first split.
        remainder = total_speakers - sum(split_speakers.values())
        if remainder > 0:
            split_speakers[list(split_speakers.keys())[0]] += remainder

        # Hand out contiguous runs of shuffled speakers to each split.
        new_splits = defaultdict(list)
        current_idx = 0
        for split, num_speakers in split_speakers.items():
            for speaker_id in all_speakers[current_idx:current_idx + num_speakers]:
                new_splits[split].extend(speaker_samples[speaker_id])
            current_idx += num_speakers

        return new_splits

    @staticmethod
    def save_splits(splits: Dict[str, List[Dict]], output_dir: str):
        """Save redistributed splits to JSON files under `output_dir`."""
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        for split_name, samples in splits.items():
            output_path = output_dir / f"tears_dataset_{split_name}_with_responses.json"
            with open(output_path, 'w') as f:
                json.dump(samples, f, indent=2)
|
| 232 |
+
|
load_data/timit.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch.utils.data import Dataset
|
| 3 |
+
import json
|
| 4 |
+
import torchaudio
|
| 5 |
+
import os
|
| 6 |
+
from typing import Optional, Dict, Any, List, Tuple
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import warnings
|
| 9 |
+
import random
|
| 10 |
+
|
| 11 |
+
class TIMITDataset(Dataset):
    """
    TIMIT dataset yielding a 3-second audio crop plus a randomly chosen
    (prompt, response) pair for the utterance's speaker.

    Args:
        json_path (str): Path to the JSON file containing TIMIT data.
        timit_root (str): Root directory containing TIMIT audio files.
        sample_rate (int, optional): Target sample rate for audio. Defaults to 16000.
        normalize_audio (bool, optional): Whether to apply mean/variance
            normalization. Defaults to True.

    Each item is a dict containing (the original docstring also advertised
    'metadata', 'phonemes', 'words' and 'text' keys that were never returned):
        - 'audio_tensor': torch.Tensor of shape (1, 3 * sample_rate)
        - 'sid': str, speaker identifier
        - 'prompt': str or None, randomly selected prompt
        - 'answer': str or None, corresponding response (newlines stripped)
        - 'filename': str, path to the audio file
    """

    def __init__(
        self,
        json_path: str,
        timit_root: str,
        sample_rate: int = 16000,
        normalize_audio: bool = True
    ):
        super().__init__()

        # Load the whole JSON manifest once up front.
        with open(json_path, 'r') as f:
            self.data = json.load(f)

        self.timit_root = timit_root
        self.sample_rate = sample_rate
        self.normalize_audio = normalize_audio

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        sample = self.data[idx]
        audio_path = os.path.join(self.timit_root, sample['audio_path'])

        # Load audio first, then resample to the configured rate if needed.
        audio, sr = torchaudio.load(audio_path)
        if sr != self.sample_rate:
            audio = torchaudio.transforms.Resample(sr, self.sample_rate)(audio)

        # Fix: the flag was stored but normalization previously ran
        # unconditionally, making `normalize_audio=False` a no-op.
        if self.normalize_audio:
            mean = torch.mean(audio)
            std = torch.std(audio)
            audio = (audio - mean) / (std + 1e-8)

        num_samples = audio.shape[1]
        num_samples_3s = 3 * self.sample_rate  # samples in a 3-second window

        # Random 3-second crop, or right-pad with zeros if the clip is shorter.
        if num_samples >= num_samples_3s:
            start_sample = random.randint(0, num_samples - num_samples_3s)
            end_sample = start_sample + num_samples_3s
            audio = audio[:, start_sample:end_sample]
        else:
            pad_size = num_samples_3s - num_samples
            audio = torch.nn.functional.pad(audio, (0, pad_size))

        # Pick one aligned (prompt, response) pair at random.
        prompts = sample.get('prompts', [])
        answers = sample.get('responses', [])

        if prompts and answers and len(prompts) == len(answers):
            rand_idx = random.randint(0, len(prompts) - 1)
            prompt = prompts[rand_idx]
            answer = answers[rand_idx].replace("\n", " ").strip()  # clean response
        else:
            prompt = None
            answer = None

        return {
            'audio_tensor': audio,
            'sid': sample['speaker']['id'],
            'prompt': prompt,
            'answer': answer,
            'filename': audio_path,
        }
|
load_data/voxceleb.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch.utils.data import Dataset
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import torchaudio
|
| 5 |
+
import random
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
class ZeroShotDataset(Dataset):
    """Zero-shot evaluation dataset backed by a CSV with File_Path, Prompt
    and Ground_Truth columns.
    """

    # Original hard-coded cluster path, kept as the default so existing
    # callers are unaffected.
    DEFAULT_ROOT = "/ocean/projects/cis220031p/psamal/preprocess_TIMIT/"

    def __init__(self, csv_path, transform=None, root=None):
        """
        Args:
            csv_path (str): Path to the CSV file.
            transform (callable, optional): Optional transform applied to the audio.
            root (str, optional): Root directory prepended to File_Path values.
                Defaults to the previously hard-coded cluster path.
        """
        self.data = pd.read_csv(csv_path)
        self.transform = transform
        self.sample_rate = 16000
        # Fix: the root was hard-coded inside __getitem__; now configurable.
        self.root = root if root is not None else self.DEFAULT_ROOT

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Load audio file
        audio, sr = torchaudio.load(os.path.join(self.root, row["File_Path"]))

        # Apply transformation if provided (note: runs on the raw-rate audio,
        # before resampling — preserved from the original ordering).
        if self.transform:
            audio = self.transform(audio)

        # Resample to the target rate if needed.
        if sr != self.sample_rate:
            audio = torchaudio.transforms.Resample(sr, self.sample_rate)(audio)

        # Per-utterance mean/variance normalization.
        mean = torch.mean(audio)
        std = torch.std(audio)
        audio = (audio - mean) / (std + 1e-8)

        num_samples = audio.shape[1]
        num_samples_3s = 3 * self.sample_rate  # samples in a 3-second window

        # Random 3-second crop, or right-pad with zeros if shorter.
        if num_samples >= num_samples_3s:
            start_sample = random.randint(0, num_samples - num_samples_3s)
            end_sample = start_sample + num_samples_3s
            audio = audio[:, start_sample:end_sample]
        else:
            pad_size = num_samples_3s - num_samples
            audio = torch.nn.functional.pad(audio, (0, pad_size))

        return {
            "sid": "WBT0",  # fixed speaker id used for zero-shot evaluation
            "audio_tensor": audio,
            "answer": row["Ground_Truth"],
            "prompt": row["Prompt"],
            # "prompt": random.choice(["What is the dialect of the person?", "Based on the voice of the person, please specify the dialect of the person?", row["Prompt"]]),
            'filename': row["File_Path"],
        }
|
mapper.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from torch.nn import functional as nnf
|
| 6 |
+
from typing import Tuple, Optional
|
| 7 |
+
|
| 8 |
+
def get_sid_mapper(map_type: str, emb_size, prefix_size: int, gpt_embedding_size: int, prefix_length: int, clip_length: int, num_layers: int):
    """Build the speaker-id -> GPT-prefix mapper.

    'mlp' yields a 3-layer MLP expanding to prefix_length GPT embeddings;
    'transformer' yields a TransformerMapper with half the requested layers.
    All mapper parameters are marked trainable before returning.
    """
    if map_type == 'mlp':
        flat_out = gpt_embedding_size * prefix_length
        mapper = MLP(emb_size, (prefix_size, flat_out // 2, flat_out))
    elif map_type == 'transformer':
        mapper = TransformerMapper(emb_size, prefix_size, gpt_embedding_size,
                                   prefix_length, clip_length, int(num_layers / 2))
    else:
        raise ValueError(f"Unknown mapping type {map_type}")

    # Mapper weights are always trained, even if the decoder is frozen elsewhere.
    for param in mapper.parameters():
        param.requires_grad = True

    return mapper
|
| 23 |
+
|
| 24 |
+
def get_text_mapper(map_type: str, emb_size, prefix_size: int, gpt_embedding_size: int, prefix_length: int, clip_length: int, num_layers: int):
    """Build the text -> GPT-prefix mapper.

    'mlp' yields a 3-layer MLP expanding to prefix_length GPT embeddings;
    'transformer' yields a TransformerMapperSeq with half the requested layers.
    All mapper parameters are marked trainable before returning.
    """
    if map_type == 'mlp':
        flat_out = gpt_embedding_size * prefix_length
        mapper = MLP(emb_size, (prefix_size, flat_out // 2, flat_out))
    elif map_type == 'transformer':
        mapper = TransformerMapperSeq(emb_size, prefix_size, gpt_embedding_size,
                                      prefix_length, clip_length, int(num_layers / 2))
    else:
        raise ValueError(f"Unknown mapping type {map_type}")

    # Mapper weights are always trained, even if the decoder is frozen elsewhere.
    for param in mapper.parameters():
        param.requires_grad = True

    return mapper
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def init_layer(layer):
    """Xavier-initialize a Linear or Convolutional layer; zero its bias if present."""
    nn.init.xavier_uniform_(layer.weight)

    # Layers built with bias=False expose bias=None; skip those.
    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias.data.fill_(0.)
|
| 48 |
+
|
| 49 |
+
def init_bn(bn):
    """Initialize a Batchnorm layer to the identity affine transform."""
    # weight (scale) -> 1, bias (shift) -> 0; order of the two fills is irrelevant.
    bn.weight.data.fill_(1.)
    bn.bias.data.fill_(0.)
|
| 53 |
+
|
| 54 |
+
class Projection(nn.Module):
    """Two-layer projection head: linear, GELU-gated refinement with dropout,
    residual add, then LayerNorm.
    """

    def __init__(self, d_in: int, d_out: int, p: float = 0.5) -> None:
        super().__init__()
        self.linear1 = nn.Linear(d_in, d_out, bias=False)
        self.linear2 = nn.Linear(d_out, d_out, bias=False)
        self.layer_norm = nn.LayerNorm(d_out)
        self.drop = nn.Dropout(p)

        self.init_weight()

    def init_weight(self):
        """Xavier-init both linears; reset the LayerNorm affine params to identity."""
        for linear in (self.linear1, self.linear2):
            init_layer(linear)
        # init_bn works on any module with .weight/.bias, including LayerNorm.
        init_bn(self.layer_norm)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        projected = self.linear1(x)
        refined = self.drop(self.linear2(nnf.gelu(projected)))
        return self.layer_norm(projected + refined)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class MLP(nn.Module):
    """Plain feed-forward stack: Linear layers with an activation between all
    but the final pair (no activation after the output layer).
    """

    def __init__(self, emb_size, sizes: Tuple[int, ...], bias=True, act=nn.Tanh):
        super(MLP, self).__init__()
        # Kept for interface compatibility; not used by the current forward.
        self.emb_size = emb_size
        modules = []
        last_hidden = len(sizes) - 2
        for i, (d_in, d_out) in enumerate(zip(sizes[:-1], sizes[1:])):
            modules.append(nn.Linear(d_in, d_out, bias=bias))
            if i < last_hidden:
                modules.append(act())
        self.model = nn.Sequential(*modules)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
class MlpTransformer(nn.Module):
    """Transformer feed-forward sub-block: Linear -> act -> dropout -> Linear -> dropout."""

    def __init__(self, in_dim, h_dim, out_d: Optional[int] = None, act=nnf.relu, dropout=0.):
        super().__init__()
        # Output width defaults to the input width (standard FFN shape).
        self.fc1 = nn.Linear(in_dim, h_dim)
        self.act = act
        self.fc2 = nn.Linear(h_dim, in_dim if out_d is None else out_d)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = self.dropout(self.act(self.fc1(x)))
        return self.dropout(self.fc2(hidden))
|
| 111 |
+
|
| 112 |
+
class MultiHeadAttention(nn.Module):
    """Multi-head (self- or cross-) attention with a fused key/value projection.

    `dim_self` is the query/output width; `dim_ref` is the width of the
    reference sequence being attended over (equal to `dim_self` for
    self-attention).
    """

    def __init__(self, dim_self, dim_ref, num_heads, bias=True, dropout=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim_self // num_heads
        # 1/sqrt(head_dim) scaling applied to the attention logits.
        self.scale = head_dim ** -0.5
        self.to_queries = nn.Linear(dim_self, dim_self, bias=bias)
        # Single projection producing keys and values, stacked in the output dim.
        self.to_keys_values = nn.Linear(dim_ref, dim_self * 2, bias=bias)
        self.project = nn.Linear(dim_self, dim_self)
        # NOTE(review): this dropout is constructed but never applied in
        # forward() — confirm whether attention dropout was intended.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, y=None, mask=None):
        """Attend from `x` (b, n, dim_self) to `y` (b, m, dim_ref).

        Returns:
            out: (b, n, dim_self) attended, projected context.
            attention: (b, n, m, heads) post-softmax attention weights.
        """
        # Self-attention when no reference sequence is given.
        y = y if y is not None else x
        b, n, c = x.shape
        _, m, d = y.shape
        # b n h dh
        queries = self.to_queries(x).reshape(b, n, self.num_heads, c // self.num_heads)
        # b m 2 h dh
        keys_values = self.to_keys_values(y).reshape(b, m, 2, self.num_heads, c // self.num_heads)
        keys, values = keys_values[:, :, 0], keys_values[:, :, 1]
        # Scaled dot-product logits per head: (b, n, m, h).
        attention = torch.einsum('bnhd,bmhd->bnmh', queries, keys) * self.scale
        if mask is not None:
            if mask.dim() == 2:
                # Broadcast a (b, m) key-padding mask over the query dimension.
                mask = mask.unsqueeze(1)
            # True positions in the mask receive -inf, i.e. zero attention weight.
            attention = attention.masked_fill(mask.unsqueeze(3), float("-inf"))
        # Normalize over the reference (m) dimension.
        attention = attention.softmax(dim=2)
        out = torch.einsum('bnmh,bmhd->bnhd', attention, values).reshape(b, n, c)
        out = self.project(out)
        return out, attention
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class TransformerLayer(nn.Module):
    """Pre-norm transformer block: residual attention followed by a residual MLP."""

    def __init__(self, dim_self, dim_ref, num_heads, mlp_ratio=4., bias=False, dropout=0., act=nnf.relu,
                 norm_layer: nn.Module = nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim_self)
        self.attn = MultiHeadAttention(dim_self, dim_ref, num_heads, bias=bias, dropout=dropout)
        self.norm2 = norm_layer(dim_self)
        self.mlp = MlpTransformer(dim_self, int(dim_self * mlp_ratio), act=act, dropout=dropout)

    def forward(self, x, y=None, mask=None):
        attended, _ = self.attn(self.norm1(x), y, mask)
        x = x + attended
        return x + self.mlp(self.norm2(x))

    def forward_with_attention(self, x, y=None, mask=None):
        """Same as forward(), but also returns the attention weights."""
        attended, attention = self.attn(self.norm1(x), y, mask)
        x = x + attended
        x = x + self.mlp(self.norm2(x))
        return x, attention
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
class Transformer(nn.Module):
    """Stack of TransformerLayer blocks.

    With `enc_dec=True` the stack doubles in depth and alternates
    cross-attention (even indices, attending to `y`) with self-attention
    (odd indices). Otherwise every layer receives (x, y, mask) directly.
    """

    def __init__(self, dim_self: int, num_heads: int, num_layers: int, dim_ref: Optional[int] = None,
                 mlp_ratio: float = 2., act=nnf.relu, norm_layer: nn.Module = nn.LayerNorm, enc_dec: bool = False):
        super(Transformer, self).__init__()
        dim_ref = dim_ref if dim_ref is not None else dim_self
        self.enc_dec = enc_dec
        if enc_dec:
            # Each logical layer becomes a (cross, self) pair.
            num_layers = num_layers * 2
        blocks = []
        for i in range(num_layers):
            if enc_dec and i % 2 == 1:
                # Self-attention layer of an enc-dec pair.
                blocks.append(TransformerLayer(dim_self, dim_self, num_heads, mlp_ratio, act=act, norm_layer=norm_layer))
            else:
                # Cross-attention layer of an enc-dec pair, or any plain-stack layer.
                blocks.append(TransformerLayer(dim_self, dim_ref, num_heads, mlp_ratio, act=act, norm_layer=norm_layer))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, y=None, mask=None):
        for i, layer in enumerate(self.layers):
            if not self.enc_dec:
                x = layer(x, y, mask)
            elif i % 2 == 0:
                # Cross-attention step (original code passes no mask here).
                x = layer(x, y)
            else:
                x = layer(x, x, mask)
        return x

    def forward_with_attention(self, x, y=None, mask=None):
        """Run all layers, collecting each layer's attention weights."""
        attentions = []
        for layer in self.layers:
            x, att = layer.forward_with_attention(x, y, mask)
            attentions.append(att)
        return x, attentions
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
class TransformerMapper(nn.Module):
    """Map a single embedding to `prefix_length` GPT prefix tokens via a transformer.

    Fix: forward() referenced self.projector, but its construction had been
    commented out of __init__, so any non-None `emb_size` raised
    AttributeError. The Projection is now built whenever emb_size is given,
    matching the commented-out intent and the sibling mappers.
    """

    def __init__(self, emb_size, dim_clip: int, dim_embedding: int, prefix_length: int, clip_length: int, num_layers: int = 8):
        super(TransformerMapper, self).__init__()
        self.emb_size = emb_size
        if self.emb_size is not None:
            # Project the raw embedding into the expected clip dimension.
            self.projector = Projection(emb_size, dim_clip)
        self.clip_length = clip_length
        self.transformer = Transformer(dim_embedding, 8, num_layers)
        # Expands the (projected) embedding into clip_length transformer inputs.
        self.linear = nn.Linear(dim_clip, clip_length * dim_embedding)
        # Learned prefix queries whose transformed values become the output prefix.
        self.prefix_const = nn.Parameter(torch.randn(prefix_length, dim_embedding), requires_grad=True)

    def forward(self, x):
        """x: (batch, dim_clip) — or (batch, emb_size) when a projector is used.

        Returns (batch, prefix_length, dim_embedding).
        """
        if self.emb_size is not None:
            x = self.projector(x)
        x = self.linear(x).view(x.shape[0], self.clip_length, -1)
        prefix = self.prefix_const.unsqueeze(0).expand(x.shape[0], *self.prefix_const.shape)
        # Concatenate inputs and learned queries; keep only the query outputs.
        prefix = torch.cat((x, prefix), dim=1)  # batch x (clip_length + prefix_length) x dim
        out = self.transformer(prefix)[:, self.clip_length:]
        return out
|
| 224 |
+
|
| 225 |
+
class TransformerMapperSeq(nn.Module):
    """Map a sequence embedding to `prefix_length` GPT prefix tokens via a transformer."""

    def __init__(self, emb_size, dim_clip: int, dim_embedding: int, prefix_length: int, clip_length: int, num_layers: int = 8):
        super(TransformerMapperSeq, self).__init__()
        # Kept for interface compatibility; no projection is applied here.
        self.emb_size = emb_size
        self.clip_length = clip_length
        self.transformer = Transformer(dim_embedding, 8, num_layers)
        # Learned prefix queries whose transformed values become the output prefix.
        self.prefix_const = nn.Parameter(torch.randn(prefix_length, dim_embedding), requires_grad=True)

    def forward(self, x):
        """Reshape x to (batch, clip_length, -1), append the learned queries,
        run the transformer, and return only the query outputs:
        (batch, prefix_length, dim_embedding).
        """
        batch = x.shape[0]
        tokens = x.view(batch, self.clip_length, -1)
        queries = self.prefix_const.unsqueeze(0).expand(batch, *self.prefix_const.shape)
        stacked = torch.cat((tokens, queries), dim=1)
        return self.transformer(stacked)[:, self.clip_length:]
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e0d80efbeffb56f4038bf9d320d15b5377d12b1cb85833e908d9f0f6b5c2bbab
|
| 3 |
+
size 2066033810
|
wrapper.py
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from transformers import AutoTokenizer
|
| 3 |
+
import os
|
| 4 |
+
import torch
|
| 5 |
+
from collections import OrderedDict
|
| 6 |
+
import librosa
|
| 7 |
+
from importlib_resources import files
|
| 8 |
+
import yaml
|
| 9 |
+
import argparse
|
| 10 |
+
import torchaudio
|
| 11 |
+
import torchaudio.transforms as T
|
| 12 |
+
import collections
|
| 13 |
+
import random
|
| 14 |
+
import numpy as np
|
| 15 |
+
import torch
|
| 16 |
+
import torch.nn as nn
|
| 17 |
+
import torch.nn.functional as F
|
| 18 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 19 |
+
import logging
|
| 20 |
+
from glob import glob
|
| 21 |
+
|
| 22 |
+
from mapper import get_sid_mapper, get_text_mapper
|
| 23 |
+
from transformers import GPT2LMHeadModel
|
| 24 |
+
from transformers import AutoTokenizer
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class ExpWrapper():
    """Wrapper tying together the GPT-2 text decoder, the speaker-ID-to-prefix
    mapper and the tokenizer used for speaker-conditioned caption generation.

    Holds the models on a single GPU (``gpu_id``); DDP wrapping of the mapper
    is done separately via :meth:`init_mapper`.
    """

    def __init__(self, config_wrapper, gpu_id):
        """Build decoder, SID mapper and tokenizer from a config dict.

        Args:
            config_wrapper: dict with keys ``tok_len``, ``text_prefix_length``,
                ``sid_prefix_length``, ``norm_sid_emb``, ``text_decoder``
                (HF model name/path), ``map_type``, ``prefix_size``,
                ``sid_prefix_length_clip`` and ``num_layers``.
            gpu_id: device index (or device string) the models are moved to.
        """
        self.tok_len = config_wrapper['tok_len']
        self.text_prefix_length = config_wrapper['text_prefix_length']
        self.sid_prefix_length = config_wrapper['sid_prefix_length']
        self.norm_sid_emb = config_wrapper['norm_sid_emb']
        self.gpu_id = gpu_id
        # Pretrained GPT-2 LM head used as the caption decoder.
        self.gpt = GPT2LMHeadModel.from_pretrained(config_wrapper['text_decoder'])
        self.gpt = self.gpt.to(self.gpu_id)
        # for param in self.gpt.parameters():
        #     param.requires_grad = False

        # Width of GPT-2's token-embedding table; the mapper projects into it.
        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]

        self.sid_mapper = get_sid_mapper(config_wrapper["map_type"],None,
                        config_wrapper["prefix_size"], self.gpt_embedding_size,
                        config_wrapper["sid_prefix_length"], config_wrapper["sid_prefix_length_clip"],
                        config_wrapper["num_layers"])


        # self.text_mapper = get_text_mapper(config_wrapper["map_type"], None,
        #                 config_wrapper["prefix_size"], self.gpt_embedding_size,
        #                 config_wrapper["text_prefix_length"], config_wrapper["text_prefix_length_clip"],
        #                 config_wrapper["num_layers"])
        # # this is temporary
        # if config_wrapper["checkpoint_path"]:
        #     checkpoint = torch.load(config_wrapper["checkpoint_path"])
        #     state_dict = checkpoint['model']
        #     text_project_weights = {k.replace('caption_decoder.text_project.',''): v for k, v in state_dict.items()
        #                     if 'caption_decoder.text_project' in k}
        #     self.text_mapper.load_state_dict(text_project_weights)

        self.sid_mapper = self.sid_mapper.to(self.gpu_id)
        # self.text_mapper = self.text_mapper.to(self.gpu_id)
        self.tokenizer = AutoTokenizer.from_pretrained(config_wrapper['text_decoder'])
        # GPT-2 has no pad token; '!' (token id 0) is repurposed as padding.
        self.tokenizer.add_special_tokens({'pad_token': '!'})
|
| 63 |
+
|
| 64 |
+
def init_mapper(self):
|
| 65 |
+
self.sid_mapper = DDP(self.sid_mapper, device_ids=[self.gpu_id], find_unused_parameters=True)
|
| 66 |
+
|
| 67 |
+
def freeze_llm(self):
|
| 68 |
+
for param in self.sid_mapper.parameters():
|
| 69 |
+
param.requires_grad = False
|
| 70 |
+
for param in self.gpt.parameters():
|
| 71 |
+
param.requires_grad = False
|
| 72 |
+
|
| 73 |
+
def default_collate(self, batch):
|
| 74 |
+
r"""Puts each data field into a tensor with outer dimension batch size"""
|
| 75 |
+
elem = batch[0]
|
| 76 |
+
elem_type = type(elem)
|
| 77 |
+
if isinstance(elem, torch.Tensor):
|
| 78 |
+
out = None
|
| 79 |
+
if torch.utils.data.get_worker_info() is not None:
|
| 80 |
+
# If we're in a background process, concatenate directly into a
|
| 81 |
+
# shared memory tensor to avoid an extra copy
|
| 82 |
+
numel = sum([x.numel() for x in batch])
|
| 83 |
+
storage = elem.storage()._new_shared(numel)
|
| 84 |
+
out = elem.new(storage)
|
| 85 |
+
return torch.stack(batch, 0, out=out)
|
| 86 |
+
elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
|
| 87 |
+
and elem_type.__name__ != 'string_':
|
| 88 |
+
if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
|
| 89 |
+
# array of string classes and object
|
| 90 |
+
if self.np_str_obj_array_pattern.search(elem.dtype.str) is not None:
|
| 91 |
+
raise TypeError(
|
| 92 |
+
self.default_collate_err_msg_format.format(elem.dtype))
|
| 93 |
+
|
| 94 |
+
return self.default_collate([torch.as_tensor(b) for b in batch])
|
| 95 |
+
elif elem.shape == (): # scalars
|
| 96 |
+
return torch.as_tensor(batch)
|
| 97 |
+
elif isinstance(elem, float):
|
| 98 |
+
return torch.tensor(batch, dtype=torch.float64)
|
| 99 |
+
elif isinstance(elem, int):
|
| 100 |
+
return torch.tensor(batch)
|
| 101 |
+
elif isinstance(elem, collections.abc.Mapping):
|
| 102 |
+
return {key: self.default_collate([d[key] for d in batch]) for key in elem}
|
| 103 |
+
elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
|
| 104 |
+
return elem_type(*(self.default_collate(samples) for samples in zip(*batch)))
|
| 105 |
+
elif isinstance(elem, collections.abc.Sequence):
|
| 106 |
+
# check to make sure that the elements in batch have consistent size
|
| 107 |
+
it = iter(batch)
|
| 108 |
+
elem_size = len(next(it))
|
| 109 |
+
if not all(len(elem) == elem_size for elem in it):
|
| 110 |
+
raise RuntimeError(
|
| 111 |
+
'each element in list of batch should be of equal size')
|
| 112 |
+
transposed = zip(*batch)
|
| 113 |
+
return [self.default_collate(samples) for samples in transposed]
|
| 114 |
+
|
| 115 |
+
raise TypeError(self.default_collate_err_msg_format.format(elem_type))
|
| 116 |
+
|
| 117 |
+
def load_model(self, st, model):
|
| 118 |
+
try:
|
| 119 |
+
model.load_state_dict(st)
|
| 120 |
+
except:
|
| 121 |
+
for key in list(st.keys()):
|
| 122 |
+
if "module." in key:
|
| 123 |
+
st[key.replace("module.", "")] = st.pop(key)
|
| 124 |
+
model.load_state_dict(st)
|
| 125 |
+
return model
|
| 126 |
+
|
| 127 |
+
def load_model(self, st, model):
|
| 128 |
+
try:
|
| 129 |
+
model.load_state_dict(st)
|
| 130 |
+
except:
|
| 131 |
+
for key in list(st.keys()):
|
| 132 |
+
if "module." in key:
|
| 133 |
+
st[key.replace("module.", "")] = st.pop(key)
|
| 134 |
+
model.load_state_dict(st)
|
| 135 |
+
return model
|
| 136 |
+
|
| 137 |
+
def load_sid_model(self, sid_model, snapshot_path, sid_ck_name):
|
| 138 |
+
loc = f"cuda:{self.gpu_id}"
|
| 139 |
+
# sid_model_path = sorted(glob(f"{snapshot_path}/sid_model_epoch_*.pt"),
|
| 140 |
+
# key=lambda x: float(x.split('_')[-1].replace('.pt', '')))[0]
|
| 141 |
+
sid_model_path = f"{snapshot_path}/{sid_ck_name}"
|
| 142 |
+
snapshot = torch.load(sid_model_path, map_location=loc)
|
| 143 |
+
sid_model = self.load_model(snapshot["sid_model"], sid_model)
|
| 144 |
+
best_val_loss = snapshot["val_loss"]
|
| 145 |
+
epochs_run = snapshot["epochs_run"]
|
| 146 |
+
|
| 147 |
+
def load_mapper(self, snapshot_path, mapper_ck_name):
|
| 148 |
+
loc = f"cuda:{self.gpu_id}"
|
| 149 |
+
mapper_path = sorted(glob(f"{snapshot_path}/mapper_*.pt"))[-1]
|
| 150 |
+
mapper_path = f"{snapshot_path}/{mapper_ck_name}"
|
| 151 |
+
snapshot = torch.load(mapper_path, map_location=loc)
|
| 152 |
+
|
| 153 |
+
self.sid_mapper = self.load_model(snapshot["sid_mapper"],self.sid_mapper)
|
| 154 |
+
# self.text_mapper = self.load_model(snapshot["text_mapper"],self.text_mapper)
|
| 155 |
+
|
| 156 |
+
self.epochs_run = snapshot["epochs_run"]
|
| 157 |
+
logging.info(f"Resuming training from mapper at Epoch {self.epochs_run}")
|
| 158 |
+
|
| 159 |
+
def save_mapper(self, epoch, snapshot_path, val_epoch_ce_llm):
|
| 160 |
+
mapper = {
|
| 161 |
+
# "text_mapper": self.text_mapper.state_dict(),
|
| 162 |
+
"sid_mapper": self.sid_mapper.state_dict(),
|
| 163 |
+
"epochs_run": epoch,
|
| 164 |
+
}
|
| 165 |
+
part = snapshot_path
|
| 166 |
+
torch.save(mapper, f"{part}/unfrozen_mapper_epoch_{str(epoch).zfill(4)}_val_epoch_ce_llm_{val_epoch_ce_llm}.pt")
|
| 167 |
+
logging.info(f"Epoch {epoch} | Training mapper saved at {snapshot_path}")
|
| 168 |
+
|
| 169 |
+
def preprocess_prompt(self, texts): # true false
|
| 170 |
+
r"""Load list of prompts and return tokenized text"""
|
| 171 |
+
tokenized_texts = []
|
| 172 |
+
for ttext in texts:
|
| 173 |
+
tok = self.tokenizer.encode_plus(
|
| 174 |
+
text=ttext, add_special_tokens=True,
|
| 175 |
+
max_length=10,
|
| 176 |
+
pad_to_max_length=True, return_tensors="pt", truncation=True)
|
| 177 |
+
for key in tok.keys():
|
| 178 |
+
tok[key] = tok[key].reshape(-1).to(self.gpu_id)
|
| 179 |
+
tokenized_texts.append(tok)
|
| 180 |
+
return self.default_collate(tokenized_texts)
|
| 181 |
+
|
| 182 |
+
def preprocess_prompt_single(self, texts): # true false
|
| 183 |
+
r"""Load list of prompts and return tokenized text"""
|
| 184 |
+
tokenized_texts = []
|
| 185 |
+
tok = self.tokenizer.encode_plus(
|
| 186 |
+
text=texts, add_special_tokens=True,
|
| 187 |
+
max_length=10,
|
| 188 |
+
pad_to_max_length=True, return_tensors="pt", truncation=True)
|
| 189 |
+
for key in tok.keys():
|
| 190 |
+
tok[key] = tok[key].reshape(-1).to(self.gpu_id)
|
| 191 |
+
tokenized_texts.append(tok)
|
| 192 |
+
return self.default_collate(tokenized_texts)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def preprocess_text(self, texts): # true false
|
| 196 |
+
r"""Load list of prompts and return tokenized text"""
|
| 197 |
+
tokenized_texts = []
|
| 198 |
+
for ttext in texts:
|
| 199 |
+
ttext = ttext + ' <|endoftext|>'
|
| 200 |
+
tok = self.tokenizer.encode_plus(
|
| 201 |
+
text=ttext, add_special_tokens=True,
|
| 202 |
+
max_length=self.tok_len,
|
| 203 |
+
pad_to_max_length=True, return_tensors="pt", truncation=True)
|
| 204 |
+
for key in tok.keys():
|
| 205 |
+
tok[key] = tok[key].reshape(-1).to(self.gpu_id)
|
| 206 |
+
tokenized_texts.append(tok)
|
| 207 |
+
return self.default_collate(tokenized_texts)
|
| 208 |
+
|
| 209 |
+
def _get_text_embeddings(self, preprocessed_texts):
|
| 210 |
+
r"""Load preprocessed prompts and return a prompt embeddings"""
|
| 211 |
+
with torch.no_grad():
|
| 212 |
+
texts_embed = self.gpt.transformer.wte(preprocessed_texts['input_ids'])
|
| 213 |
+
return texts_embed
|
| 214 |
+
|
| 215 |
+
def get_sid_prefix(self, sid_embeddings):
|
| 216 |
+
r"""Produces audio embedding which is fed to LM"""
|
| 217 |
+
if self.norm_sid_emb:
|
| 218 |
+
sid_embeddings = sid_embeddings / sid_embeddings.norm(2, -1).reshape(-1,1)
|
| 219 |
+
|
| 220 |
+
# raise SystemError(sid_embeddings.shape) # torch.Size([2, 1024])
|
| 221 |
+
sids_prefix = self.sid_mapper(sid_embeddings).contiguous().view(-1, self.sid_prefix_length, self.gpt_embedding_size)
|
| 222 |
+
# raise SystemError(sids_prefix.shape) # torch.Size([2, 40, 768]) batch_size, seq_len, embed_size
|
| 223 |
+
return sids_prefix
|
| 224 |
+
|
| 225 |
+
def get_prompt_prefix(self, texts):
|
| 226 |
+
r"""Load list of text prompts and return prompt prefix and prompt embeddings"""
|
| 227 |
+
preprocessed_texts = self.preprocess_prompt(texts)
|
| 228 |
+
print(preprocessed_texts)
|
| 229 |
+
texts_embed = self._get_text_embeddings(preprocessed_texts)
|
| 230 |
+
return texts_embed, preprocessed_texts
|
| 231 |
+
def get_prompt_prefix_single(self, texts):
|
| 232 |
+
r"""Load list of text prompts and return prompt prefix and prompt embeddings"""
|
| 233 |
+
preprocessed_texts = self.preprocess_prompt_single(texts)
|
| 234 |
+
texts_embed = self._get_text_embeddings(preprocessed_texts)
|
| 235 |
+
return texts_embed, preprocessed_texts
|
| 236 |
+
|
| 237 |
+
def get_text_prefix(self, texts):
|
| 238 |
+
r"""Load list of text prompts and return prompt prefix and prompt embeddings"""
|
| 239 |
+
preprocessed_texts = self.preprocess_text(texts)
|
| 240 |
+
texts_embed = self._get_text_embeddings(preprocessed_texts)
|
| 241 |
+
return texts_embed, preprocessed_texts
|
| 242 |
+
|
| 243 |
+
    def generate_beam(self, beam_size: int = 1, sids_prefix=None, entry_length=80, temperature=1., stop_token: str = ' <|endoftext|>'):
        """Beam-search decode captions from a speaker-ID prefix embedding.

        Args:
            beam_size: number of beams kept at each step.
            sids_prefix: prefix embeddings fed as ``inputs_embeds`` to GPT-2.
                Assumed shape (1, prefix_len, gpt_embedding_size) — TODO
                confirm against callers; passing None (the default) would
                crash inside ``self.gpt``.
            entry_length: maximum number of tokens generated per beam.
            temperature: logit temperature; values <= 0 fall back to 1.0.
            stop_token: string whose first token id terminates a beam.

        Returns:
            List of decoded strings, sorted by average log-probability
            (best first).
        """
        stop_token_index = self.tokenizer.encode(stop_token)[0]
        tokens = None
        scores = None
        device = next(self.gpt.parameters()).device
        # Per-beam generated length; used to length-normalise scores.
        seq_lengths = torch.ones(beam_size, device=device)
        # Per-beam flag: True once a beam has emitted the stop token.
        is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
        with torch.no_grad():
            generated = sids_prefix # sid embedding
            for i in range(entry_length):
                outputs = self.gpt(inputs_embeds=generated)
                logits = outputs.logits
                # Last-position logits only; temperature <= 0 means no scaling.
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
                # Log-probabilities (log-softmax via softmax().log()).
                logits = logits.softmax(-1).log()
                if scores is None:
                    # First step: fan the single prefix out into beam_size beams.
                    scores, next_tokens = logits.topk(beam_size, -1)
                    generated = generated.expand(beam_size, *generated.shape[1:])
                    next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
                    if tokens is None:
                        tokens = next_tokens
                    else:
                        tokens = tokens.expand(beam_size, *tokens.shape[1:])
                        tokens = torch.cat((tokens, next_tokens), dim=1)
                else:
                    # Finished beams contribute only their existing score:
                    # force all their continuations to -inf except token 0.
                    logits[is_stopped] = -float(np.inf)
                    logits[is_stopped, 0] = 0
                    # Candidate scores for every (beam, vocab) continuation.
                    scores_sum = scores[:, None] + logits
                    seq_lengths[~is_stopped] += 1
                    # Length-normalised scores; top-k over the flattened
                    # (beam * vocab) candidates selects the new beams.
                    scores_sum_average = scores_sum / seq_lengths[:, None]
                    scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
                    # Recover source beam and vocab id from the flat index.
                    # NOTE(review): `//` on tensors triggered a floor_divide
                    # deprecation warning in some torch versions — verify
                    # against the pinned torch release.
                    next_tokens_source = next_tokens // scores_sum.shape[1]
                    seq_lengths = seq_lengths[next_tokens_source]
                    next_tokens = next_tokens % scores_sum.shape[1]
                    next_tokens = next_tokens.unsqueeze(1)
                    # Re-gather all per-beam state to follow the chosen beams.
                    tokens = tokens[next_tokens_source]
                    tokens = torch.cat((tokens, next_tokens), dim=1)
                    generated = generated[next_tokens_source]
                    # Store un-normalised scores again (avg * length).
                    scores = scores_sum_average * seq_lengths
                    is_stopped = is_stopped[next_tokens_source]

                # Append the chosen tokens' embeddings to the running inputs.
                next_token_embed = self.gpt.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1)
                generated = torch.cat((generated, next_token_embed), dim=1)
                # Bool + bool acts as logical OR here.
                is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
                if is_stopped.all():
                    break
        # Final ranking uses length-normalised scores.
        scores = scores / seq_lengths
        output_list = tokens.cpu().numpy()
        ############ Shuo added for attn plot ###########
        # token_list = []
        # text_list = []
        # for output, length in zip(output_list, seq_lengths):
        #     for item in output[:int(length)]:
        #         token_list.append(item)
        #         text_list.append(self.tokenizer.decode(item))
        ############ Shuo added for attn plot ###########
        output_texts = [self.tokenizer.decode(output[:int(length)]) for output, length in zip(output_list, seq_lengths)]
        order = scores.argsort(descending=True)
        #output_texts = [[output_texts[i], scores[i].item()] for i in order]
        output_texts = [output_texts[i] for i in order]
        return output_texts
        # return output_texts, token_list, text_list
|
| 304 |
+
|
| 305 |
+
|