guilinhu committed on
Commit
df9f13e
·
verified ·
1 Parent(s): 17dbfd4

Upload folder using huggingface_hub

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ wandb/
2
+ __pycache__/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Guilin Hu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,54 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Proactive Hearing Assistants that Isolate Egocentric Conversations
2
+
3
+ ## More Information
4
+
5
+ For more information, please refer to our website: [https://proactivehearing.cs.washington.edu/](https://proactivehearing.cs.washington.edu/).
6
+
7
+ ## Abstract
8
+
9
+ We introduce proactive hearing assistants that automatically identify and separate the wearer’s conversation partners, without requiring explicit prompts. Our system operates on egocentric binaural audio and uses the wearer’s self-speech as an anchor, leveraging turn-taking behavior and dialogue dynamics to infer conversational partners and suppress others. To enable real-time, on-device operation, we propose a dual-model architecture: a lightweight streaming model runs every 12.5 ms for low-latency extraction of the conversation partners, while a slower model runs less frequently to capture longer-range conversational dynamics. Results on real-world 2- and 3-speaker conversation test sets, collected with binaural egocentric hardware from 11 participants totaling 6.8 hours, show generalization in identifying and isolating conversational partners in multi-conversation settings. Our work marks a step toward hearing assistants that adapt proactively to conversational dynamics and engagement.
10
+
11
+
12
+ ## Training and Evaluation
13
+
14
+ ### 1. Installing Requirements
15
+
16
+ Before training or evaluating the model, please create an environment and install all dependencies:
17
+
18
+ ```
19
+ pip install -r requirements.txt
20
+ ```
21
+
22
+ ### 2. Model Training
23
+
24
+ To train the model, run:
25
+
26
+ ```
27
+ python src/train_joint.py --config <path_to_config> --run_dir <path_to_model_checkpoint>
28
+ ```
29
+
30
+ To resume training, make sure that `<path_to_model_checkpoint>` points to the same directory used previously, and rerun the command above.
31
+
32
+
33
+ ### 3. Model Evaluation
34
+
35
+ To evaluate the model, run:
36
+
37
+ ```
38
+ python eval.py <path to testing dataset> <path to model checkpoint> --use_cuda --save
39
+ ```
40
+
41
+
42
+ ## Citation
43
+
44
+ If you use our work, please cite:
45
+
46
+ ```
47
+ @inproceedings{hu2025proactive,
48
+ title={Proactive Hearing Assistants that Isolate Egocentric Conversations},
49
+ author={Hu, Guilin and Itani, Malek and Chen, Tuochao and Gollakota, Shyamnath},
50
+ booktitle={Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
51
+ pages={25377--25394},
52
+ year={2025}
53
+ }
54
+ ```
config/model_config.json ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "project_name": "magic_hear",
3
+ "pl_module": "src.hl_module.joint_train_hl_module_new.PLModule",
4
+ "pl_module_args": {
5
+ "freeze_model1": false,
6
+ "metrics": [
7
+ "snr_i",
8
+ "si_snr_i",
9
+ "si_sdr_i"
10
+ ],
11
+ "model": "src.models.network.net_conversation_joint.Net_Conversation",
12
+ "model_params": {
13
+ "model1_block_name": "src.models.blocks.model1_block.GridNetBlock",
14
+ "num_layers_model1": 6,
15
+ "latent_dim_model1": 32,
16
+ "use_speaker_emb_model1": false,
17
+ "use_self_speech_model2": false,
18
+ "one_emb_model1": true,
19
+ "model1_block_params": {
20
+ "emb_ks": 2,
21
+ "emb_hs": 2,
22
+ "hidden_channels": 64,
23
+ "n_head": 4
24
+ },
25
+ "model2_block_name": "src.models.blocks.model2_block.GridNetBlock",
26
+ "num_layers_model2": 6,
27
+ "latent_dim_model2": 32,
28
+ "lstm_fold_chunk": 80,
29
+ "model2_block_params": {
30
+ "emb_ks": 1,
31
+ "emb_hs": 1,
32
+ "hidden_channels": 64,
33
+ "n_head": 4,
34
+ "use_attention": false
35
+ },
36
+ "stft_chunk_size": 200,
37
+ "stft_pad_size": 32,
38
+ "stft_back_pad": 32,
39
+ "num_input_channels": 1,
40
+ "num_output_channels": 1,
41
+ "num_sources": 1,
42
+ "use_sp_feats": false,
43
+ "use_first_ln": true,
44
+ "n_imics": 1,
45
+ "window": "rect",
46
+ "E": 2
47
+ },
48
+ "loss": "src.losses.SNRLP.SNRLPLoss",
49
+ "loss_params": {
50
+ "snr_loss_name": "snr",
51
+ "neg_weight": 100
52
+ },
53
+ "optimizer": "torch.optim.AdamW",
54
+ "optimizer_params": {
55
+ "lr": 2e-3
56
+ },
57
+ "scheduler": "torch.optim.lr_scheduler.ReduceLROnPlateau",
58
+ "scheduler_params": {
59
+ "mode": "min",
60
+ "patience": 4,
61
+ "factor": 0.5,
62
+ "min_lr": 1e-6
63
+ },
64
+ "sr": 16000,
65
+ "grad_clip": 1,
66
+ "use_dp": true
67
+ },
68
+ "train_dataset": "src.datasets.joint_training_dataset.Dataset",
69
+ "train_data_args": {
70
+ "input_dir": [],
71
+ "output_conversation": 1,
72
+ "batch_size": 4,
73
+ "clean_embed": true,
74
+ "random_audio_length": 160000,
75
+ "required_first_speaker_as_self_speech": true,
76
+ "spk_emb_exist": false
77
+ },
78
+ "val_dataset": "src.datasets.joint_training_dataset.Dataset",
79
+ "val_data_args": {
80
+ "input_dir": [],
81
+ "output_conversation": 1,
82
+ "batch_size": 4,
83
+ "clean_embed": true,
84
+ "random_audio_length": 160000,
85
+ "required_first_speaker_as_self_speech": true,
86
+ "spk_emb_exist": false
87
+ },
88
+ "epochs": 130,
89
+ "batch_size": 4,
90
+ "eval_batch_size": 4,
91
+ "num_workers": 12
92
+ }
eval.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.metrics.metrics import Metrics
2
+ import src.utils as utils
3
+ import argparse
4
+ import os, json, glob
5
+ import numpy as np
6
+ import torch
7
+ import pandas as pd
8
+ import torchaudio
9
+ import matplotlib.pyplot as plt
10
+ import torch.nn as nn
11
+ import copy
12
+ import torch.nn.functional as F
13
+ from torchmetrics.functional import signal_noise_ratio as snr
14
+
15
+
def mod_pad(x, chunk_size, pad):
    """Right-pad x to a multiple of chunk_size, then apply an extra pad.

    Args:
        x: tensor padded along its last dimension.
        chunk_size: the last dimension is extended to a multiple of this.
        pad: additional (left, right) padding applied after rounding up.

    Returns:
        (padded tensor, number of samples added by the rounding step)
    """
    remainder = x.shape[-1] % chunk_size
    extra = chunk_size - remainder if remainder else 0
    padded = F.pad(F.pad(x, (0, extra)), pad)
    return padded, extra
25
+
26
+
class LayerNormPermuted(nn.LayerNorm):
    """LayerNorm applied over the channel dimension of [B, C, T, F] inputs.

    Standard LayerNorm normalizes over trailing dimensions, so the input is
    permuted to put channels last, normalized, then permuted back.
    """

    def __init__(self, *args, **kwargs):
        super(LayerNormPermuted, self).__init__(*args, **kwargs)

    def forward(self, x):
        """Normalize x of shape [B, C, T, F] over C; shape is preserved."""
        channels_last = x.permute(0, 2, 3, 1)  # [B, T, F, C]
        normed = super().forward(channels_last)
        return normed.permute(0, 3, 1, 2)  # back to [B, C, T, F]
40
+
41
+
def save_audio_file_torch(file_path, wavform, sample_rate=16000, rescale=False):
    """Write a waveform tensor to disk, optionally peak-scaled to 0.9 first."""
    out = wavform / torch.max(wavform) * 0.9 if rescale else wavform
    torchaudio.save(file_path, out, sample_rate)
46
+
47
+
def get_mixture_and_gt(curr_dir, rng, SHIFT_VALUE=0, noise_audio_list=None):
    """Assemble one evaluation example from a test-case directory.

    Loads self speech, the conversation partners' speech and interference audio,
    mixes them at a random SNR in [-10, 10] dB, optionally adds ambient noise,
    and peak-normalizes when the mixture clips.

    Args:
        curr_dir: directory holding metadata.json and the .wav stems.
        rng: np.random.RandomState used for SNR / noise draws.
        SHIFT_VALUE: unused; kept for call-site compatibility.
        noise_audio_list: optional list of noise .wav paths; None means no noise.

    Returns:
        (inputs dict, targets dict, metadata dict).

    Raises:
        FileNotFoundError: if no self-speech recording exists in curr_dir.
    """
    # Avoid the shared-mutable-default-argument pitfall.
    if noise_audio_list is None:
        noise_audio_list = []

    metadata2 = utils.read_json(os.path.join(curr_dir, "metadata.json"))
    diags = metadata2["target_dialogue"]

    if os.path.exists(os.path.join(curr_dir, "self_speech.wav")):
        self_speech = utils.read_audio_file_torch(os.path.join(curr_dir, "self_speech.wav"), 1)
    elif os.path.exists(os.path.join(curr_dir, "self_speech_original.wav")):
        self_speech = utils.read_audio_file_torch(os.path.join(curr_dir, "self_speech_original.wav"), 1)
    else:
        # Previously fell through and raised UnboundLocalError below; fail clearly.
        raise FileNotFoundError(f"No self-speech recording found in {curr_dir}")

    other_speech = torch.zeros_like(self_speech)

    # Sum the speech of every conversation partner (all dialogue members but the wearer).
    for i in range(len(diags) - 1):
        wav = utils.read_audio_file_torch(os.path.join(curr_dir, f"target_speech{i}.wav"), 1)
        other_speech += wav

    # NOTE: "intereference" spelling matches the on-disk dataset filenames — do not "fix" it.
    if os.path.exists(os.path.join(curr_dir, "intereference.wav")):
        interfere = utils.read_audio_file_torch(os.path.join(curr_dir, "intereference.wav"), 1)
    else:
        interfere = torch.zeros_like(self_speech)
        interfere += utils.read_audio_file_torch(os.path.join(curr_dir, "intereference0.wav"), 1)
        interfere += utils.read_audio_file_torch(os.path.join(curr_dir, "intereference1.wav"), 1)

    # Ground truth is the wearer's conversation; interference is scaled to a random SNR.
    gt = self_speech + other_speech
    tgt_snr = rng.uniform(-10, 10)
    interfere = scale_noise_to_snr(gt, interfere, tgt_snr)

    mixture = gt + interfere

    if noise_audio_list:
        print("added noise")
        noise_audio = noise_sample(noise_audio_list, mixture.shape[-1], rng)
        wham_scale = rng.uniform(0, 1)
        mixture += noise_audio * wham_scale

    embed_path = os.path.join(curr_dir, "embed.pt")
    if os.path.exists(embed_path):
        embed = torch.load(embed_path, weights_only=False)
        embed = torch.from_numpy(embed)
    else:
        embed = torch.zeros(256)

    L = mixture.shape[-1]

    # Peak-normalize only when clipping would occur; note other_speech and
    # interfere are deliberately left unscaled (only mixture/self/gt are returned scaled).
    peak = np.abs(mixture).max()
    if peak > 1:
        mixture /= peak
        self_speech /= peak
        gt /= peak

    inputs = {
        "mixture": mixture.float(),
        "embed": embed.float(),
        "self_speech": self_speech[0:1, :].float(),
    }

    targets = {
        "self": self_speech[0:1, :].numpy(),
        "other": other_speech[0:1, :].numpy(),
        "target": gt[0:1, :].float(),
    }

    return inputs, targets, metadata2
110
+
111
+
def scale_utterance(audio, timestamp, rng, db_change=7):
    """Randomly rescale ~30% of the (start, end) segments by up to ±db_change dB, in place."""
    for seg_start, seg_end in timestamp:
        # Draw order matters for rng reproducibility: one coin flip per segment,
        # then a gain draw only for the selected ones.
        if rng.uniform(0, 1) >= 0.3:
            continue
        gain_db = rng.uniform(-db_change, db_change)
        audio[..., seg_start:seg_end] *= 10 ** (gain_db / 20)

    return audio
120
+
121
+
def get_snr(target, mixture, EPS=1e-9):
    """
    Computes the average SNR across all channels.

    NOTE(review): torchmetrics' signal_noise_ratio signature is (preds, target);
    here `mixture` is passed as the prediction. `EPS` is accepted but unused —
    presumably kept for signature compatibility; confirm before removing.
    """
    return snr(mixture, target).mean()
127
+
128
+
def scale_noise_to_snr(target_speech: torch.Tensor, noise: torch.Tensor, target_snr: float):
    """Rescale `noise` so the resulting mixture reaches `target_snr` dB on average."""
    initial_snr = get_snr(target_speech, noise + target_speech)

    # Scaling the noise by k shifts the SNR by -20*log10(k); solve for k.
    gain = 10 ** ((initial_snr - target_snr) / 20)

    return gain * noise
136
+
137
+
def run_testcase(model, inputs, device) -> np.ndarray:
    """Run the model on one prepared test case and return the separated audio as numpy.

    NOTE(review): mutates `inputs` in place — tensors are trimmed to the first
    channel, given a batch dimension, and moved to `device`. Callers reading
    `inputs` afterwards see the modified tensors.
    """
    with torch.inference_mode():
        inputs["mixture"] = inputs["mixture"][0:1, ...].unsqueeze(0).to(device)
        inputs["embed"] = inputs["embed"].unsqueeze(0).to(device)
        inputs["self_speech"] = inputs["self_speech"][0:1, ...].unsqueeze(0).to(device)

        # Process the entire clip in one pass: the index window spans the whole mixture.
        inputs["start_idx"] = 0
        inputs["end_idx"] = inputs["mixture"].shape[-1]
        outputs = model(inputs)

        # Drop the batch dimension; remaining layout is model-defined
        # (presumably [channels, samples]) — confirm against the network.
        output_target = outputs["output"].squeeze(0)

        final_output = output_target.cpu().numpy()

    return final_output
153
+
154
+
def get_timestamp_mask(timestamps, mask_shape):
    """Return a zeros tensor of mask_shape with ones inside every (start, end) span."""
    mask = torch.zeros(mask_shape)
    for span in timestamps:
        mask[..., span[0]:span[1]] = 1.0
    return mask
161
+
162
+
def noise_sample(noise_file_list, audio_length, rng: np.random.RandomState):
    """Build an `audio_length`-sample noise clip by concatenating randomly chosen files.

    Files are drawn (with replacement) from `noise_file_list`, resampled to 16 kHz
    when needed, concatenated until the target length is exceeded, then truncated.
    Only the first channel of each noise file is used.
    """
    # NOTE: hardcoded. assume noise is 48k and target is 16k
    target_sr = 16000

    acc_len = 0
    concatenated_audio = None
    # Keep appending clips until we have strictly more than audio_length samples.
    while acc_len <= audio_length:
        noise_file = rng.choice(noise_file_list)
        info = torchaudio.info(noise_file)
        noise_sr = info.sample_rate

        noise_wav, _ = torchaudio.load(noise_file)
        noise_wav = noise_wav[0:1, ...]  # first channel only

        if noise_sr != target_sr:
            resampler = torchaudio.transforms.Resample(orig_freq=noise_sr, new_freq=target_sr)
            noise_wav = resampler(noise_wav)

        if concatenated_audio is None:
            concatenated_audio = noise_wav
        else:
            concatenated_audio = torch.cat((concatenated_audio, noise_wav), dim=1)

        acc_len = concatenated_audio.shape[-1]

    # Truncate the overshoot to exactly the requested length.
    concatenated_audio = concatenated_audio[..., :audio_length]

    assert concatenated_audio.shape[1] == audio_length

    return concatenated_audio
193
+
194
+
def main(args: argparse.Namespace):
    """Evaluate a pretrained checkpoint over up to 200 test cases and write SI-SDR metrics to CSV.

    For each case: builds the mixture, runs the model, and records SI-SDR for the
    whole clip plus masked "self" and "other speaker" regions. Optionally saves
    the audio when --save is given.
    """
    device = "cuda" if args.use_cuda else "cpu"

    # Load model
    model = utils.load_torch_pretrained(args.run_dir).model
    model_name = args.run_dir.split("/")[-1]
    model = model.to(device)
    model.eval()

    # Initialize metrics
    # NOTE(review): snr and snr_i are created but never used below; only si_sdr is.
    snr = Metrics("snr")
    snr_i = Metrics("snr_i")

    si_sdr = Metrics("si_sdr")

    records = []

    noise_audio_list = []
    if args.noise_dir is not None:
        noise_audio_sublist = glob.glob(os.path.join(args.noise_dir, "*.wav"))
        if not noise_audio_sublist:
            print("no noise file found")
        noise_audio_list.extend(noise_audio_sublist)

    # Test-case directories are zero-padded indices 00000..00199; missing ones are skipped.
    for i in range(0, 200):
        # Seed per test case so mixing SNRs / noise draws are reproducible.
        rng = np.random.RandomState(i)
        dataset_name = os.path.basename(args.test_dir)
        curr_dir = os.path.join(args.test_dir, "{:05d}".format(i))

        meta_dir = os.path.join(curr_dir, "metadata.json")

        if not os.path.exists(meta_dir):
            continue

        inputs, targets, metadata = get_mixture_and_gt(curr_dir, rng, noise_audio_list=noise_audio_list)

        if inputs is None:
            continue

        self_timestamps = metadata["target_dialogue"][0]["timestamp"]

        target_speech = targets["target"].cpu().numpy()
        row = {"test_case_index": i}
        mixture = inputs["mixture"].cpu().numpy()

        self_speech = inputs["self_speech"].squeeze(0).cpu().numpy()

        # Keep only the first channel for the model input and the reference.
        inputs["mixture"] = inputs["mixture"][0:1, ...]
        target_speech = target_speech[0:1, ...]

        # run_testcase mutates `inputs` (batched, moved to device) and returns numpy audio.
        output_target = run_testcase(model, inputs, device)

        self_timestamps = metadata["target_dialogue"][0]["timestamp"]
        self_mask = get_timestamp_mask(self_timestamps, target_speech.shape)
        # Zero out the first second (args.sr samples) — presumably a model warm-up
        # window excluded from scoring; confirm with the paper/protocol.
        self_mask[..., : args.sr] = 0

        if mixture.ndim == 1:
            mixture = mixture[np.newaxis, ...]

        # Whole-clip SI-SDR before (mixture as estimate) and after separation.
        total_input_sisdr = si_sdr(est=mixture[0:1], gt=target_speech, mix=mixture[0:1]).item()
        total_output_sisdr = si_sdr(est=output_target, gt=target_speech, mix=mixture[0:1]).item()

        row[f"sisdr_input_total"] = total_input_sisdr
        row[f"sisdr_output_total"] = total_output_sisdr

        # self

        self_sisdr_mix = si_sdr(
            est=self_mask * mixture[:1], gt=self_mask * target_speech, mix=self_mask * mixture[:1]
        ).item()
        self_sisdr_pred = si_sdr(
            est=self_mask * output_target, gt=self_mask * target_speech, mix=self_mask * mixture[:1]
        ).item()

        row[f"sisdr_mix_self"] = self_sisdr_mix
        row[f"sisdr_pred_self"] = self_sisdr_pred

        # ======other speaker======

        # Union of all non-wearer speakers' timestamps (dialogue entries 1..n-1).
        other_timestamps = metadata["target_dialogue"][1]["timestamp"]
        if len(metadata["target_dialogue"]) > 2:
            for j in range(2, len(metadata["target_dialogue"])):
                timestamp = metadata["target_dialogue"][j]["timestamp"]
                other_timestamps = other_timestamps + timestamp

        other_mask = get_timestamp_mask(other_timestamps, target_speech.shape)
        other_mask[..., : args.sr] = 0

        other_sisdr_mix = si_sdr(
            est=other_mask * mixture[:1], gt=other_mask * target_speech, mix=other_mask * mixture[:1]
        ).item()
        other_sisdr_pred = si_sdr(
            est=other_mask * output_target, gt=other_mask * target_speech, mix=other_mask * mixture[:1]
        ).item()

        row[f"sisdr_mix_other"] = other_sisdr_mix
        row[f"sisdr_pred_other"] = other_sisdr_pred

        print(i)
        records.append(row)

        # Output path encodes whether ambient noise was mixed in.
        if noise_audio_list != []:
            save_folder = f"./result_{dataset_name}_noise/{model_name}/{i}"
        else:
            save_folder = f"./result_{dataset_name}/{model_name}/{i}"
        os.makedirs(save_folder, exist_ok=True)

        if type(self_speech) == np.ndarray:
            self_speech = torch.from_numpy(self_speech)

        if self_speech.dim() == 1:
            self_speech = self_speech.unsqueeze(0)

        if args.save:
            save_audio_file_torch(
                f"{save_folder}/mix.wav", torch.from_numpy(mixture[0:1]), sample_rate=args.sr, rescale=False
            )
            save_audio_file_torch(f"{save_folder}/self.wav", self_speech, sample_rate=args.sr, rescale=False)
            save_audio_file_torch(
                f"{save_folder}/output_target.wav", torch.from_numpy(output_target), sample_rate=args.sr, rescale=False
            )
            save_audio_file_torch(
                f"{save_folder}/target_speech.wav", torch.from_numpy(target_speech), sample_rate=args.sr, rescale=False
            )

    results_df = pd.DataFrame.from_records(records)

    # Put the test-case index first for readability.
    columns = ["test_case_index"] + [col for col in results_df.columns if col != "test_case_index"]
    results_df = results_df[columns]

    if noise_audio_list != []:
        results_csv_path = f"./result_{dataset_name}_noise/{model_name}_multi.csv"
    else:
        results_csv_path = f"./result_{dataset_name}/{model_name}_multi.csv"
    results_df.to_csv(results_csv_path, index=False)
330
+
331
+
if __name__ == "__main__":
    # Command-line entry point: positional dataset/checkpoint paths plus eval options.
    parser = argparse.ArgumentParser()
    parser.add_argument("test_dir", type=str, help="Path to test dataset")
    parser.add_argument("run_dir", type=str, help="Path to model run checkpoint")
    parser.add_argument("--sr", type=int, default=16000, help="Project sampling rate")
    parser.add_argument("--noise_dir", type=str, default=None, help="Wham noise directory")
    parser.add_argument("--use_cuda", action="store_true", help="Whether to use cuda")
    parser.add_argument("--save", action="store_true", help="Whether to save output audio")
    main(parser.parse_args())
requirements.txt ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.3.1
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.11.16
4
+ aiosignal==1.3.2
5
+ annotated-types==0.7.0
6
+ antlr4-python3-runtime==4.9.3
7
+ asteroid==0.7.0
8
+ asteroid-filterbanks==0.4.0
9
+ asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1733250440834/work
10
+ async-timeout==5.0.1
11
+ attrs==25.3.0
12
+ audioread==3.0.1
13
+ auraloss==0.4.0
14
+ beautifulsoup4==4.13.4
15
+ cached-property==2.0.1
16
+ certifi==2025.1.31
17
+ cffi==1.17.1
18
+ cftime==1.6.4.post1
19
+ charset-normalizer==3.4.1
20
+ ci_sdr==0.0.2
21
+ click==8.1.8
22
+ coloredlogs==15.0.1
23
+ comm @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_comm_1753453984/work
24
+ ConfigArgParse==1.7
25
+ contourpy==1.3.0
26
+ ctc_segmentation==1.7.4
27
+ cycler==0.12.1
28
+ Cython==3.0.12
29
+ DateTime==5.5
30
+ debugpy @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_debugpy_1752827114/work
31
+ decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1740384970518/work
32
+ Distance==0.1.3
33
+ docker-pycreds==0.4.0
34
+ editdistance==0.8.1
35
+ einops==0.8.1
36
+ espnet==202412
37
+ espnet-tts-frontend==0.0.3
38
+ eval_type_backport==0.2.2
39
+ exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1746947292760/work
40
+ executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1745502089858/work
41
+ fast_bss_eval==0.1.3
42
+ filelock==3.18.0
43
+ flatbuffers==25.2.10
44
+ fonttools==4.57.0
45
+ frozenlist==1.5.0
46
+ fsspec==2025.3.2
47
+ g2p-en==2.1.0
48
+ gdown==5.2.0
49
+ gitdb==4.0.12
50
+ GitPython==3.1.44
51
+ grpcio==1.74.0
52
+ h5py==3.13.0
53
+ huggingface-hub==0.30.2
54
+ humanfriendly==10.0
55
+ hydra-core==1.3.2
56
+ HyperPyYAML==1.2.2
57
+ idna==3.10
58
+ importlib-metadata==4.13.0
59
+ importlib_resources==6.5.2
60
+ inflect==7.5.0
61
+ intervaltree==3.1.0
62
+ ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1753749834440/work
63
+ ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1701831663892/work
64
+ jaconv==0.4.0
65
+ jamo==0.4.1
66
+ jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1733300866624/work
67
+ Jinja2==3.1.6
68
+ jiwer==4.0.0
69
+ joblib==1.4.2
70
+ julius==0.2.7
71
+ jupyter_client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1733440914442/work
72
+ jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1748333051527/work
73
+ kaldiio==2.18.1
74
+ kiwisolver==1.4.7
75
+ lazy_loader==0.4
76
+ librosa==0.9.2
77
+ lightning-utilities==0.14.3
78
+ llvmlite==0.43.0
79
+ Markdown==3.9
80
+ MarkupSafe==3.0.2
81
+ matplotlib==3.9.4
82
+ matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1733416936468/work
83
+ mir_eval==0.8.2
84
+ more-itertools==10.6.0
85
+ mpmath==1.3.0
86
+ msgpack==1.1.0
87
+ multidict==6.4.3
88
+ nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1733325553580/work
89
+ netCDF4==1.7.2
90
+ networkx==3.2.1
91
+ nltk==3.9.1
92
+ noisereduce==3.0.3
93
+ numba==0.60.0
94
+ numpy==1.23.5
95
+ nvidia-cublas-cu12==12.4.5.8
96
+ nvidia-cuda-cupti-cu12==12.4.127
97
+ nvidia-cuda-nvrtc-cu12==12.4.127
98
+ nvidia-cuda-runtime-cu12==12.4.127
99
+ nvidia-cudnn-cu12==9.1.0.70
100
+ nvidia-cufft-cu12==11.2.1.3
101
+ nvidia-curand-cu12==10.3.5.147
102
+ nvidia-cusolver-cu12==11.6.1.9
103
+ nvidia-cusparse-cu12==12.3.1.170
104
+ nvidia-cusparselt-cu12==0.6.2
105
+ nvidia-nccl-cu12==2.21.5
106
+ nvidia-nvjitlink-cu12==12.4.127
107
+ nvidia-nvtx-cu12==12.4.127
108
+ omegaconf==2.3.0
109
+ onnxruntime==1.19.2
110
+ openai-whisper==20250625
111
+ opt_einsum==3.4.0
112
+ packaging==24.2
113
+ pandas==2.2.3
114
+ parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1733271261340/work
115
+ pb-bss-eval==0.0.2
116
+ pesq==0.0.4
117
+ pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1733301927746/work
118
+ pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1733327343728/work
119
+ pillow==11.2.1
120
+ platformdirs==4.3.7
121
+ pooch==1.8.2
122
+ prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1744724089886/work
123
+ propcache==0.3.1
124
+ protobuf==5.29.4
125
+ psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1740663125313/work
126
+ ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1733302279685/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl#sha256=92c32ff62b5fd8cf325bec5ab90d7be3d2a8ca8c8a3813ff487a8d2002630d1f
127
+ pure_eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1733569405015/work
128
+ pybind11==2.13.6
129
+ pycparser==2.22
130
+ pydantic==2.11.3
131
+ pydantic_core==2.33.1
132
+ pydub==0.25.1
133
+ Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1750615794071/work
134
+ pyparsing==3.2.3
135
+ pypinyin==0.44.0
136
+ pyroomacoustics==0.8.3
137
+ PySocks==1.7.1
138
+ pystoi==0.4.1
139
+ python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_python-dateutil_1751104122/work
140
+ python-sofa==0.2.0
141
+ pytorch-lightning==2.5.1
142
+ pytorch-ranger==0.1.1
143
+ pytz==2025.2
144
+ pyworld==0.3.5
145
+ PyYAML==6.0.2
146
+ pyzmq @ file:///home/conda/feedstock_root/build_artifacts/pyzmq_1749898437650/work
147
+ RapidFuzz==3.13.0
148
+ regex==2024.11.6
149
+ requests==2.32.3
150
+ resampy==0.4.3
151
+ Resemblyzer==0.1.4
152
+ ruamel.yaml==0.18.15
153
+ ruamel.yaml.clib==0.2.12
154
+ safetensors==0.5.3
155
+ scikit-learn==1.6.1
156
+ scipy==1.13.1
157
+ sentencepiece==0.1.97
158
+ sentry-sdk==2.26.0
159
+ setproctitle==1.3.5
160
+ silero-vad==5.1.2
161
+ six @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_six_1753199211/work
162
+ smmap==5.0.2
163
+ sortedcontainers==2.4.0
164
+ soundfile==0.13.1
165
+ soupsieve==2.7
166
+ sox==1.5.0
167
+ soxbindings==1.2.3
168
+ soxr==0.5.0.post1
169
+ stack_data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1733569443808/work
170
+ sympy==1.13.1
171
+ tensorboard==2.20.0
172
+ tensorboard-data-server==0.7.2
173
+ tensorboardX==2.6.4
174
+ threadpoolctl==3.6.0
175
+ tiktoken==0.9.0
176
+ tokenizers==0.21.1
177
+ torch==2.6.0
178
+ torch-complex==0.4.4
179
+ torch-optimizer==0.1.0
180
+ torch-stoi==0.2.3
181
+ torchaudio==2.6.0
182
+ torchmetrics==0.11.4
183
+ torchvision==0.21.0
184
+ tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1748003328568/work
185
+ tqdm==4.67.1
186
+ traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1733367359838/work
187
+ transformers==4.51.3
188
+ triton==3.2.0
189
+ typeguard==4.4.2
190
+ typing==3.7.4.3
191
+ typing-inspection==0.4.0
192
+ typing_extensions==4.13.2
193
+ tzdata==2025.2
194
+ Unidecode==1.3.8
195
+ urllib3==2.4.0
196
+ wandb==0.19.9
197
+ wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1733231326287/work
198
+ webrtcvad==2.0.10
199
+ Werkzeug==3.1.3
200
+ yarl==1.19.0
201
+ zipp==3.21.0
202
+ zope.interface==7.2
src/datasets/joint_training_dataset.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Torch dataset object for synthetically rendered
3
+ spatial data
4
+ """
5
+ import random
6
+
7
+ from typing import Tuple
8
+ from pathlib import Path
9
+
10
+ import torch
11
+ import numpy as np
12
+ import os, glob
13
+
14
+ import src.utils as utils
15
+ from .noise import WhitePinkBrownAugmentation
16
+ import torchaudio
17
+ from torchmetrics.functional import signal_noise_ratio as snr
18
+ from torch.utils.data._utils.collate import default_collate
19
+
20
+ MAX_LEN = 50
21
+
22
+ def save_audio_file_torch(file_path, wavform, sample_rate = 16000, rescale = False):
23
+ if rescale:
24
+ wavform = wavform/torch.max(wavform)*0.9
25
+ torchaudio.save(file_path, wavform, sample_rate)
26
+
27
+ def perturb_amplitude_db(audio, db_change=10):
28
+ random_db = np.random.uniform(-db_change, db_change)
29
+ amplitude_factor = 10 ** (random_db / 20)
30
+ audio = audio * amplitude_factor
31
+ return audio
32
+
33
+
def scale_to_tgt_pwr(audio: torch.Tensor, timestamp, tgt_pwr_dB: float, EPS=1e-9):
    """Scale `audio` so the mean power over the `timestamp` segments equals `tgt_pwr_dB`.

    Args:
        audio: waveform tensor [..., T]. (The original annotation said
            np.ndarray, but the body uses torch ops like .size(-1), so a
            torch.Tensor is required — annotation fixed accordingly.)
        timestamp: iterable of (start, end) sample indices; out-of-range
            endpoints are clamped to the valid range.
        tgt_pwr_dB: desired average power of the selected segments, in dB.
        EPS: numerical floor inside log10 to avoid -inf on silent segments.

    Returns:
        The scaled audio (a new tensor; the input is not modified in place).
    """
    segments = []
    for start_time, end_time in timestamp:
        start_time = max(0, start_time)
        end_time = min(audio.size(-1), end_time)
        segments.append(audio[..., start_time:end_time])

    # Power statistics are computed over the concatenation of all segments.
    concatenated = torch.cat(segments, dim=-1)

    avg_pwr = torch.mean(concatenated**2)
    avg_pwr_dB = 10 * torch.log10(avg_pwr + EPS)
    scale = 10 ** ((tgt_pwr_dB - avg_pwr_dB) / 20)

    audio_scaled = scale * audio
    concatenated_scaled = scale * concatenated

    # Sanity check: achieved power must land within 0.1 dB of the target.
    scaled_pwr_dB = 10 * torch.log10(torch.mean(concatenated_scaled**2) + EPS)
    assert torch.abs(tgt_pwr_dB - scaled_pwr_dB) < 0.1

    return audio_scaled
59
+
60
+
def scale_utterance(audio, timestamp, rng, db_change=7):
    """Randomly rescale ~30% of the (start, end) segments by up to ±db_change dB, in place."""
    for seg_start, seg_end in timestamp:
        # Preserve the rng draw order: one selection draw per segment, then a
        # gain draw only when the segment is selected.
        if rng.uniform(0, 1) >= 0.3:
            continue
        gain_db = rng.uniform(-db_change, db_change)
        audio[..., seg_start:seg_end] *= 10 ** (gain_db / 20)

    return audio
69
+
70
+
def get_snr(target, mixture, EPS=1e-9):
    """
    Computes the average SNR across all channels.

    NOTE(review): torchmetrics' signal_noise_ratio signature is (preds, target);
    here `mixture` is passed as the prediction. `EPS` is accepted but unused —
    presumably kept for signature compatibility; confirm before removing.
    """
    return snr(mixture, target).mean()
76
+
77
+
def scale_noise_to_snr(target_speech: torch.Tensor, noise: torch.Tensor, target_snr: float):
    """Rescale a BINAURAL noise signal so the channel-averaged SNR equals `target_snr`.

    With a noise gain of k, each channel's SNR shifts by -20*log10(k), so the
    average SNR becomes avg_snr_initial - 20*log10(k); solving for k gives the
    gain computed below.
    """
    avg_snr_now = get_snr(target_speech, noise + target_speech)

    exponent = (avg_snr_now - target_snr) / 20
    gain = 10 ** exponent

    return gain * noise
92
+
93
+
def custom_collate_fn(batch):
    """Collate a batch of (inputs_dict, targets_dict) pairs.

    Every input key is stacked with torch's default_collate except
    'self_timestamp', which has a variable length per sample and is therefore
    kept as a plain Python list (one entry per batch element). Targets are
    fixed-length tensors and collated normally.
    """
    input_dicts = [pair[0] for pair in batch]
    target_dicts = [pair[1] for pair in batch]

    collated_inputs = {}
    for key in input_dicts[0]:
        values = [d[key] for d in input_dicts]
        # Variable-length timestamps cannot be stacked into a tensor.
        collated_inputs[key] = values if key == "self_timestamp" else default_collate(values)

    return collated_inputs, default_collate(target_dicts)
119
+
120
+
121
class Dataset(torch.utils.data.Dataset):
    """
    Dataset of mixed waveforms and their corresponding ground truth waveforms
    recorded at different microphone.

    Data format is a pair of Tensors containing mixed waveforms and
    ground truth waveforms respectively. The tensor's dimension is formatted
    as (n_microphone, duration).

    Each scenario is represented by a folder. Multiple datapoints are generated per
    scenario. This can be customized using the points_per_scenario parameter.
    """
    def __init__(self, input_dir, n_mics=1, sr=8000,
                 sig_len = 30, downsample = 1,
                 split = 'val', output_conversation = 0,
                 batch_size = 8,
                 clean_embed=False,
                 noise_dir = None,
                 random_audio_length=800,
                 required_first_speaker_as_self_speech=True,
                 spk_emb_exist=True,
                 amplitude_aug_range=0,
                 noise_amplitude_aug_range=7,
                 utter_db_aug=7,
                 input_mean="L",
                 min_snr=-10,
                 max_snr=10,
                 original_val=False,
                 apply_timestamp_aug=False,
                 snr_control=True
                 ):
        # input_dir: iterable of roots; each digit-prefixed subfolder is one scenario.
        # noise_dir: iterable of folders containing background-noise .wav files.
        # input_mean: "L"/"R" picks one channel of a stereo file; True averages channels.
        # utter_db_aug: per-utterance gain augmentation range in dB (used by scale_utterance).
        # snr_control: when True, rescale interference so mixture SNR is in [min_snr, max_snr].
        super().__init__()

        # Collect scenario folders that have the files required on disk.
        self.dirs = []
        self.spk_emb_exist=spk_emb_exist
        for _dir in input_dir:
            # Scenario folders are named with a leading digit.
            dir_list = sorted(list(Path(_dir).glob('[0-9]*')))
            for dest in dir_list:
                meta_path = os.path.join(dest, 'metadata.json')
                embed_path = os.path.join(dest, 'embed.pt')
                self_speech_path=os.path.join(dest, 'self_speech.wav')  # NOTE(review): computed but never checked

                # A scenario is usable if it has metadata (plus a speaker
                # embedding file when spk_emb_exist is set).
                if self.spk_emb_exist and os.path.exists(meta_path) and os.path.exists(embed_path):
                    self.dirs.append(dest)
                elif not self.spk_emb_exist and os.path.exists(meta_path):
                    self.dirs.append(dest)

        # Flat list of background-noise wav paths gathered from all noise folders.
        self.noise_dirs = []
        if noise_dir is not None:
            for sub_dir in noise_dir:
                noise_audio_list = glob.glob(os.path.join(sub_dir, '*.wav'))
                # NOTE(review): this tests `noise_dir` (always truthy inside this
                # branch); it presumably was meant to test `noise_audio_list`.
                if not noise_dir:
                    print("no noise file found")
                self.noise_dirs.extend(noise_audio_list)


        self.clean_embed = clean_embed
        self.n_mics = n_mics
        # Target signal length in samples after downsampling.
        self.sig_len = int(sig_len*sr/downsample)
        self.sr = sr
        self.downsample = downsample
        self.scales = [-3, 3]
        self.output_conversation = output_conversation
        self.apply_timestamp_aug = apply_timestamp_aug

        # Data augmentation
        ### calculate the stat
        self.batch_size = batch_size
        self.split = split
        # Effective dataset size (truncated to a multiple of batch_size).
        print(self.split, (len(self.dirs)//batch_size)*batch_size)

        # Length (in samples) of the random crop described by start/end_idx_list.
        self.random_audio_length=random_audio_length
        self.required_first_speaker_as_self_speech=required_first_speaker_as_self_speech

        self.amplitude_aug_range=amplitude_aug_range
        self.noise_amplitude_aug_range=noise_amplitude_aug_range

        # Power threshold in dB (not used in the visible code paths).
        self.pwr_thresh = -60
        self.min_snr=min_snr
        self.max_snr=max_snr
        self.utter_db_aug=utter_db_aug
        self.input_mean=input_mean
        self.original_val=original_val
        self.snr_control=snr_control


    def __len__(self) -> int:
        # Truncate to a whole number of batches.
        return (len(self.dirs)//self.batch_size)*self.batch_size


    def noise_sample(self, noise_file_list, audio_length, rng: np.random.RandomState):
        """Concatenate randomly chosen noise files (resampled to 16 kHz) until at
        least audio_length samples are accumulated, then crop to audio_length."""
        # NOTE: hardcoded. assume noise is 48k and target is 16k
        target_sr = 16000

        acc_len=0
        concatenated_audio = None
        while acc_len<=audio_length:
            noise_file=rng.choice(noise_file_list)
            info = torchaudio.info(noise_file)
            noise_sr=info.sample_rate

            noise_wav, _ = torchaudio.load(noise_file)
            # Channel selection mirrors the input_mean convention used for speech.
            if noise_wav.shape[0]>1 and self.input_mean=="L":
                noise_wav=noise_wav[0:1, ...]
            elif noise_wav.shape[0]>1 and self.input_mean=="R":
                noise_wav=noise_wav[1:2, ...]
            elif noise_wav.shape[0]>1 and self.input_mean==True:
                noise_wav=torch.mean(noise_wav, dim=0)
                noise_wav=noise_wav.unsqueeze(0)

            if noise_sr != target_sr:
                resampler = torchaudio.transforms.Resample(orig_freq=noise_sr, new_freq=target_sr)
                noise_wav = resampler(noise_wav)

            if concatenated_audio is None:
                concatenated_audio = noise_wav
            else:
                concatenated_audio = torch.cat((concatenated_audio, noise_wav), dim=1)

            acc_len=concatenated_audio.shape[-1]


        concatenated_audio=concatenated_audio[..., :audio_length]

        assert concatenated_audio.shape[1]==audio_length

        return concatenated_audio


    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Returns:
            mixed_data - M x T
            target_voice_data - M x T
            window_idx_one_hot - 1-D
        """

        # Training samples get a fresh random seed each epoch; validation is
        # seeded by index so it is reproducible across runs.
        if self.split == 'train':
            seed = idx + np.random.randint(1000000)
        else:
            seed = idx
        rng = np.random.RandomState(seed)

        curr_dir = self.dirs[idx%len(self.dirs)]
        return self.get_mixture_and_gt(curr_dir, rng)

    def diffuse_speech_pattern(self, audio: torch.Tensor, timestamps: list, rng: np.random.RandomState, beta=8000):
        """Randomly redistribute the silent gaps between utterances while keeping
        utterance order and content; total audio length is unchanged."""
        # Gap lengths: before the first utterance, between utterances, after the last.
        zero_segments = np.array([timestamps[0][0]] + [timestamps[i+1][0] - timestamps[i][1] for i in range(len(timestamps) - 1)] + [audio.shape[-1] - timestamps[-1][1]])
        total_zeros = sum(zero_segments)

        # Add noise "diffusion" (a single Gaussian scalar offset applied to every gap).
        noise = rng.normal(loc=0, scale=beta)
        zero_segments = zero_segments + noise

        # Ensure all elements are still positive
        zero_segments[zero_segments <= 0] = 1

        # Normalize so that sum is 1
        zero_segments = zero_segments / zero_segments.sum()
        zero_segments = zero_segments * total_zeros

        # Floor indices so that we don't exceed audio size
        zero_segments = np.floor(zero_segments).astype(np.int32)

        assert zero_segments.sum() <= total_zeros

        # Fill in time stamps: lay each utterance down after its (new) leading gap.
        new_audio = torch.zeros_like(audio)
        start_index = 0
        for z, (s, e) in zip(zero_segments[:-1], timestamps):
            start_index += z
            new_audio[..., start_index:start_index+(e-s)] = audio[..., s:e]
            start_index += (e - s)

        return new_audio


    def process_audio(self, audio, timestamp, rng, utter_db_aug, tgt_pwr_dB):
        """Optionally diffuse the speech pattern, then normalize power and apply
        per-utterance gain augmentation (scale_to_tgt_pwr / scale_utterance are
        defined elsewhere in this file)."""
        # NOTE(review): the timestamp augmentation runs before the empty-timestamp
        # check; diffuse_speech_pattern would index timestamps[0] on an empty list.
        if self.apply_timestamp_aug:
            audio = self.diffuse_speech_pattern(audio, timestamp, rng, beta=16000)

        if timestamp==[]:
            return audio
        else:
            audio = scale_to_tgt_pwr(audio, timestamp, tgt_pwr_dB)
            audio=scale_utterance(audio, timestamp, rng, utter_db_aug)
            return audio


    def get_mixture_and_gt(self, curr_dir, rng):
        """Build one training example from a scenario folder: self speech +
        conversation partners form the target; interferers (+ optional recorded
        noise) form the residual; returns (inputs dict, targets dict)."""
        metadata2 = utils.read_json(os.path.join(curr_dir, 'metadata.json'))


        # process self speech
        self_speech = utils.read_audio_file_torch(os.path.join(curr_dir, 'self_speech.wav'), 1, self.input_mean)
        self_speech_original=None
        if os.path.exists(os.path.join(curr_dir, 'self_speech_original.wav')):
            self_speech_original=utils.read_audio_file_torch(os.path.join(curr_dir, 'self_speech_original.wav'), 1, self.input_mean)

        # The wearer (self) is always the first entry of target_dialogue.
        self_timestamp=metadata2['target_dialogue'][0]['timestamp']

        if self_speech_original is not None:
            # Scale both versions jointly so they receive identical per-utterance gains.
            list_of_self=[self_speech, self_speech_original]
            concat_self_speech=torch.cat(list_of_self, dim=0)
            utterance_adj_concat_self=scale_utterance(concat_self_speech, self_timestamp, rng, self.utter_db_aug)
            self_speech=utterance_adj_concat_self[0:1, ...]
            self_speech_original=utterance_adj_concat_self[1:2, ...]
        else:
            self_speech=scale_utterance(self_speech, self_timestamp, rng, self.utter_db_aug)

        # process interference speech
        # (The 'intereference' spelling matches the on-disk file names — do not "fix".)
        if os.path.exists(os.path.join(curr_dir, f'intereference.wav')):
            interfere = utils.read_audio_file_torch(os.path.join(curr_dir, f'intereference.wav'), 1, self.input_mean)
            scale = 0.8  # NOTE(review): `scale` is never used afterwards
        else:
            interfers = metadata2["interference"]
            interfere = torch.zeros_like(self_speech)
            if os.path.exists(os.path.join(curr_dir, f'intereference0.wav')):
                for i in range(0, len(interfers)):
                    current_inter=utils.read_audio_file_torch(os.path.join(curr_dir, f'intereference{i}.wav'), 1, self.input_mean)
                    inter_timestamp=metadata2['interference'][i]['timestamp']

                    current_inter=scale_utterance(current_inter, inter_timestamp, rng, self.utter_db_aug)
                    interfere += current_inter
            elif os.path.exists(os.path.join(curr_dir, f'interference0.wav')):
                # Fallback for folders using the correctly spelled file names.
                for i in range(0, len(interfers)):
                    current_inter= utils.read_audio_file_torch(os.path.join(curr_dir, f'interference{i}.wav'), 1, self.input_mean)
                    inter_timestamp=metadata2['interference'][i]['timestamp']

                    current_inter=scale_utterance(current_inter, inter_timestamp, rng, self.utter_db_aug)
                    interfere += current_inter
            scale = 1

        # process other speech (the wearer's conversation partners)
        other_speech = torch.zeros_like(self_speech)
        if self.output_conversation:
            diags = metadata2["target_dialogue"]
            for i in range(len(diags) - 1):
                if os.path.exists(os.path.join(curr_dir, f'target_speech{i}.wav')):
                    wav = utils.read_audio_file_torch(os.path.join(curr_dir, f'target_speech{i}.wav'), 1, self.input_mean)
                    other_timestamp=metadata2['target_dialogue'][i+1]['timestamp']
                    wav=scale_utterance(wav, other_timestamp, rng, self.utter_db_aug)
                    other_speech += wav

                elif os.path.exists(os.path.join(curr_dir, f'other_speech{i}.wav')):
                    wav = utils.read_audio_file_torch(os.path.join(curr_dir, f'other_speech{i}.wav'), 1, self.input_mean)
                    other_timestamp=metadata2['target_dialogue'][i+1]['timestamp']
                    wav=scale_utterance(wav, other_timestamp, rng, self.utter_db_aug)
                    other_speech += wav
                else:
                    raise Exception("no audio file to load")

        # add noise, e.g. WHAM
        # NOTE(review): random.random() bypasses the per-sample seeded rng, so
        # this branch (and the colored-noise one below) is not reproducible for
        # a fixed validation seed.
        if self.noise_dirs!=[] and random.random() < 0.3:
            audio_length=interfere.shape[1]
            noise=self.noise_sample(self.noise_dirs, audio_length, rng)
            wham_scale = rng.uniform(0, 1)
            interfere += noise*wham_scale


        # Ground truth is self speech (clean/original when available) + partners.
        if self_speech_original is not None:
            gt = self_speech_original + other_speech
        else:
            gt = self_speech + other_speech

        mixture=gt+interfere

        # Rescale the residual (mixture - gt) to hit a random target SNR.
        if self.snr_control==True:
            tgt_snr = rng.uniform(self.min_snr, self.max_snr)
            noise = scale_noise_to_snr(gt, mixture - gt, tgt_snr)

            mixture = noise + gt

        noise_augmentor = WhitePinkBrownAugmentation(
            max_white_level=1e-2, # Adjust as needed
            max_pink_level=5e-2, # Adjust as needed
            max_brown_level=5e-2 # Adjust as needed
        )

        # Colored-noise augmentation on the mixture only (gt is untouched).
        if self.split=="train" and random.random() < 0.3:
            mixture, gt = noise_augmentor(mixture, gt, rng)


        # Speaker embedding of the wearer ("reverb_path" name is historical).
        reverb_path = os.path.join(curr_dir, f'embed.pt')

        if self.spk_emb_exist:
            embed = torch.load(reverb_path, weights_only=False)
            embed = torch.from_numpy(embed)
        else:
            embed=torch.zeros(256)

        self.output_conversation  # NOTE(review): no-op statement

        input_length=self_speech.shape[1]

        # Random crop window forwarded to the model via start/end_idx_list.
        start_idx=rng.randint(input_length-self.random_audio_length)
        end_idx=start_idx+self.random_audio_length

        # ====peak normalization======
        # Applied jointly so relative levels between mixture/gt/self are preserved.
        peak = torch.abs(mixture).max()
        if peak > 1:
            mixture /= peak
            gt /= peak
            self_speech /= peak


        inputs = {
            'mixture': mixture.float(),
            'embed': embed.float(),
            'self_speech': self_speech[0:1, :].float(),
            'start_idx_list': start_idx,
            'end_idx_list': end_idx
        }

        targets = {
            'target': gt[0:1, :].float()
        }

        return inputs, targets
src/datasets/noise.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
+
5
def generate_white_noise(noise_shape, max_level, rng: np.random.RandomState):
    """Draw white (flat-spectrum) Gaussian noise scaled by a random level in [0, max_level)."""
    # Level first, then samples — the rng consumption order matters for reproducibility.
    level = max_level * rng.rand()
    samples = rng.normal(0, 1, size=noise_shape)
    return level * torch.from_numpy(samples).float()
13
+
14
def generate_pink_noise(noise_shape, max_level, rng: np.random.RandomState):
    """
    Draw pink (1/f) noise scaled by a random level in [0, max_level).

    Fix: the previous version called powerlaw_psd_gaussian with a hard-coded
    random_state=0, so the underlying pink-noise waveform was byte-identical on
    every call and ignored the caller's rng. The rng is now routed through, so
    each call produces a fresh, reproducible waveform.
    """
    # Random amplitude in [0, max_level).
    pink_noise_level = max_level * rng.rand()

    # Generate pink noise (exponent beta = 1), driven by the caller's rng.
    pink_noise = powerlaw_psd_gaussian(1, noise_shape, random_state=rng)
    return pink_noise_level * torch.from_numpy(pink_noise).float()
24
+
25
def generate_brown_noise(noise_shape, max_level, rng: np.random.RandomState):
    """
    Draw brown (1/f^2) noise scaled by a random level in [0, max_level).

    Fix: the previous version called powerlaw_psd_gaussian with a hard-coded
    random_state=0, so the underlying brown-noise waveform was byte-identical
    on every call and ignored the caller's rng. The rng is now routed through.
    """
    # Random amplitude in [0, max_level).
    brown_noise_level = max_level * rng.rand()

    # Generate brown noise (exponent beta = 2), driven by the caller's rng.
    brown_noise = powerlaw_psd_gaussian(2, noise_shape, random_state=rng)
    return brown_noise_level * torch.from_numpy(brown_noise).float()
35
+
36
+ """Generate colored noise."""
37
+
38
+ from numpy import sqrt, newaxis, integer
39
+ from numpy.fft import irfft, rfftfreq
40
+ from numpy.random import default_rng, Generator, RandomState
41
+ from numpy import sum as npsum
42
+
43
+
44
def powerlaw_psd_gaussian(exponent, size, fmin=0, random_state=None):
    """Gaussian (1/f)**beta noise.

    Based on the algorithm in:
    Timmer, J. and Koenig, M.:
    On generating power law noise.
    Astron. Astrophys. 300, 707-710 (1995)

    Normalised to unit variance

    Parameters:
    -----------

    exponent : float
        The power-spectrum of the generated noise is proportional to

        S(f) = (1 / f)**beta
        flicker / pink noise:   exponent beta = 1
        brown noise:            exponent beta = 2

        Furthermore, the autocorrelation decays proportional to lag**-gamma
        with gamma = 1 - beta for 0 < beta < 1.
        There may be finite-size issues for beta close to one.

    shape : int or iterable
        The output has the given shape, and the desired power spectrum in
        the last coordinate. That is, the last dimension is taken as time,
        and all other components are independent.

    fmin : float, optional
        Low-frequency cutoff.
        Default: 0 corresponds to original paper.

        The power-spectrum below fmin is flat. fmin is defined relative
        to a unit sampling rate (see numpy's rfftfreq). For convenience,
        the passed value is mapped to max(fmin, 1/samples) internally
        since 1/samples is the lowest possible finite frequency in the
        sample. The largest possible value is fmin = 0.5, the Nyquist
        frequency. The output for this value is white noise.

    random_state : int, numpy.integer, numpy.random.Generator, numpy.random.RandomState,
                   optional
        Optionally sets the state of NumPy's underlying random number generator.
        Integer-compatible values or None are passed to np.random.default_rng.
        np.random.RandomState or np.random.Generator are used directly.
        Default: None.

    Returns
    -------
    out : array
        The samples.


    Examples:
    ---------

    # generate 1/f noise == pink noise == flicker noise
    >>> import colorednoise as cn
    >>> y = cn.powerlaw_psd_gaussian(1, 5)
    """

    # Make sure size is a list so we can iterate it and assign to it.
    try:
        size = list(size)
    except TypeError:
        size = [size]

    # The number of samples in each time series
    samples = size[-1]

    # Calculate Frequencies (we asume a sample rate of one)
    # Use fft functions for real output (-> hermitian spectrum)
    f = rfftfreq(samples)

    # Validate / normalise fmin
    if 0 <= fmin <= 0.5:
        fmin = max(fmin, 1./samples) # Low frequency cutoff
    else:
        raise ValueError("fmin must be chosen between 0 and 0.5.")

    # Build scaling factors for all frequencies
    # NOTE: s_scale aliases f here; the cutoff assignment below mutates f in
    # place, but f is not used again afterwards.
    s_scale = f
    ix = npsum(s_scale < fmin) # Index of the cutoff
    if ix and ix < len(s_scale):
        # Flatten the spectrum below the cutoff frequency.
        s_scale[:ix] = s_scale[ix]
    s_scale = s_scale**(-exponent/2.)

    # Calculate theoretical output standard deviation from scaling
    # (DC component excluded; the Nyquist bin is halved for even lengths).
    w = s_scale[1:].copy()
    w[-1] *= (1 + (samples % 2)) / 2. # correct f = +-0.5
    sigma = 2 * sqrt(npsum(w**2)) / samples

    # Adjust size to generate one Fourier component per frequency
    size[-1] = len(f)

    # Add empty dimension(s) to broadcast s_scale along last
    # dimension of generated random power + phase (below)
    dims_to_add = len(size) - 1
    s_scale = s_scale[(newaxis,) * dims_to_add + (Ellipsis,)]

    # prepare random number generator (helper defined below in this module)
    normal_dist = _get_normal_distribution(random_state)

    # Generate scaled random power + phase
    sr = normal_dist(scale=s_scale, size=size)
    si = normal_dist(scale=s_scale, size=size)

    # If the signal length is even, frequencies +/- 0.5 are equal
    # so the coefficient must be real.
    if not (samples % 2):
        si[..., -1] = 0
        sr[..., -1] *= sqrt(2)    # Fix magnitude

    # Regardless of signal length, the DC component must be real
    si[..., 0] = 0
    sr[..., 0] *= sqrt(2)    # Fix magnitude

    # Combine power + corrected phase to Fourier components
    s = sr + 1J * si

    # Transform to real time series & scale to unit variance
    y = irfft(s, n=samples, axis=-1) / sigma

    return y
168
+
169
+
170
+ def _get_normal_distribution(random_state):
171
+ normal_dist = None
172
+ if isinstance(random_state, (integer, int)) or random_state is None:
173
+ random_state = default_rng(random_state)
174
+ normal_dist = random_state.normal
175
+ elif isinstance(random_state, (Generator, RandomState)):
176
+ normal_dist = random_state.normal
177
+ else:
178
+ raise ValueError(
179
+ "random_state must be one of integer, numpy.random.Generator, or None"
180
+ "numpy.random.Randomstate"
181
+ )
182
+ return normal_dist
183
+
184
+
185
class WhitePinkBrownAugmentation:
    """
    Additive colored-noise augmentation.

    Adds a random mixture of white, pink (1/f) and brown (1/f^2) noise to the
    input audio; the ground-truth audio is passed through unchanged. Each
    color's amplitude is drawn independently in [0, max_*_level) by the
    respective generate_*_noise helper.
    """

    def __init__(self, max_white_level=1e-3, max_pink_level=5e-3, max_brown_level=5e-3):
        # Upper bounds for the randomly drawn per-color noise amplitudes.
        self.max_white_level = max_white_level
        self.max_pink_level = max_pink_level
        self.max_brown_level = max_brown_level

    def __call__(self, audio_data, gt_audio, rng: np.random.RandomState):
        # Draw the three noise colors in a fixed order (white, pink, brown)
        # so rng consumption stays deterministic for a given seed.
        shape = audio_data.shape
        colored = (
            generate_white_noise(shape, self.max_white_level, rng)
            + generate_pink_noise(shape, self.max_pink_level, rng)
            + generate_brown_noise(shape, self.max_brown_level, rng)
        )
        return audio_data + colored, gt_audio
src/hl_module/joint_train_hl_module_new.py ADDED
@@ -0,0 +1,543 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.optim as optim
6
+ import wandb
7
+ import torch
8
+ from numpy import mean
9
+ from src.metrics.metrics import Metrics
10
+ import src.utils as utils
11
+ import numpy as np
12
+
13
+
14
class FakeModel(nn.Module):
    """
    Thin nn.Module wrapper that stores the wrapped model under a ``model``
    attribute, so its state_dict keys carry a ``model.`` prefix. Used when
    loading checkpoints saved with that key layout (see PLModule.__init__).
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
18
+
19
+
20
+ class PLModule(object):
21
    def __init__(
        self,
        model,
        model_params,
        sr,
        optimizer,
        optimizer_params,
        scheduler=None,
        scheduler_params=None,
        loss=None,
        loss_params=None,
        metrics=[],
        slow_model_ckpt=None,
        prev_ckpt=None,
        grad_clip=None,
        use_dp=True,
        val_log_interval=10,  # Unused, only kept for compatibility TODO: Remove
        samples_per_speaker_number=3,
        freeze_model1=False,
    ):
        # model / optimizer / scheduler / loss are dotted import paths resolved
        # via utils.import_attr; the *_params dicts are their constructor kwargs.
        self.model = utils.import_attr(model)(**model_params)

        # Wrap in DataParallel for multi-GPU data parallelism.
        self.use_dp = use_dp
        if use_dp:
            self.model = nn.DataParallel(self.model)

        # Audio sample rate.
        self.sr = sr

        # Log a val sample every this many intervals
        # self.val_log_interval = val_log_interval
        self.samples_per_speaker_number = samples_per_speaker_number

        # Initialize metrics
        self.metrics = [Metrics(metric) for metric in metrics]

        # Metric values, keyed by epoch -> metric name (see log_metric).
        self.metric_values = {}

        # Dataset statistics accumulated via log_statistic.
        self.statistics = {}

        # Assign metric to monitor, and how to judge different models based on it
        # i.e. How do we define the best model (Here, we minimize val loss)
        self.monitor = "val/loss"
        self.monitor_mode = "min"

        # Mode, either train or val
        self.mode = None

        self.val_samples = {}
        self.train_samples = {}

        self.input_snr_calculated = False
        self.input_snr = []
        self.snr_metric = Metrics("snr")

        # Initialize loss function
        self.loss_fn = utils.import_attr(loss)(**loss_params)

        # Initialize weights if a checkpoint is provided.
        # prev_ckpt is for the checkpoint of the complete joint model
        # (fast+slow) you want to train from.
        if prev_ckpt is not None:
            if prev_ckpt.endswith(".ckpt"):
                # Lightning-style checkpoint: keys are prefixed "model.", so
                # load through a FakeModel wrapper to match the key layout.
                print("load prev model", prev_ckpt)
                state = torch.load(prev_ckpt)["state_dict"]
                # print(state.keys())
                print(state["current_epoch"])
                if self.use_dp:
                    _model = self.model.module
                else:
                    _model = self.model

                mdl = FakeModel(_model)
                mdl.load_state_dict(state)
                self.model = nn.DataParallel(mdl.model)
            else:
                # Plain dump_state-style checkpoint: weights under "model".
                print("load prev model", prev_ckpt)

                state = torch.load(prev_ckpt)
                print(state["current_epoch"])
                state = state["model"]
                if self.use_dp:
                    self.model.module.load_state_dict(state)
                else:
                    self.model.load_state_dict(state)

        # slow_model_ckpt holds the slow model's initial weights; only the
        # "tce_model." subtree is extracted and loaded into model1.
        elif slow_model_ckpt is not None:
            print(f"Loading model 1 weights from checkpoint: {slow_model_ckpt}")
            model1_ckpt = torch.load(slow_model_ckpt)
            print("current epoch is {}".format(model1_ckpt["current_epoch"]))

            # Strip the "tce_model." prefix and keep only those entries.
            model1_state_dict = {
                key.replace("tce_model.", ""): value
                for key, value in model1_ckpt["model"].items()
                if key.startswith("tce_model.")
            }

            if self.use_dp:
                self.model.module.model1.load_state_dict(model1_state_dict, strict=False)
            else:
                self.model.model1.load_state_dict(model1_state_dict, strict=False)

        else:
            print("Loading model from scratch, no slow model init ckpt or joint model init ckpt")

        # Whether to freeze the slow model (model1) during training. When
        # frozen, the optimizer only receives trainable parameters.
        self.freeze = freeze_model1
        if freeze_model1:
            self.freeze_model1()
            params_to_optimize = filter(lambda p: p.requires_grad, self.model.parameters())
            # Initialize optimizer
            self.optimizer = utils.import_attr(optimizer)(params_to_optimize, **optimizer_params)
            self.optim_name = optimizer
            self.opt_params = optimizer_params
        else:
            # Initialize optimizer
            self.optimizer = utils.import_attr(optimizer)(self.model.parameters(), **optimizer_params)
            self.optim_name = optimizer
            self.opt_params = optimizer_params

        # Grad clip (norm threshold; None disables clipping — loudly).
        self.grad_clip = grad_clip

        if self.grad_clip is not None:
            print(f"USING GRAD CLIP: {self.grad_clip}")
        else:
            print("ERROR! NOT USING GRAD CLIP" * 100)

        # Initialize scheduler (init_scheduler is defined elsewhere on this class).
        self.scheduler = self.init_scheduler(scheduler, scheduler_params)
        self.scheduler_name = scheduler
        self.scheduler_params = scheduler_params

        self.epoch = 0
157
+
158
+ def freeze_model1(self):
159
+ """Freezes the weights of model1."""
160
+ print("Freezing model1 weights")
161
+ model1 = self.model.module.model1 if self.use_dp else self.model.model1
162
+ for param in model1.parameters():
163
+ param.requires_grad = False
164
+ print("Model1 weights frozen.")
165
+
166
+ def load_state(self, path, map_location=None):
167
+ state = torch.load(path, map_location=map_location)
168
+
169
+ if self.use_dp:
170
+ self.model.module.load_state_dict(state["model"])
171
+ else:
172
+ self.model.load_state_dict(state["model"])
173
+
174
+ # Re-initialize optimizer
175
+ if not self.freeze:
176
+ self.optimizer = utils.import_attr(self.optim_name)(self.model.parameters(), **self.opt_params)
177
+ else:
178
+ params_to_optimize = filter(lambda p: p.requires_grad, self.model.parameters())
179
+ self.optimizer = utils.import_attr(self.optim_name)(params_to_optimize, **self.opt_params)
180
+
181
+ # Re-initialize scheduler (Order might be important?)
182
+ if self.scheduler is not None:
183
+ self.scheduler = self.init_scheduler(self.scheduler_name, self.scheduler_params)
184
+
185
+ self.optimizer.load_state_dict(state["optimizer"])
186
+
187
+ if self.scheduler is not None:
188
+ self.scheduler.load_state_dict(state["scheduler"])
189
+
190
+ self.epoch = state["current_epoch"]
191
+ print("Load model from epoch", self.epoch)
192
+ self.metric_values = state["metric_values"]
193
+
194
+ if "statistics" in self.statistics:
195
+ self.statistics = state["statistics"]
196
+
197
+ def dump_state(self, path):
198
+ if self.use_dp:
199
+ _model = self.model.module
200
+ else:
201
+ _model = self.model
202
+
203
+ state = dict(
204
+ model=_model.state_dict(),
205
+ optimizer=self.optimizer.state_dict(),
206
+ current_epoch=self.epoch,
207
+ metric_values=self.metric_values,
208
+ statistics=self.statistics,
209
+ )
210
+
211
+ if self.scheduler is not None:
212
+ state["scheduler"] = self.scheduler.state_dict()
213
+ print("save to " + path)
214
+ torch.save(state, path)
215
+
216
+ def get_current_lr(self):
217
+ for param_group in self.optimizer.param_groups:
218
+ return param_group["lr"]
219
+
220
+ def on_epoch_start(self):
221
+ print()
222
+ print("=" * 25, "STARTING EPOCH", self.epoch, "=" * 25)
223
+ print()
224
+
225
+ def get_avg_metric_at_epoch(self, metric, epoch=None):
226
+ if epoch is None:
227
+ epoch = self.epoch
228
+
229
+ return self.metric_values[epoch][metric]["epoch"] / self.metric_values[epoch][metric]["num_elements"]
230
+
231
    def on_epoch_end(self, best_path, wandb_run):
        """End-of-epoch bookkeeping: save the checkpoint if it is the best so
        far (per self.monitor/monitor_mode), push metrics and statistics to
        wandb, step the scheduler, and advance the epoch counter."""
        assert self.epoch + 1 == len(
            self.metric_values
        ), "Current epoch must be equal to length of metrics (0-indexed)"

        monitor_metric_last = self.get_avg_metric_at_epoch(self.monitor)

        # Go over all epochs
        save = True
        for epoch in range(len(self.metric_values) - 1):
            monitor_metric_at_epoch = self.get_avg_metric_at_epoch(self.monitor, epoch)

            if self.monitor_mode == "max":
                # If there is any model with monitor larger than current, then
                # this is not the best model
                if monitor_metric_last < monitor_metric_at_epoch:
                    save = False
                    break

            if self.monitor_mode == "min":
                # If there is any model with monitor smaller than current, then
                # this is not the best model
                if monitor_metric_last > monitor_metric_at_epoch:
                    save = False
                    break

        # If this is best, save it
        if save:
            print("Current checkpoint is the best! Saving it...")
            self.dump_state(best_path)

        val_loss = self.get_avg_metric_at_epoch("val/loss")
        val_snr_i = self.get_avg_metric_at_epoch("val/snr_i")
        val_si_snr_i = self.get_avg_metric_at_epoch("val/si_snr_i")

        print(f"Val loss: {val_loss:.02f}")
        print(f"Val SNRi: {val_snr_i:.02f}dB")
        print(f"Val SI-SDRi: {val_si_snr_i:.02f}dB")

        # Log stuff on wandb (commit=False batches everything into one step).
        wandb_run.log({"lr-Adam": self.get_current_lr()}, commit=False, step=self.epoch + 1)

        for metric in self.metric_values[self.epoch]:
            wandb_run.log({metric: self.get_avg_metric_at_epoch(metric)}, commit=False, step=self.epoch + 1)

        # Statistics are logged once each, with their configured reduction.
        for statistic in self.statistics:
            if not self.statistics[statistic]["logged"]:
                data = self.statistics[statistic]["data"]
                reduction = self.statistics[statistic]["reduction"]
                if reduction == "mean":
                    val = mean(data)
                elif reduction == "sum":
                    val = sum(data)
                elif reduction == "histogram":
                    data = [[d] for d in data]
                    table = wandb.Table(data=data, columns=[statistic])
                    val = wandb.plot.histogram(table, statistic, title=statistic)
                else:
                    assert 0, f"Unknown reduction {reduction}."
                wandb_run.log({statistic: val}, commit=False)
                self.statistics[statistic]["logged"] = True

        # Final commit=True flushes the whole step to wandb.
        wandb_run.log({"epoch": self.epoch}, commit=True, step=self.epoch + 1)

        if self.scheduler is not None:
            if type(self.scheduler) == torch.optim.lr_scheduler.ReduceLROnPlateau:
                # Plateau scheduler needs the monitored metric value.
                self.scheduler.step(monitor_metric_last)
            else:
                self.scheduler.step()

        self.epoch += 1
303
+
304
+ def log_statistic(self, name, value, reduction="mean"):
305
+ if name not in self.statistics:
306
+ self.statistics[name] = dict(logged=False, data=[], reduction=reduction)
307
+
308
+ self.statistics[name]["data"].append(value)
309
+
310
+ def log_metric(self, name, value, batch_size=1, on_step=False, on_epoch=True, prog_bar=True, sync_dist=True):
311
+ """
312
+ Logs a metric
313
+ value must be the AVERAGE value across the batch
314
+ Must provide batch size for accurate average computation
315
+ """
316
+
317
+ epoch_str = self.epoch
318
+ if epoch_str not in self.metric_values:
319
+ self.metric_values[epoch_str] = {}
320
+
321
+ if name not in self.metric_values[epoch_str]:
322
+ self.metric_values[epoch_str][name] = dict(step=None, epoch=None)
323
+
324
+ if type(value) == torch.Tensor:
325
+ value = value.item()
326
+
327
+ if on_step:
328
+ if self.metric_values[epoch_str][name]["step"] is None:
329
+ self.metric_values[epoch_str][name]["step"] = []
330
+
331
+ self.metric_values[epoch_str][name]["step"].append(value)
332
+
333
+ if on_epoch:
334
+ if self.metric_values[epoch_str][name]["epoch"] is None:
335
+ self.metric_values[epoch_str][name]["epoch"] = 0
336
+ self.metric_values[epoch_str][name]["num_elements"] = 0
337
+
338
+ self.metric_values[epoch_str][name]["epoch"] += value * batch_size
339
+ self.metric_values[epoch_str][name]["num_elements"] += batch_size
340
+
341
+ def val_naive(self, batch, batch_idx):
342
+ inputs, targets = batch
343
+ a = torch.cuda.memory_allocated(inputs["mixture"].device)
344
+ outputs = self.model(inputs)
345
+ b = torch.cuda.memory_allocated(inputs["mixture"].device)
346
+ print("Infer consume M", (b - a) / 1e6)
347
+
348
+ return outputs
349
+
350
    def train_naive(self, batch, batch_idx):
        """One training step (forward, loss, backward, optimizer step) with GPU
        memory reporting at each stage. reset_grad() and backprop() are defined
        elsewhere on this class."""
        self.reset_grad()
        inputs, targets = batch
        a = torch.cuda.memory_allocated(inputs["mixture"].device)
        # print("a", a/1e9 )
        outputs = self.model(inputs)

        est = outputs["output"]
        gt = targets["target"]

        # Compute loss
        loss = self.loss_fn(est=est, gt=gt).mean()
        b = torch.cuda.memory_allocated(inputs["mixture"].device)

        # NOTE(review): retain_graph=True keeps the autograd graph alive after
        # backward, which inflates memory; confirm it is actually required here.
        loss.backward(retain_graph=True)
        c = torch.cuda.memory_allocated(inputs["mixture"].device)

        self.backprop()
        d = torch.cuda.memory_allocated(inputs["mixture"].device)

        # Memory deltas in GB: forward, forward+backward, optimizer step, baseline.
        print("Training consume G", (b - a) / 1e9, (c - a) / 1e9, (d - c) / 1e9, a / 1e9)
        return outputs
372
+
373
def silence_audio(self, input, timestamp):
    """Return a copy of `input` with each [start, end) span zeroed.

    NOTE(review): the slice zeroes along the *first* dimension — assumes 1-D
    audio (samples-first); confirm against callers if input is multi-channel.
    """
    muted = input.clone()
    for begin, stop in timestamp:
        muted[begin:stop] = 0.0
    return muted
379
+
380
def _step(self, batch, batch_idx, step="train"):
    """Shared train/val step: forward, loss, metric logging.

    Returns (loss, sample) where `sample` is a dict of mixture/output/target
    tensors suitable for wandb audio logging. `step` selects the metric
    prefix and whether per-step loss logging is enabled.
    """
    inputs, targets = batch
    batch_size = inputs["mixture"].shape[0]

    # Per-batch crop range; only the first sample's indices are used,
    # so all samples in the batch are assumed to share the same range.
    start_idx = inputs["start_idx_list"][0].item()
    end_idx = inputs["end_idx_list"][0].item()
    inputs["start_idx"] = start_idx
    inputs["end_idx"] = end_idx

    outputs = self.model(inputs)
    est = outputs["output"].clone()

    if "audio_range" in outputs:
        # Model reports a valid [start, end) sample range per batch item;
        # slice gt/mix/self-speech accordingly so the loss/metrics only
        # cover the region the model actually produced.
        audio_range = outputs["audio_range"]
        start_indices = audio_range[:, 0]  # Shape: [batch]
        end_indices = audio_range[:, 1]
        sliced_gt = []
        sliced_mix = []
        sliced_self = []
        # masked_est_list=[]

        gt_clone = targets["target"].clone()
        mix_clone = inputs["mixture"][:, 0:1].clone()  # first (reference) channel only
        full_self_speech_clone = inputs["self_speech"].clone()

        for index in range(est.size(0)):
            start = start_indices[index].item()
            end = end_indices[index].item()

            sliced_gt.append(gt_clone[index, :, start:end])
            sliced_mix.append(mix_clone[index, :, start:end])
            sliced_self.append(full_self_speech_clone[index, :, start:end])

        # Stack the sliced audio to form the final tensor
        gt = torch.stack(sliced_gt, dim=0)
        mix = torch.stack(sliced_mix, dim=0)
        self_speech_final = torch.stack(sliced_self, dim=0)

    else:
        mix = inputs["mixture"][:, 0:1].clone()
        gt = targets["target"].clone()
        # NOTE(review): this branch reads self_speech from `targets` while the
        # branch above reads it from `inputs` — confirm both dicts carry the
        # same tensor, otherwise one of these is a bug.
        self_speech_final = targets["self_speech"].clone()

    # Compute loss
    loss = self.loss_fn(est=est, gt=gt).mean()

    est_detached = est.detach().clone()

    with torch.no_grad():
        # Log loss
        self.log_metric(
            f"{step}/loss",
            loss.item(),
            batch_size=batch_size,
            on_step=(step == "train"),
            on_epoch=True,
            prog_bar=True,
            sync_dist=True,
        )

        # Log metrics
        for metric in self.metrics:
            # PESQ/STOI are expensive; skip them during training.
            if step == "train" and (metric.name == "PESQ" or metric.name == "STOI"):
                continue
            metric_val = metric(est=est_detached, gt=gt, mix=mix, self_speech=self_speech_final)
            for i in range(batch_size):
                # if gt is all zero, cannot compute metric
                if torch.all(gt[i] == 0):
                    # print(f"Skipping sample {i} in batch because gt is all zeros.")
                    continue
                val = metric_val[i].item()
                self.log_metric(
                    f"{step}/{metric.name}",
                    val,
                    batch_size=1,
                    on_step=False,
                    on_epoch=True,
                    prog_bar=True,
                    sync_dist=True,
                )

    # Create collection of things to show in a sample on wandb
    sample = {
        "mixture": mix,
        "output": est_detached,
        "target": gt,
    }

    return loss, sample
469
+
470
def train(self):
    """Put the wrapper and its model into training mode."""
    self.mode = "train"
    self.model.train()
473
+
474
def eval(self):
    """Put the wrapper and its model into evaluation mode."""
    self.mode = "val"
    self.model.eval()
477
+
478
def training_step(self, batch, batch_idx):
    """Run one training step; returns (loss, batch_size)."""
    loss, sample = self._step(batch, batch_idx, step="train")
    return loss, sample["target"].shape[0]
484
+
485
def validation_step(self, batch, batch_idx):
    """Run one validation step; returns (loss, batch_size)."""
    loss, sample = self._step(batch, batch_idx, step="val")
    return loss, sample["target"].shape[0]
491
+
492
def reset_grad(self):
    """Clear accumulated gradients before the next backward pass."""
    self.optimizer.zero_grad()
494
+
495
def backprop(self):
    """Apply an optimizer step, optionally clipping the global grad norm first."""
    if self.grad_clip is not None:
        # Global-norm clipping over all model parameters.
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
    self.optimizer.step()
504
+
505
def configure_optimizers(self):
    """Return the optimizer (and scheduler config) in Lightning-style format.

    No scheduler -> bare optimizer. ReduceLROnPlateau -> a config dict that
    names the metric to monitor; any other scheduler is passed through as-is.
    """
    if self.scheduler is None:
        return self.optimizer

    if type(self.scheduler) == torch.optim.lr_scheduler.ReduceLROnPlateau:
        # Plateau scheduling needs to know which logged metric to watch.
        scheduler_cfg = dict(
            scheduler=self.scheduler,
            interval="epoch",
            frequency=1,
            monitor=self.monitor,
            strict=False,
        )
    else:
        scheduler_cfg = self.scheduler
    return [self.optimizer], [scheduler_cfg]
521
+
522
def init_scheduler(self, scheduler, scheduler_params):
    """Instantiate the LR scheduler named by `scheduler` (None passes through).

    "sequential" builds a SequentialLR from a list of per-phase specs, where
    each spec's `epochs` extends the cumulative milestone list; the final
    boundary is implied by the total epoch count and dropped.
    """
    if scheduler is None:
        return scheduler

    if scheduler != "sequential":
        return utils.import_attr(scheduler)(self.optimizer, **scheduler_params)

    stages = []
    milestones = []
    total_epochs = 0
    for spec in scheduler_params:
        stages.append(utils.import_attr(spec["name"])(self.optimizer, **spec["params"]))
        total_epochs += spec["epochs"]
        milestones.append(total_epochs)

    # Remove last milestone as it is implied by num epochs.
    milestones.pop()

    return torch.optim.lr_scheduler.SequentialLR(self.optimizer, stages, milestones)
src/losses/.DS_Store ADDED
Binary file (6.15 kB). View file
 
src/losses/SNRLP.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
+ from src.losses.SNRLosses import SNRLosses
6
+ from src.losses.LogPowerLoss import LogPowerLoss
7
+
8
+
9
class SNRLPLoss(nn.Module):
    """Composite loss: an SNR-family loss on active samples plus an L1
    energy penalty on samples whose target is pure silence.

    Args:
        snr_loss_name: which SNR loss variant to use for active samples.
        neg_weight: weight applied to the silent-sample (negative) penalty.
    """

    def __init__(self, snr_loss_name="snr", neg_weight=1) -> None:
        super().__init__()
        self.snr_loss = SNRLosses(snr_loss_name)
        # L1 against an all-zero target acts as an output-energy penalty.
        self.lp_loss = nn.L1Loss()  # LogPowerLoss()
        self.neg_weight = neg_weight

    def forward(self, est: torch.Tensor, gt: torch.Tensor, **kwargs):
        """
        input: (B, C, T) (B, C, T)
        Returns a per-sample loss tensor of shape (B,).
        """
        comp_loss = torch.zeros((est.shape[0]), device=est.device)
        # A sample is "negative" when its ground truth is entirely zero.
        mask = torch.max(torch.max(torch.abs(gt), dim=2)[0], dim=1)[0] == 0

        # mask.any() stays on-device; the previous `any(mask)` iterated the
        # tensor element-by-element in Python (one sync per element on GPU).
        if mask.any():
            # Silent targets: penalize residual output energy.
            neg_loss = self.lp_loss(est[mask], gt[mask])
            comp_loss[mask] = neg_loss * self.neg_weight

        pos_mask = ~mask
        if pos_mask.any():
            # Active targets: standard SNR-family loss.
            comp_loss[pos_mask] = self.snr_loss(est[pos_mask], gt[pos_mask])

        return comp_loss
src/metrics/metrics.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from torchaudio.functional import resample
5
+
6
+ from torchmetrics.functional import(
7
+ scale_invariant_signal_distortion_ratio as si_sdr,
8
+ scale_invariant_signal_noise_ratio as si_snr,
9
+ signal_noise_ratio as snr)
10
+
11
+ from torchmetrics.functional.audio.stoi import short_time_objective_intelligibility as STOI
12
+ from torchmetrics.functional.audio.pesq import perceptual_evaluation_speech_quality as PESQ
13
+ import numpy as np
14
+ import copy
15
+ from src.losses.MultiResoLoss import MultiResoFuseLoss
16
+ from src.losses.Perceptual_Loss import PLCPALoss
17
+
18
def compute_decay(est, mix):
    """Average per-channel power drop (dB) from mixture to estimate.

    est, mix: [*, C, T] torch tensors or numpy arrays (both the same type).
    Returns P_mix - P_est in dB, averaged over channels: shape [*].
    """
    kind = type(est)
    assert type(mix) == kind, "All arrays must be the same type"
    if kind == np.ndarray:
        est, mix = torch.from_numpy(est), torch.from_numpy(mix)

    # Work on copies so callers' tensors are never modified.
    est = est.clone()
    mix = mix.clone()

    power_est = 10 * torch.log10((est ** 2).sum(dim=-1))  # [*, C]
    power_mix = 10 * torch.log10((mix ** 2).sum(dim=-1))  # [*, C]

    return (power_mix - power_est).mean(dim=-1)  # [*]
35
+
36
class Metrics(nn.Module):
    """Name-dispatched audio metric wrapper.

    Builds `self.func` once from `name`; forward evaluates it per channel and
    averages over the channel axis. All metric callables share the signature
    (est, gt, mix, self_speech) even when some arguments are unused.
    """

    def __init__(self, name, fs=24000, **kwargs) -> None:
        super().__init__()
        self.fs = fs
        self.func = None
        self.name = name
        if name == 'snr':
            self.func = lambda est, gt, mix, self_speech: snr(preds=est, target=gt)
        elif name == 'snr_i':
            # "_i" variants are improvement metrics: score(est) - score(mixture).
            self.func = lambda est, gt, mix, self_speech: snr(preds=est, target=gt) - snr(preds=mix, target=gt)
        elif name == 'si_snr':
            self.func = lambda est, gt, mix, self_speech: si_snr(preds=est, target=gt)
        elif name == 'si_snr_i':
            self.func = lambda est, gt, mix, self_speech: si_snr(preds=est, target=gt) - si_snr(preds=mix, target=gt)
        elif name == 'si_sdr':
            self.func = lambda est, gt, mix, self_speech: si_sdr(preds=est, target=gt)
        elif name == 'si_sdr_i':
            self.func = lambda est, gt, mix, self_speech: si_sdr(preds=est, target=gt) - si_sdr(preds=mix, target=gt)
        elif name == 'si_sdr_i_adj':
            # Adjusted baseline: the mixture is compared against gt + wearer's
            # own speech, so self-speech is not counted as interference.
            self.func = lambda est, gt, mix, self_speech: si_sdr(preds=est, target=gt) - si_sdr(preds=mix, target=gt+self_speech)
        elif name == 'STOI':
            self.func = lambda est, gt, mix, self_speech: STOI(preds=est, target=gt, fs=fs)
        elif name == 'PESQ':
            # PESQ is only defined for 8k/16k; resample to 16 kHz narrow-band.
            fs_new = 16000
            self.func = lambda est, gt, mix, self_speech: PESQ(preds=resample(est, fs, fs_new), target=resample(gt, fs, fs_new), fs=fs_new, mode = "nb")
        elif name == 'Multi_Reso_L1':
            mult_ireso_loss = MultiResoFuseLoss(**kwargs)
            self.func = lambda est, gt, mix, self_speech: mult_ireso_loss(est = est, gt = gt)
        elif name == 'PLCPALoss':
            plcpa = PLCPALoss(**kwargs)
            self.func = lambda est, gt, mix, self_speech: plcpa(est = est, gt = gt)
        else:
            raise NotImplementedError(f"Metric {name} not implemented!")

    def forward(self, est, gt, mix, self_speech=None):
        """
        input: (*, C, T)
        output: (*)  — metric averaged over the channel axis.
        PLCPALoss returns a 3-tuple of channel-averaged components instead.
        """
        types = type(est)
        assert type(gt) == types and type(mix) == types, "All arrays must be the same type"
        if types == np.ndarray:
            est, gt, mix = torch.from_numpy(est), torch.from_numpy(gt), torch.from_numpy(mix)

        # Ensure that, no matter what, we do not modify the original arrays
        est = est.clone()
        gt = gt.clone()
        mix = mix.clone()

        if self_speech is not None:
            if type(self_speech) == np.ndarray:
                self_speech = torch.from_numpy(self_speech)
            self_speech = self_speech.clone()

        # print("shape of est in metrics is {}".format(est.shape)) [1, 1, 160000]
        # print("shape of gt is {}".format(gt.shape))
        # print("mix has shape {}".format(mix.shape))

        # per_channel_metrics = self.func(est=est, gt=gt, mix=mix) # [*, C]
        per_channel_metrics = self.func(est=est, gt=gt, mix=mix, self_speech=self_speech)  # [*, C]

        if self.name == "PLCPALoss":
            return per_channel_metrics[0].mean(dim=-1), per_channel_metrics[1].mean(dim=-1), per_channel_metrics[2].mean(dim=-1)
        else:
            return per_channel_metrics.mean(dim=-1)  # [*]
src/models/blocks/model1_block.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import time
3
+ from collections import OrderedDict
4
+ from typing import Dict, List, Optional, Tuple
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from espnet2.torch_utils.get_layer_from_string import get_layer
10
+ from torch.nn import init
11
+ from torch.nn.parameter import Parameter
12
+ import src.utils as utils
13
+
14
+
15
class Lambda(nn.Module):
    """Wrap a bare lambda as an nn.Module so it can sit inside nn.Sequential."""

    def __init__(self, lambd):
        super().__init__()
        import types

        self.lambd = lambd
        # Only genuine lambdas are accepted, matching the construction sites.
        assert type(lambd) is types.LambdaType

    def forward(self, x):
        # Delegate straight to the wrapped callable.
        return self.lambd(x)
25
+
26
+
27
class LayerNormPermuted(nn.LayerNorm):
    """LayerNorm over the channel axis of a channels-first [B, C, T, F] tensor.

    nn.LayerNorm normalizes trailing dims, so the input is moved to
    channels-last, normalized, and moved back.
    """

    def forward(self, x):
        """
        Args:
            x: [B, C, T, F]
        """
        channels_last = x.permute(0, 2, 3, 1)  # [B, T, F, C]
        normed = super().forward(channels_last)
        return normed.permute(0, 3, 1, 2)  # [B, C, T, F]
40
+
41
+
42
+ # Use native layernorm implementation
43
class LayerNormalization4D(nn.Module):
    """Thin wrapper around nn.LayerNorm over the trailing channel dimension."""

    def __init__(self, C, eps=1e-5, preserve_outdim=False):
        super().__init__()
        self.preserve_outdim = preserve_outdim  # kept for interface compat; unused here
        self.norm = nn.LayerNorm(C, eps=eps)

    def forward(self, x: torch.Tensor):
        """Normalize input of shape (*, C) over its last axis."""
        return self.norm(x)
55
+
56
+
57
class LayerNormalization4DCF(nn.Module):
    """LayerNorm over a flattened (freq * channel) feature axis."""

    def __init__(self, input_dimension, eps=1e-5):
        assert len(input_dimension) == 2
        n_freqs, n_chan = input_dimension
        super().__init__()
        self.norm = nn.LayerNorm(n_freqs * n_chan, eps=eps)

    def forward(self, x: torch.Tensor):
        """
        input: (B, T, Q * C); normalized over the last axis.
        """
        return self.norm(x)
71
+
72
+
73
class LayerNormalization4D_old(nn.Module):
    """Hand-rolled per-channel LayerNorm for [B, C, T, F] tensors (stats over dim 1)."""

    def __init__(self, input_dimension, eps=1e-5):
        super().__init__()
        shape = (1, input_dimension, 1, 1)
        # Affine parameters: scale starts at 1, shift at 0.
        self.gamma = Parameter(torch.ones(shape, dtype=torch.float32))
        self.beta = Parameter(torch.zeros(shape, dtype=torch.float32))
        self.eps = eps

    def forward(self, x):
        if x.ndim != 4:
            raise ValueError("Expect x to have 4 dimensions, but got {}".format(x.ndim))
        mean = x.mean(dim=(1,), keepdim=True)  # [B,1,T,F]
        std = torch.sqrt(x.var(dim=(1,), unbiased=False, keepdim=True) + self.eps)  # [B,1,T,F]
        return (x - mean) / std * self.gamma + self.beta
93
+
94
+
95
def mod_pad(x, chunk_size, pad):
    """Right-pad x (last dim) to a multiple of chunk_size, then apply `pad`.

    Returns the padded tensor and the number of alignment samples added
    (excluding the caller-specified `pad`).
    """
    remainder = x.shape[-1] % chunk_size
    mod = chunk_size - remainder if remainder != 0 else 0

    x = F.pad(x, (0, mod))  # align to an integer number of chunks
    x = F.pad(x, pad)       # caller-specified extra padding

    return x, mod
106
+
107
+
108
class Attention_STFT_causal(nn.Module):
    """Causal multi-head self-attention over time for [B, T, Q, C] features.

    Q/K projections use a reduced per-head dim E (derived from approx_qk_dim);
    V keeps emb_dim // n_head per head. local_context_len limits how far back
    each frame may attend (-1 = unlimited causal context). With
    dim_feedforward == -1 the output goes through a concat projection,
    otherwise through a residual feed-forward block + LayerNorm.
    """

    def __getitem__(self, key):
        # Allows self["attn_conv_Q"]-style access to submodules.
        return getattr(self, key)

    def __init__(
        self,
        emb_dim,
        n_freqs,
        approx_qk_dim=512,
        n_head=4,
        activation="prelu",
        eps=1e-5,
        skip_conn=True,
        use_flash_attention=False,
        dim_feedforward=-1,
        local_context_len=-1,
        # 6
    ):
        super().__init__()
        self.position_code = utils.PositionalEncoding(emb_dim * n_freqs, max_len=5000)

        self.skip_conn = skip_conn
        self.n_freqs = n_freqs
        self.E = math.ceil(approx_qk_dim * 1.0 / n_freqs)  # approx_qk_dim is only approximate
        self.n_head = n_head
        self.V_dim = emb_dim // n_head
        self.emb_dim = emb_dim
        assert emb_dim % n_head == 0
        E = self.E

        self.use_flash_attention = use_flash_attention

        self.local_context_len = local_context_len

        self.add_module(
            "attn_conv_Q",
            nn.Sequential(
                nn.Linear(emb_dim, E * n_head),  # [B, T, Q, HE]
                get_layer(activation)(),
                # [B, T, Q, H, E] -> [B, H, T, Q, E] -> [B * H, T, Q * E]
                Lambda(
                    lambda x: x.reshape(x.shape[0], x.shape[1], x.shape[2], n_head, E)
                    .permute(0, 3, 1, 2, 4)
                    .reshape(x.shape[0] * n_head, x.shape[1], x.shape[2] * E)
                ),  # (BH, T, Q * E)
                LayerNormalization4DCF((n_freqs, E), eps=eps),
            ),
        )
        self.add_module(
            "attn_conv_K",
            nn.Sequential(
                nn.Linear(emb_dim, E * n_head),
                get_layer(activation)(),
                Lambda(
                    lambda x: x.reshape(x.shape[0], x.shape[1], x.shape[2], n_head, E)
                    .permute(0, 3, 1, 2, 4)
                    .reshape(x.shape[0] * n_head, x.shape[1], x.shape[2] * E)
                ),
                LayerNormalization4DCF((n_freqs, E), eps=eps),
            ),
        )
        self.add_module(
            "attn_conv_V",
            nn.Sequential(
                nn.Linear(emb_dim, (emb_dim // n_head) * n_head),
                get_layer(activation)(),
                Lambda(
                    lambda x: x.reshape(x.shape[0], x.shape[1], x.shape[2], n_head, (emb_dim // n_head))
                    .permute(0, 3, 1, 2, 4)
                    .reshape(x.shape[0] * n_head, x.shape[1], x.shape[2] * (emb_dim // n_head))
                ),
                LayerNormalization4DCF((n_freqs, emb_dim // n_head), eps=eps),
            ),
        )

        self.dim_feedforward = dim_feedforward

        if dim_feedforward == -1:
            # Head-concat projection path (original GridNet-style output).
            self.add_module(
                "attn_concat_proj",
                nn.Sequential(
                    nn.Linear(emb_dim, emb_dim),
                    get_layer(activation)(),
                    Lambda(lambda x: x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3])),
                    LayerNormalization4DCF((n_freqs, emb_dim), eps=eps),
                ),
            )
        else:
            # Transformer-style residual feed-forward path.
            self.linear1 = nn.Linear(emb_dim, dim_feedforward)
            self.dropout = nn.Dropout(p=0.1)
            self.activation = nn.ReLU()
            self.linear2 = nn.Linear(dim_feedforward, emb_dim)
            self.dropout2 = nn.Dropout(p=0.1)
            self.norm = LayerNormalization4DCF((n_freqs, emb_dim), eps=eps)

    def _ff_block(self, x):
        """Position-wise feed-forward: linear -> ReLU -> dropout -> linear -> dropout."""
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)

    def get_lookahead_mask(self, seq_len, device):
        """Build a [T, T] boolean mask: True where attention is allowed.

        Causal (lower-triangular) everywhere; when local_context_len >= 0
        the band is additionally limited to that many past frames.
        """
        if self.local_context_len == -1:
            mask = (torch.triu(torch.ones((seq_len, seq_len), device=device)) == 1).transpose(0, 1)

            return mask.detach().to(device)

        else:
            # Intersection of "causal" and "within local_context_len" bands.
            mask1 = torch.triu(torch.ones((seq_len, seq_len), device=device)) == 1
            mask2 = torch.triu(torch.ones((seq_len, seq_len), device=device), diagonal=self.local_context_len) == 0
            mask = (mask1 * mask2).transpose(0, 1)

            return mask.detach().to(device)

    def forward(self, batch):
        ### input/output B T F C
        # attention
        inputs = batch
        B0, T0, Q0, C0 = batch.shape

        # positional encoding
        pos_code = self.position_code(batch)  # 1, T, embed_dim
        _, T, QC = pos_code.shape
        pos_code = pos_code.reshape(1, T, Q0, C0)
        batch = batch + pos_code

        Q = self["attn_conv_Q"](batch)  # [B', T, Q * C]
        K = self["attn_conv_K"](batch)  # [B', T, Q * C]
        V = self["attn_conv_V"](batch)  # [B', T, Q * C]

        emb_dim = Q.shape[-1]

        local_mask = self.get_lookahead_mask(batch.shape[1], batch.device)

        # Scaled dot-product attention with causal masking.
        attn_mat = torch.matmul(Q, K.transpose(1, 2)) / (emb_dim**0.5)  # [B', T, T]
        attn_mat.masked_fill_(local_mask == 0, -float("Inf"))
        attn_mat = F.softmax(attn_mat, dim=2)  # [B', T, T]

        V = torch.matmul(attn_mat, V)  # [B', T, Q*C]
        V = V.reshape(-1, T0, V.shape[-1])  # [BH, T, Q * C]
        V = V.transpose(1, 2)  # [B', Q * C, T]

        # Un-fold heads back into the channel dimension.
        batch = V.reshape(B0, self.n_head, self.n_freqs, self.V_dim, T0)  # [B, H, Q, C, T]
        batch = batch.transpose(2, 3)  # [B, H, C, Q, T]
        batch = batch.reshape(B0, self.n_head * self.V_dim, self.n_freqs, T0)  # [B, HC, Q, T]
        batch = batch.permute(0, 3, 2, 1)  # [B, T, Q, C]

        if self.dim_feedforward == -1:
            batch = self["attn_concat_proj"](batch)  # [B, T, Q * C]
        else:
            batch = batch + self._ff_block(batch)  # [B, T, Q, C]
            batch = batch.reshape(batch.shape[0], batch.shape[1], batch.shape[2] * batch.shape[3])
            batch = self.norm(batch)
            batch = batch.reshape(batch.shape[0], batch.shape[1], Q0, C0)  # [B, T, Q, C])

        # Add batch if attention is performed
        if self.skip_conn:
            return batch + inputs
        else:
            return batch
267
+
268
+
269
class GridNetBlock(nn.Module):
    """One GridNet-style block: intra-frequency BiLSTM, chunked inter-time
    BiLSTM, then causal attention over pooled chunk summaries.

    Input/output: [B, C, T, Q] (channels, time, frequency). When `last` is
    True the block returns the pooled, attended chunk features directly
    instead of broadcasting them back onto the full time resolution.
    """

    def __getitem__(self, key):
        # Allows self["name"]-style access to submodules.
        return getattr(self, key)

    def __init__(
        self,
        emb_dim,
        emb_ks,
        emb_hs,
        n_freqs,
        hidden_channels,
        lstm_fold_chunk,
        n_head=4,
        approx_qk_dim=512,
        activation="prelu",
        eps=1e-5,
        pool="mean",
        last=False,
        local_context_len=-1,
        # 6
    ):
        super().__init__()
        bidirectional = True  # bidirectional within the intra frame lstm

        self.global_atten_causal = True

        self.last = last

        self.pool = pool

        # Time-axis chunk length for the inter RNN fold.
        self.lstm_fold_chunk = lstm_fold_chunk
        self.E = math.ceil(approx_qk_dim * 1.0 / n_freqs)  # approx_qk_dim is only approximate

        self.V_dim = emb_dim // n_head
        self.H = hidden_channels
        in_channels = emb_dim * emb_ks
        self.in_channels = in_channels
        self.n_freqs = n_freqs

        ## intra RNN can be optimized by conv or linear because the frequence length are not very large
        self.intra_norm = LayerNormalization4D_old(emb_dim, eps=eps)
        self.intra_rnn = nn.LSTM(in_channels, hidden_channels, 1, batch_first=True, bidirectional=True)
        self.intra_linear = nn.ConvTranspose1d(hidden_channels * 2, emb_dim, emb_ks, stride=emb_hs)
        self.emb_dim = emb_dim
        self.emb_ks = emb_ks
        self.emb_hs = emb_hs

        # inter RNN
        self.inter_norm = LayerNormalization4D_old(emb_dim, eps=eps)
        self.inter_rnn = nn.LSTM(in_channels, hidden_channels, 1, batch_first=True, bidirectional=bidirectional)
        self.inter_linear = nn.ConvTranspose1d(hidden_channels * (bidirectional + 1), emb_dim, emb_ks, stride=emb_hs)

        # attention
        self.pool_atten_causal = Attention_STFT_causal(
            emb_dim=emb_dim,
            n_freqs=n_freqs,
            approx_qk_dim=approx_qk_dim,
            n_head=n_head,
            activation=activation,
            eps=eps,
            local_context_len=local_context_len,
        )

    def _unfold_timedomain(self, x):
        """Split the time axis into fixed-size chunks: [BQ, C, T] -> [BQ, Num_chunk, chunk, C]."""
        BQ, C, T = x.shape
        x = torch.split(x, self.lstm_fold_chunk, dim=-1)  # [Num_chunk, BQ, C, 100]
        x = torch.cat(x, dim=0).reshape(-1, BQ, C, self.lstm_fold_chunk)  # [Num_chunk, BQ, C, 100]
        x = x.permute(1, 0, 3, 2)  # [BQ, Num_chunk, 100, C]
        return x

    def forward(self, x, init_state=None):
        """GridNetBlock Forward.

        Args:
            x: [B, C, T, Q]
            out: [B, C, T, Q]
        """
        B, C, old_T, old_Q = x.shape
        # Pad T and Q so the kernel/stride tiling divides evenly.
        T = math.ceil((old_T - self.emb_ks) / self.emb_hs) * self.emb_hs + self.emb_ks
        Q = math.ceil((old_Q - self.emb_ks) / self.emb_hs) * self.emb_hs + self.emb_ks
        x = F.pad(x, (0, Q - old_Q, 0, T - old_T))

        # ===========================Intra RNN start================================
        # define intra RNN
        input_ = x
        intra_rnn = self.intra_norm(input_)  # [B, C, T, Q]
        intra_rnn = intra_rnn.transpose(1, 2).contiguous().view(B * T, C, Q)  # [BT, C, Q]

        intra_rnn = torch.split(intra_rnn, self.emb_ks, dim=-1)  # [Q/I, BT, C, I]
        intra_rnn = torch.stack(intra_rnn, dim=0)
        intra_rnn = intra_rnn.permute(1, 2, 3, 0).flatten(1, 2)  # [BT, CI, Q/I]
        intra_rnn = intra_rnn.transpose(1, 2)  # [BT, -1, nC*emb_ks]
        self.intra_rnn.flatten_parameters()

        # apply intra frame LSTM
        intra_rnn, _ = self.intra_rnn(intra_rnn)  # [BT, -1, H]
        intra_rnn = intra_rnn.transpose(1, 2)  # [BT, H, -1]
        intra_rnn = self.intra_linear(intra_rnn)  # [BT, C, Q]
        intra_rnn = intra_rnn.view([B, T, C, Q])
        intra_rnn = intra_rnn.transpose(1, 2).contiguous()  # [B, C, T, Q]
        intra_rnn = intra_rnn + input_  # [B, C, T, Q]  (residual)
        intra_rnn = intra_rnn[:, :, :, :old_Q]  # [B, C, T, Q]  drop frequency padding
        Q = old_Q
        # ===========================Intra RNN end================================


        # ===========================Inter RNN start================================
        # fold the time domain to chunk
        inter_rnn = self.inter_norm(intra_rnn)  # [B, C, T, F]
        inter_rnn = inter_rnn.permute(0, 3, 1, 2).contiguous().view(B * Q, C, T)  # [BF, C, T]


        inter_rnn = self._unfold_timedomain(inter_rnn)  ### BQ, NUM_CHUNK, CHUNK_SIZE, C

        BQ, NUM_CHUNK, CHUNKSIZE, C = inter_rnn.shape

        inter_rnn = inter_rnn.reshape(BQ * NUM_CHUNK, CHUNKSIZE, C)  ### BQ* NUM_CHUNK, CHUNK_SIZE, C
        inter_rnn = inter_rnn.transpose(2, 1)  # [B, C, T]
        input_ = inter_rnn

        inter_rnn = torch.split(inter_rnn, self.emb_ks, dim=-1)

        inter_rnn = torch.stack(inter_rnn, dim=0)
        inter_rnn = inter_rnn.permute(1, 2, 3, 0)

        BF, C, EO, _T = inter_rnn.shape
        inter_rnn = inter_rnn.reshape(BF, C * EO, _T)

        inter_rnn = inter_rnn.transpose(1, 2)

        self.inter_rnn.flatten_parameters()
        inter_rnn, _ = self.inter_rnn(inter_rnn)  # [BF, -1, H]
        inter_rnn = inter_rnn.transpose(1, 2)  # [BF, H, -1]
        inter_rnn = self.inter_linear(inter_rnn)  # [BF, C, T]
        inter_rnn = inter_rnn + input_  # [BQ* NUM_CHUNK, C, T]  (residual)

        inter_rnn = inter_rnn.reshape(B, Q, NUM_CHUNK, C, CHUNKSIZE)
        inter_rnn = inter_rnn.permute(0, 1, 2, 4, 3)  # B, Q, NUM_CHUNK, CHUNKSIZE, C

        input_ = inter_rnn  # B, Q, NUM_CHUNK, CHUNKSIZE, C
        # Pool each chunk down to a single summary frame for the attention.
        if self.pool == "mean":
            inter_rnn = torch.mean(inter_rnn, dim=3)  # B, Q, NUM_CHUNK, C
        elif self.pool == "max":
            inter_rnn, _ = torch.max(inter_rnn, dim=3)  # B, Q, NUM_CHUNK, C
        else:
            raise ValueError("INvalid pool type!")
        # ===========================Inter RNN end================================

        # ===========================attention start================================
        inter_rnn = inter_rnn.transpose(1, 2)  # B, NUM_CHUNK, Q, C
        inter_rnn = self.pool_atten_causal(inter_rnn)  # B T Q C
        inter_rnn = inter_rnn.transpose(1, 2)  # B Q T C

        if self.last == True:
            # Final block: return the chunk-rate features directly.
            return inter_rnn, init_state

        else:
            # Broadcast each attended chunk summary back over its chunk
            # (residual add), then restore [B, C, T, Q] at full time rate.
            inter_rnn = inter_rnn.unsqueeze(3)
            inter_rnn = input_ + inter_rnn  # B, Q, NUM_CHUNK, CHUNKSIZE, C

            inter_rnn = inter_rnn.reshape(B, Q, T, C)
            inter_rnn = inter_rnn.permute(0, 3, 2, 1)  # B C T Q
            inter_rnn = inter_rnn[..., :old_T, :]  # drop time padding
            # ===========================attention end================================

        return inter_rnn, init_state
src/models/blocks/model2_block.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import time
3
+ from collections import OrderedDict
4
+ from typing import Dict, List, Optional, Tuple
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from espnet2.torch_utils.get_layer_from_string import get_layer
10
+ from torch.nn import init
11
+ from torch.nn.parameter import Parameter
12
+ import src.utils as utils
13
+
14
+
15
class Lambda(nn.Module):
    """Wrap a bare lambda as an nn.Module so it can sit inside nn.Sequential."""

    def __init__(self, lambd):
        super().__init__()
        import types

        self.lambd = lambd
        # Only genuine lambdas are accepted, matching the construction sites.
        assert type(lambd) is types.LambdaType

    def forward(self, x):
        # Delegate straight to the wrapped callable.
        return self.lambd(x)
25
+
26
+
27
class LayerNormPermuted(nn.LayerNorm):
    """LayerNorm over the channel axis of a channels-first [B, C, T, F] tensor.

    nn.LayerNorm normalizes trailing dims, so the input is moved to
    channels-last, normalized, and moved back.
    """

    def forward(self, x):
        """
        Args:
            x: [B, C, T, F]
        """
        channels_last = x.permute(0, 2, 3, 1)  # [B, T, F, C]
        normed = super().forward(channels_last)
        return normed.permute(0, 3, 1, 2)  # [B, C, T, F]
40
+
41
+
42
+ # Use native layernorm implementation
43
class LayerNormalization4D(nn.Module):
    """Thin wrapper around nn.LayerNorm over the trailing channel dimension."""

    def __init__(self, C, eps=1e-5, preserve_outdim=False):
        super().__init__()
        self.preserve_outdim = preserve_outdim  # kept for interface compat; unused here
        self.norm = nn.LayerNorm(C, eps=eps)

    def forward(self, x: torch.Tensor):
        """Normalize input of shape (*, C) over its last axis."""
        return self.norm(x)
55
+
56
+
57
class LayerNormalization4DCF(nn.Module):
    """LayerNorm over a flattened (freq * channel) feature axis."""

    def __init__(self, input_dimension, eps=1e-5):
        assert len(input_dimension) == 2
        n_freqs, n_chan = input_dimension
        super().__init__()
        self.norm = nn.LayerNorm(n_freqs * n_chan, eps=eps)

    def forward(self, x: torch.Tensor):
        """
        input: (B, T, Q * C); normalized over the last axis.
        """
        return self.norm(x)
71
+
72
+
73
class LayerNormalization4D_old(nn.Module):
    """Hand-rolled per-channel LayerNorm for [B, C, T, F] tensors (stats over dim 1)."""

    def __init__(self, input_dimension, eps=1e-5):
        super().__init__()
        shape = (1, input_dimension, 1, 1)
        # Affine parameters: scale starts at 1, shift at 0.
        self.gamma = Parameter(torch.ones(shape, dtype=torch.float32))
        self.beta = Parameter(torch.zeros(shape, dtype=torch.float32))
        self.eps = eps

    def forward(self, x):
        if x.ndim != 4:
            raise ValueError("Expect x to have 4 dimensions, but got {}".format(x.ndim))
        mean = x.mean(dim=(1,), keepdim=True)  # [B,1,T,F]
        std = torch.sqrt(x.var(dim=(1,), unbiased=False, keepdim=True) + self.eps)  # [B,1,T,F]
        return (x - mean) / std * self.gamma + self.beta
93
+
94
+
95
def mod_pad(x, chunk_size, pad):
    """Right-pad x (last dim) to a multiple of chunk_size, then apply `pad`.

    Returns the padded tensor and the number of alignment samples added
    (excluding the caller-specified `pad`).
    """
    remainder = x.shape[-1] % chunk_size
    mod = chunk_size - remainder if remainder != 0 else 0

    x = F.pad(x, (0, mod))  # align to an integer number of chunks
    x = F.pad(x, pad)       # caller-specified extra padding

    return x, mod
106
+
107
+
108
class Attention_STFT_causal(nn.Module):
    """Causal multi-head self-attention over the time axis of an STFT-domain
    feature map.

    Input and output are shaped [B, T, Q, C], where Q is the number of
    frequency bins (``n_freqs``) and C the per-bin embedding dimension
    (``emb_dim``).  Frequency bins are flattened into the feature axis, so
    attention runs across time frames only; a lower-triangular mask prevents
    any frame from attending to the future.
    """

    def __getitem__(self, key):
        # Lets submodules registered via add_module be fetched as self["name"].
        return getattr(self, key)

    def __init__(
        self,
        emb_dim,
        n_freqs,
        approx_qk_dim=512,
        n_head=4,
        activation="prelu",
        eps=1e-5,
        skip_conn=True,
        use_flash_attention=False,  # NOTE(review): accepted but never used below — confirm intent
        dim_feedforward=-1,
    ):
        super().__init__()
        # Sinusoidal positional code over the flattened (Q * C) feature axis.
        self.position_code = utils.PositionalEncoding(emb_dim * n_freqs, max_len=5000)

        self.skip_conn = skip_conn
        self.n_freqs = n_freqs
        # Per-head Q/K width; approx_qk_dim is only approximate because of the ceil.
        self.E = math.ceil(approx_qk_dim * 1.0 / n_freqs)
        self.n_head = n_head
        self.V_dim = emb_dim // n_head  # per-head value width
        self.emb_dim = emb_dim
        assert emb_dim % n_head == 0
        E = self.E

        # Q/K/V projections: Linear over the channel axis, then fold the head
        # dimension into the batch axis: [B, T, Q, H*E] -> [B*H, T, Q*E].
        self.add_module(
            "attn_conv_Q",
            nn.Sequential(
                nn.Linear(emb_dim, E * n_head),  # [B, T, Q, HE]
                get_layer(activation)(),
                # [B, T, Q, H, E] -> [B, H, T, Q, E] -> [B * H, T, Q * E]
                Lambda(
                    lambda x: x.reshape(x.shape[0], x.shape[1], x.shape[2], n_head, E)
                    .permute(0, 3, 1, 2, 4)
                    .reshape(x.shape[0] * n_head, x.shape[1], x.shape[2] * E)
                ),  # (BH, T, Q * E)
                LayerNormalization4DCF((n_freqs, E), eps=eps),
            ),
        )
        self.add_module(
            "attn_conv_K",
            nn.Sequential(
                nn.Linear(emb_dim, E * n_head),
                get_layer(activation)(),
                Lambda(
                    lambda x: x.reshape(x.shape[0], x.shape[1], x.shape[2], n_head, E)
                    .permute(0, 3, 1, 2, 4)
                    .reshape(x.shape[0] * n_head, x.shape[1], x.shape[2] * E)
                ),
                LayerNormalization4DCF((n_freqs, E), eps=eps),
            ),
        )
        self.add_module(
            "attn_conv_V",
            nn.Sequential(
                nn.Linear(emb_dim, (emb_dim // n_head) * n_head),
                get_layer(activation)(),
                Lambda(
                    lambda x: x.reshape(x.shape[0], x.shape[1], x.shape[2], n_head, (emb_dim // n_head))
                    .permute(0, 3, 1, 2, 4)
                    .reshape(x.shape[0] * n_head, x.shape[1], x.shape[2] * (emb_dim // n_head))
                ),
                LayerNormalization4DCF((n_freqs, emb_dim // n_head), eps=eps),
            ),
        )

        self.dim_feedforward = dim_feedforward

        if dim_feedforward == -1:
            # Plain output projection applied after the heads are re-assembled.
            self.add_module(
                "attn_concat_proj",
                nn.Sequential(
                    nn.Linear(emb_dim, emb_dim),
                    get_layer(activation)(),
                    Lambda(lambda x: x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3])),
                    LayerNormalization4DCF((n_freqs, emb_dim), eps=eps),
                ),
            )
        else:
            # Transformer-style position-wise feed-forward block instead of a
            # single output projection.
            self.linear1 = nn.Linear(emb_dim, dim_feedforward)
            self.dropout = nn.Dropout(p=0.1)
            self.activation = nn.ReLU()
            self.linear2 = nn.Linear(dim_feedforward, emb_dim)
            self.dropout2 = nn.Dropout(p=0.1)
            self.norm = LayerNormalization4DCF((n_freqs, emb_dim), eps=eps)

    def _ff_block(self, x):
        # Position-wise feed-forward: linear -> ReLU -> dropout -> linear -> dropout.
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)

    def get_lookahead_mask(self, seq_len, device):
        """Create a lower-triangular boolean mask (True = may attend).

        Arguments
        ---------
        seq_len: int
            Length of the sequence.
        device: torch.device
            The device on which to create the mask.

        Returns a [seq_len, seq_len] boolean tensor; position (i, j) is True
        iff j <= i, i.e. frame i may only attend to itself and the past.
        """
        mask = (torch.triu(torch.ones((seq_len, seq_len), device=device)) == 1).transpose(0, 1)

        return mask.detach().to(device)

    def forward(self, batch):
        """Apply causal self-attention.  Input/output: [B, T, Q, C]."""
        inputs = batch
        B0, T0, Q0, C0 = batch.shape

        # Additive sinusoidal positional encoding, broadcast over the batch.
        pos_code = self.position_code(batch)  # (1, T, Q * C)
        _, T, QC = pos_code.shape
        pos_code = pos_code.reshape(1, T, Q0, C0)
        batch = batch + pos_code

        Q = self["attn_conv_Q"](batch)  # [B*H, T, Q * E]
        K = self["attn_conv_K"](batch)  # [B*H, T, Q * E]
        V = self["attn_conv_V"](batch)  # [B*H, T, Q * V_dim]

        emb_dim = Q.shape[-1]

        # Lower-triangular mask: False entries are future frames.
        local_mask = self.get_lookahead_mask(batch.shape[1], batch.device)

        # Scaled dot-product attention over time.
        attn_mat = torch.matmul(Q, K.transpose(1, 2)) / (emb_dim**0.5)  # [B*H, T, T]
        attn_mat.masked_fill_(local_mask == 0, -float("Inf"))  # block attention to the future
        attn_mat = F.softmax(attn_mat, dim=2)  # [B*H, T, T]

        V = torch.matmul(attn_mat, V)  # [B*H, T, Q*C']
        V = V.reshape(-1, T0, V.shape[-1])  # [B*H, T, Q * C']
        V = V.transpose(1, 2)  # [B*H, Q * C', T]

        # Undo the head folding: [B*H, Q*C', T] -> [B, T, Q, H*C'].
        batch = V.reshape(B0, self.n_head, self.n_freqs, self.V_dim, T0)  # [B, H, Q, C', T]
        batch = batch.transpose(2, 3)  # [B, H, C', Q, T]
        batch = batch.reshape(B0, self.n_head * self.V_dim, self.n_freqs, T0)  # [B, HC', Q, T]
        batch = batch.permute(0, 3, 2, 1)  # [B, T, Q, C]

        if self.dim_feedforward == -1:
            batch = self["attn_concat_proj"](batch)  # [B, T, Q * C]
        else:
            batch = batch + self._ff_block(batch)  # residual feed-forward, [B, T, Q, C]
            batch = batch.reshape(batch.shape[0], batch.shape[1], batch.shape[2] * batch.shape[3])
            batch = self.norm(batch)
        batch = batch.reshape(batch.shape[0], batch.shape[1], Q0, C0)  # [B, T, Q, C]

        # Residual connection around the whole attention block.
        if self.skip_conn:
            return batch + inputs
        else:
            return batch
276
+
277
+
278
class GridNetBlock(nn.Module):
    """One TF-GridNet-style block: intra-frame (frequency) BLSTM, inter-frame
    (time) uni-directional LSTM, and optional causal self-attention over time.

    Input/output layout: [B, C, T, Q] (channels, time frames, frequency bins).
    """

    def __getitem__(self, key):
        # Lets submodules be fetched as self["name"].
        return getattr(self, key)

    def __init__(
        self,
        emb_dim,
        emb_ks,
        emb_hs,
        n_freqs,
        hidden_channels,
        n_head=4,
        approx_qk_dim=512,
        activation="prelu",
        eps=1e-5,
        pool="mean",
        use_attention=False,
    ):
        super().__init__()
        # The inter (time) LSTM is uni-directional to keep the block causal.
        bidirectional = False

        self.global_atten_causal = True

        self.pool = pool

        self.E = math.ceil(approx_qk_dim * 1.0 / n_freqs)  # approx_qk_dim is only approximate

        self.V_dim = emb_dim // n_head
        self.H = hidden_channels
        in_channels = emb_dim * emb_ks
        self.in_channels = in_channels
        self.n_freqs = n_freqs

        # Intra-frame path: bi-directional LSTM along frequency.  Being
        # non-causal in frequency is fine; causality only matters along time.
        # (Could be replaced by a conv/linear since n_freqs is small.)
        self.intra_norm = LayerNormalization4D_old(emb_dim, eps=eps)
        self.intra_rnn = nn.LSTM(in_channels, hidden_channels, 1, batch_first=True, bidirectional=True)
        self.intra_linear = nn.ConvTranspose1d(hidden_channels * 2, emb_dim, emb_ks, stride=emb_hs)
        self.emb_dim = emb_dim
        self.emb_ks = emb_ks
        self.emb_hs = emb_hs

        # Inter-frame path: uni-directional LSTM along time.
        # NOTE(review): the inter path feeds features of width emb_dim to an
        # LSTM built with input size emb_dim * emb_ks — this only matches when
        # emb_ks == 1 (and T is preserved only when emb_hs == 1); confirm the
        # configs always use emb_ks == emb_hs == 1.
        self.inter_norm = LayerNormalization4D_old(emb_dim, eps=eps)
        self.inter_rnn = nn.LSTM(in_channels, hidden_channels, 1, batch_first=True, bidirectional=bidirectional)
        self.inter_linear = nn.ConvTranspose1d(hidden_channels * (bidirectional + 1), emb_dim, emb_ks, stride=emb_hs)

        # Optional causal self-attention over time.
        self.use_attention = use_attention

        if self.use_attention:
            self.pool_atten_causal = Attention_STFT_causal(
                emb_dim=emb_dim,
                n_freqs=n_freqs,
                approx_qk_dim=approx_qk_dim,
                n_head=n_head,
                activation=activation,
                eps=eps,
            )

    def init_buffers(self, batch_size, device):
        # This block keeps no streaming state of its own.
        return None

    def forward(self, x, init_state=None):
        """GridNetBlock Forward.

        Args:
            x: [B, C, T, Q]
            init_state: opaque streaming state, passed through unchanged.
        Returns:
            (out, init_state) with out: [B, C, T, Q]
        """
        B, C, old_T, old_Q = x.shape
        # Pad T and Q up so unfolding with (emb_ks, emb_hs) tiles them exactly.
        T = math.ceil((old_T - self.emb_ks) / self.emb_hs) * self.emb_hs + self.emb_ks
        Q = math.ceil((old_Q - self.emb_ks) / self.emb_hs) * self.emb_hs + self.emb_ks
        x = F.pad(x, (0, Q - old_Q, 0, T - old_T))

        # ===========================Intra RNN start================================
        input_ = x
        intra_rnn = self.intra_norm(input_)  # [B, C, T, Q]
        intra_rnn = intra_rnn.transpose(1, 2).contiguous().view(B * T, C, Q)  # [BT, C, Q]

        # Unfold frequency into chunks of emb_ks bins.
        intra_rnn = torch.split(intra_rnn, self.emb_ks, dim=-1)  # [Q/I, BT, C, I]
        intra_rnn = torch.stack(intra_rnn, dim=0)
        intra_rnn = intra_rnn.permute(1, 2, 3, 0).flatten(1, 2)  # [BT, CI, Q/I]
        intra_rnn = intra_rnn.transpose(1, 2)  # [BT, -1, nC*emb_ks]
        self.intra_rnn.flatten_parameters()

        # Bi-directional LSTM across the frequency chunks.
        intra_rnn, _ = self.intra_rnn(intra_rnn)  # [BT, -1, H]
        intra_rnn = intra_rnn.transpose(1, 2)  # [BT, H, -1]
        intra_rnn = self.intra_linear(intra_rnn)  # [BT, C, Q]
        intra_rnn = intra_rnn.view([B, T, C, Q])
        intra_rnn = intra_rnn.transpose(1, 2).contiguous()  # [B, C, T, Q]
        intra_rnn = intra_rnn + input_  # residual, [B, C, T, Q]
        intra_rnn = intra_rnn[:, :, :, :old_Q]  # drop the frequency padding
        Q = old_Q
        # ===========================Intra RNN end================================

        # ===========================Inter RNN start================================
        input_ = intra_rnn

        inter_rnn = self.inter_norm(intra_rnn)  # [B, C, T, Q]
        inter_rnn = inter_rnn.transpose(1, 3).reshape(B * Q, T, C)  # [BQ, T, C]

        self.inter_rnn.flatten_parameters()
        # Uni-directional (causal) LSTM across time frames.
        inter_rnn, _ = self.inter_rnn(inter_rnn)  # [BQ, T, H]
        inter_rnn = inter_rnn.transpose(1, 2)  # [BQ, H, T]
        inter_rnn = self.inter_linear(inter_rnn)  # [BQ, C, T]

        _, new_C, new_T = inter_rnn.shape
        inter_rnn = inter_rnn.reshape(B, Q, new_C, new_T)
        inter_rnn = inter_rnn.permute(0, 2, 3, 1)  # [B, C, T, Q]
        inter_rnn = inter_rnn + input_  # residual
        # ===========================Inter RNN end================================

        # ===========================attention start================================
        if self.use_attention:
            out = inter_rnn  # [B, C, T, Q]

            # Attention expects channels-last [B, T, Q, C].
            inter_rnn = inter_rnn.permute(0, 2, 3, 1)
            inter_rnn = self.pool_atten_causal(inter_rnn)  # [B, T, Q, C]
            inter_rnn = inter_rnn.permute(0, 3, 1, 2)  # [B, C, T, Q]
            inter_rnn = out + inter_rnn  # residual around the attention

        inter_rnn = inter_rnn[..., :old_T, :]  # drop the time padding
        # ===========================attention end================================

        return inter_rnn, init_state
src/models/network/model1.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import src.utils as utils
4
+ # from src.models.common.film import FiLM
5
+
6
+
7
class FilmLayer(nn.Module):
    """Feature-wise linear modulation (FiLM): a per-(channel, frequency)
    scale and shift predicted from a conditioning embedding.
    """

    def __init__(self, D, C, nF, groups=1):
        super().__init__()
        self.D = D   # conditioning (speaker) embedding dim, e.g. 256
        self.C = C   # modulated latent channel dim, e.g. 16
        self.nF = nF  # number of frequency bins
        self.weight = nn.Conv1d(self.D, self.C * nF, 1, groups=groups)
        self.bias = nn.Conv1d(self.D, self.C * nF, 1, groups=groups)

    def forward(self, x: torch.Tensor, embedding: torch.Tensor):
        """
        x: (B, C, F, T); embedding: (B, D, 1)
        Returns x scaled and shifted per (channel, frequency).
        """
        batch = x.shape[0]
        freq_bins = x.shape[2]
        scale = self.weight(embedding).reshape(batch, self.C, freq_bins, 1)  # (B, C, F, 1)
        shift = self.bias(embedding).reshape(batch, self.C, freq_bins, 1)    # (B, C, F, 1)
        return x * scale + shift
27
+
28
+
29
class LayerNormPermuted(nn.LayerNorm):
    """LayerNorm over the channel axis of a channels-first [B, C, T, F] tensor.

    Moves channels last, applies the standard LayerNorm, then moves them back.
    """

    def forward(self, x):
        channels_last = x.permute(0, 2, 3, 1)   # [B, T, F, C]
        normed = super().forward(channels_last)
        return normed.permute(0, 3, 1, 2)       # [B, C, T, F]
42
+
43
+
44
class Conv_Emb_Generator(nn.Module):
    """Model 1 (slow path): encodes the STFT mixture into a conversation
    embedding, optionally FiLM-conditioned on the wearer's speaker embedding.

    Structure: input Conv2d (+ optional LayerNorm) followed by ``n_layers``
    streaming blocks loaded dynamically via ``block_model_name``.

    Fix vs. previous revision: ``forward`` now persists the updated
    ``conv_buf`` into ``input_state`` (it was computed but dropped, so
    streaming calls kept consuming the stale zero buffer — the sibling
    ``TSH.forward`` already writes it back).
    """

    def __init__(
        self,
        block_model_name,
        block_model_params,
        spk_dim=256,
        n_srcs=1,
        n_fft=128,
        latent_dim=16,
        num_inputs=1,
        n_layers=6,
        use_first_ln=True,
        n_imics=1,
        lstm_fold_chunk=400,
        E=2,
        use_speaker_emb=True,
        one_emb=True,
        local_context_len=-1,
    ):
        super().__init__()
        self.n_srcs = n_srcs
        self.n_layers = n_layers
        self.num_inputs = num_inputs
        assert n_fft % 2 == 0
        n_freqs = n_fft // 2 + 1
        self.n_freqs = n_freqs
        self.latent_dim = latent_dim

        self.use_speaker_emb = use_speaker_emb
        self.one_emb = one_emb

        # Attention Q/K width is tied to the number of frequency bins.
        attn_approx_qk_dim = E * n_freqs

        self.n_fft = n_fft

        self.eps = 1.0e-5

        # Temporal kernel size of the input conv; (t_ksize - 1) past frames
        # are carried in a streaming buffer to keep the conv causal.
        t_ksize = 3
        self.t_ksize = t_ksize
        ks, padding = (t_ksize, t_ksize), (0, 1)

        self.n_imics = n_imics
        # Without a speaker embedding, self-speech is fed as an extra
        # real/imag channel pair instead.
        if not use_speaker_emb:
            self.n_imics = self.n_imics + 1

        module_list = [nn.Conv2d(2 * self.n_imics, latent_dim, ks, padding=padding)]
        if use_first_ln:
            module_list.append(LayerNormPermuted(latent_dim))
        self.conv = nn.Sequential(*module_list)

        # FiLM conditioning layers (one per block after the first, or a
        # single shared one when one_emb is set).
        self.embeds = nn.ModuleList([])

        self.local_context_len = local_context_len

        self.blocks = nn.ModuleList([])
        for _i in range(n_layers - 1):
            self.blocks.append(
                utils.import_attr(block_model_name)(
                    emb_dim=latent_dim,
                    n_freqs=n_freqs,
                    approx_qk_dim=attn_approx_qk_dim,
                    lstm_fold_chunk=lstm_fold_chunk,
                    last=False,
                    local_context_len=local_context_len,
                    **block_model_params,
                )
            )
        self.blocks.append(
            utils.import_attr(block_model_name)(
                emb_dim=latent_dim,
                n_freqs=n_freqs,
                approx_qk_dim=attn_approx_qk_dim,
                lstm_fold_chunk=lstm_fold_chunk,
                local_context_len=local_context_len,
                last=True,
                **block_model_params,
            )
        )

        if self.use_speaker_emb and not self.one_emb:
            # One FiLM layer applied before each block except the first.
            for _i in range(n_layers - 1):
                self.embeds.append(FilmLayer(spk_dim, latent_dim, n_freqs, 1))
        elif self.use_speaker_emb and self.one_emb:
            # A single FiLM layer applied once, before the second block.
            self.embeds.append(FilmLayer(spk_dim, latent_dim, n_freqs, 1))

    def init_buffers(self, batch_size, device):
        """Allocate streaming state: the input-conv history buffer plus one
        (initially empty) slot per block — blocks manage their own state."""
        conv_buf = torch.zeros(batch_size, 2 * self.n_imics, self.t_ksize - 1, self.n_freqs,
                               device=device)

        deconv_buf = torch.zeros(batch_size, self.latent_dim, self.t_ksize - 1, self.n_freqs,
                                 device=device)

        block_buffers = {f'buf{i}': None for i in range(len(self.blocks))}

        return dict(conv_buf=conv_buf, deconv_buf=deconv_buf,
                    block_bufs=block_buffers)

    def forward(self, current_input: torch.Tensor, embedding: torch.Tensor, input_state, quantized=False) -> torch.Tensor:
        """
        B: batch, M: mic, F: freq bin, C: real/imag, T: time frame
        D: dimension of the embedding vector
        current_input: (B, CM, T, F)
        embedding: (B, D, 1) or None (only read when use_speaker_emb)
        Returns (conversation_emb, input_state).
        """
        n_batch, _, n_frames, n_freqs = current_input.shape
        batch = current_input

        if input_state is None:
            input_state = self.init_buffers(current_input.shape[0], current_input.device)

        conv_buf = input_state['conv_buf']
        gridnet_buf = input_state['block_bufs']

        # Prepend the conv history (or zero-pad when running quantized) so the
        # temporal conv stays causal across streaming chunks.
        if quantized:
            batch = nn.functional.pad(batch, (0, 0, self.t_ksize - 1, 0))
        else:
            batch = torch.cat((conv_buf, batch), dim=2)

        conv_buf = batch[:, :, -(self.t_ksize - 1):, :]
        batch = self.conv(batch)  # [B, D, T, F]

        if self.use_speaker_emb:
            if not self.one_emb:
                # FiLM before every block except the first.
                assert len(self.blocks) == self.n_layers
                assert len(self.embeds) == self.n_layers - 1
                for ii in range(self.n_layers - 1):
                    batch = batch.transpose(2, 3)
                    if ii > 0:
                        batch = self.embeds[ii - 1](batch, embedding)
                    batch = batch.transpose(2, 3)
                    batch, gridnet_buf[f'buf{ii}'] = self.blocks[ii](batch, gridnet_buf[f'buf{ii}'])

                batch = batch.transpose(2, 3)
                batch = self.embeds[-1](batch, embedding)
                batch = batch.transpose(2, 3)
                batch, gridnet_buf[f'buf{self.n_layers-1}'] = self.blocks[self.n_layers - 1](batch, gridnet_buf[f'buf{self.n_layers-1}'])

            else:
                # Single FiLM layer, applied once before the second block.
                assert len(self.blocks) == self.n_layers
                assert len(self.embeds) == 1
                for ii in range(self.n_layers):
                    batch = batch.transpose(2, 3)
                    if ii == 1:
                        batch = self.embeds[ii - 1](batch, embedding)
                    batch = batch.transpose(2, 3)
                    batch, gridnet_buf[f'buf{ii}'] = self.blocks[ii](batch, gridnet_buf[f'buf{ii}'])

        else:
            # No FiLM conditioning at all.
            assert len(self.blocks) == self.n_layers
            for ii in range(self.n_layers):
                batch, gridnet_buf[f'buf{ii}'] = self.blocks[ii](batch, gridnet_buf[f'buf{ii}'])

        conversation_emb = batch

        # Persist the updated streaming buffers (bug fix: conv_buf was
        # previously computed but never written back, matching TSH.forward).
        input_state['conv_buf'] = conv_buf
        input_state['block_bufs'] = gridnet_buf

        return conversation_emb, input_state

    def edge_mode(self):
        # Switch every block into its on-device (edge) inference mode.
        for i in range(len(self.blocks)):
            self.blocks[i].edge_mode()
194
+
195
+ if __name__ == "__main__":
196
+ pass
src/models/network/model2_joint.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import src.utils as utils
4
+ # from src.models.common.film import FiLM
5
+
6
+
7
class FilmLayer(nn.Module):
    """FiLM conditioning: predicts a per-(channel, frequency) affine
    transform of the input from a conditioning embedding."""

    def __init__(self, D, C, nF, groups=1):
        super().__init__()
        self.D = D    # conditioning embedding dim
        self.C = C    # modulated latent channel dim
        self.nF = nF  # number of frequency bins
        self.weight = nn.Conv1d(self.D, self.C * nF, 1, groups=groups)
        self.bias = nn.Conv1d(self.D, self.C * nF, 1, groups=groups)

    def forward(self, x: torch.Tensor, embedding: torch.Tensor):
        """x: (B, C, F, T); embedding: (B, D, 1) -> modulated x."""
        n_batch, _, n_freq, _ = x.shape
        gamma = self.weight(embedding).reshape(n_batch, self.C, n_freq, 1)  # (B, C, F, 1)
        beta = self.bias(embedding).reshape(n_batch, self.C, n_freq, 1)     # (B, C, F, 1)
        return gamma * x + beta
26
+
27
+
28
class LayerNormPermuted(nn.LayerNorm):
    """LayerNorm applied over the channel axis of a channels-first
    [B, C, T, F] tensor (permute to channels-last, normalize, permute back).
    """

    def forward(self, x):
        # [B, C, T, F] -> [B, T, F, C] -> LayerNorm over C -> [B, C, T, F]
        return super().forward(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
41
+
42
+
43
class TSH(nn.Module):
    """Model 2 (fast path): streaming target-conversation extraction.

    Consumes the STFT mixture and the conversation embedding produced by
    model 1, and outputs the separated target TF representation.
    Structure: input Conv2d (+ optional LayerNorm) -> ``n_layers`` streaming
    blocks (the embedding multiplicatively gates the activations before the
    second block) -> transposed conv back to the TF domain.
    """

    def __init__(
        self,
        block_model_name,
        block_model_params,
        spk_dim=256,
        latent_dim=48,
        n_srcs=1,
        n_fft=128,
        num_inputs=1,
        n_layers=6,
        use_first_ln=True,
        n_imics=1,
        lstm_fold_chunk=400,
        stft_chunk_size=200,
        latent_dim_model1=16,
        use_speaker_emb=True,
        use_self_speech_model2=True
    ):
        super().__init__()
        self.n_srcs = n_srcs
        self.n_layers = n_layers
        self.num_inputs = num_inputs
        assert n_fft % 2 == 0
        n_freqs = n_fft // 2 + 1
        self.n_freqs = n_freqs
        self.latent_dim = latent_dim
        self.lstm_fold_chunk = lstm_fold_chunk
        self.stft_chunk_size = stft_chunk_size

        self.n_fft = n_fft

        self.eps = 1.0e-5

        # Temporal kernel size of the input conv; (t_ksize - 1) past frames
        # are carried in a streaming buffer to keep the conv causal.
        t_ksize = 3
        self.t_ksize = t_ksize
        ks, padding = (t_ksize, t_ksize), (0, 1)

        self.n_imics = n_imics

        self.use_self_speech_model2 = use_self_speech_model2

        # When no speaker embedding is used, self-speech is supplied as an
        # extra real/imag input channel pair instead.
        if not use_speaker_emb and use_self_speech_model2:
            self.n_imics = self.n_imics + 1

        module_list = [nn.Conv2d(2 * self.n_imics, latent_dim, ks, padding=padding)]

        if use_first_ln:
            module_list.append(LayerNormPermuted(latent_dim))

        self.conv = nn.Sequential(
            *module_list
        )

        # FiLM layer (unused here; kept for interface parity with model 1).
        self.embeds = nn.ModuleList([])

        # Process through a stack of blocks.
        self.blocks = nn.ModuleList([])
        for _i in range(n_layers):
            self.blocks.append(utils.import_attr(block_model_name)(emb_dim=latent_dim, n_freqs=n_freqs, **block_model_params))

        # Project back to TF-Domain.
        self.deconv = nn.ConvTranspose2d(latent_dim, n_srcs * 2, ks, padding=(self.t_ksize - 1, 1))

        self.latent_dim_model1 = latent_dim_model1

        # NOTE(review): projection_layer is created when the two latent dims
        # differ, but forward() never applies it — confirm the embedding is
        # projected elsewhere (or that the dims always match in practice).
        if latent_dim_model1 != latent_dim:
            self.projection_layer = nn.Conv2d(latent_dim_model1, latent_dim, kernel_size=1)

    def init_buffers(self, batch_size, device):
        """Allocate streaming state: conv/deconv history buffers plus each
        block's own buffer (delegated to the block)."""
        conv_buf = torch.zeros(batch_size, 2 * self.n_imics, self.t_ksize - 1, self.n_freqs,
                               device=device)

        deconv_buf = torch.zeros(batch_size, self.latent_dim, self.t_ksize - 1, self.n_freqs,
                                 device=device)

        block_buffers = {}
        for i in range(len(self.blocks)):
            block_buffers[f'buf{i}'] = self.blocks[i].init_buffers(batch_size, device)

        return dict(conv_buf=conv_buf, deconv_buf=deconv_buf,
                    block_bufs=block_buffers)

    def forward(self, current_input: torch.Tensor, embedding: torch.Tensor, input_state, quantized=False) -> torch.Tensor:
        """
        B: batch, M: mic, F: freq bin, C: real/imag, T: time frame
        D: dimension of the embedding vector
        current_input: (B, CM, T, F)
        embedding: conversation embedding from model 1
        output: (B, S, T, C*F)
        """
        n_batch, _, n_frames, n_freqs = current_input.shape
        batch = current_input

        if input_state is None:
            input_state = self.init_buffers(current_input.shape[0], current_input.device)

        conv_buf = input_state['conv_buf']
        gridnet_buf = input_state['block_bufs']

        # Prepend the conv history (or zero-pad when running quantized) so the
        # temporal conv stays causal across streaming chunks.
        if quantized:
            batch = nn.functional.pad(batch, (0, 0, self.t_ksize - 1, 0))
        else:
            batch = torch.cat((conv_buf, batch), dim=2)

        conv_buf = batch[:, :, -(self.t_ksize - 1):, :]
        batch = self.conv(batch)  # [B, D, T, F]

        # Align the conversation embedding with batch for element-wise gating.
        # NOTE(review): transpose(1, 3) implies embedding is 4-D here (the
        # model-1 latent map), not the (B, D, F) of the docstring — confirm.
        embedding = embedding.transpose(1, 3)

        for ii in range(self.n_layers):
            # Gate with the conversation embedding before the second block.
            if ii == 1:
                batch = batch * embedding
            batch, gridnet_buf[f'buf{ii}'] = self.blocks[ii](batch, gridnet_buf[f'buf{ii}'])

        # NOTE(review): deconv_buf is re-zeroed on every call and the copy in
        # input_state is never updated, so no deconv history is carried
        # across streaming chunks — confirm this is intentional.
        deconv_buf = torch.zeros(n_batch, self.latent_dim, self.t_ksize - 1, self.n_freqs,
                                 device=current_input.device)
        if quantized:
            batch = nn.functional.pad(batch, (0, 0, self.t_ksize - 1, 0))
        else:
            batch = torch.cat((deconv_buf, batch), dim=2)

        batch = self.deconv(batch)  # [B, n_srcs*C, T, F]

        # Interleave real/imag with frequency: [B, S, T, 2 * F].
        batch = batch.view([n_batch, self.n_srcs, 2, n_frames, n_freqs])  # [B, n_srcs, 2, n_frames, n_freqs]
        batch = batch.transpose(2, 3).reshape(n_batch, self.n_srcs, n_frames, 2 * n_freqs)  # [B, S, T, F]

        # Persist the updated streaming buffers.
        input_state['conv_buf'] = conv_buf
        input_state['block_bufs'] = gridnet_buf

        return batch, input_state

    def edge_mode(self):
        # Switch every block into its on-device (edge) inference mode.
        for i in range(len(self.blocks)):
            self.blocks[i].edge_mode()
184
+
185
+ if __name__ == "__main__":
186
+ pass
src/models/network/net_conversation_joint.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .model1 import Conv_Emb_Generator
5
+ from .model2_joint import TSH
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ import copy
9
+
10
+
11
def mod_pad(x, chunk_size, pad):
    """Zero-pad the last axis of `x` up to a multiple of `chunk_size`, then
    apply the extra (front, back) padding in `pad`.

    Returns (padded_tensor, n_alignment_zeros).
    """
    mod = (-x.shape[-1]) % chunk_size  # zeros needed to reach a chunk boundary
    x = F.pad(x, (0, mod))
    x = F.pad(x, pad)
    return x, mod
20
+
21
+
22
+ # A TF-domain network guided by an embedding vector
23
+ class Net_Conversation(nn.Module):
24
+ def __init__(self,
25
+ model1_block_name,
26
+ model1_block_params,
27
+ model2_block_name,
28
+ model2_block_params,
29
+ stft_chunk_size=64,
30
+ stft_pad_size=32,
31
+ stft_back_pad=32,
32
+ num_input_channels=1,
33
+ num_output_channels=1,
34
+ num_sources=1,
35
+ speaker_embed = 256,
36
+ num_layers_model1=3,
37
+ num_layers_model2=3,
38
+ latent_dim_model1=16,
39
+ latent_dim_model2=32,
40
+ use_sp_feats=False,
41
+ use_first_ln=True,
42
+ n_imics=1,
43
+ window="hann",
44
+ lstm_fold_chunk=400,
45
+ E=2,
46
+ use_speaker_emb_model1=True,
47
+ one_emb_model1=True,
48
+ use_self_speech_model2=True,
49
+ local_context_len=-1
50
+ ):
51
+ super(Net_Conversation, self).__init__()
52
+
53
+ assert num_sources == 1
54
+
55
+ # num input/output channels
56
+ self.nI = num_input_channels
57
+ self.nO = num_output_channels
58
+
59
+ # num channels to the TF-network
60
+ num_separator_inputs = self.nI * 2 + use_sp_feats * (3 * (self.nI - 1))
61
+
62
+ self.stft_chunk_size = stft_chunk_size
63
+ self.stft_pad_size = stft_pad_size
64
+ self.stft_back_pad = stft_back_pad
65
+ self.n_srcs = num_sources
66
+ self.use_sp_feats = use_sp_feats
67
+
68
+ # Input conv to convert input audio to a latent representation
69
+ self.nfft = stft_back_pad + stft_chunk_size + stft_pad_size
70
+
71
+ self.nfreqs = self.nfft//2 + 1
72
+
73
+ self.lstm_fold_chunk=lstm_fold_chunk
74
+
75
+ # Construct synthesis/analysis windows (rect)
76
+ if window=="hann":
77
+ window_fn = lambda x: np.hanning(x)
78
+ elif window=="rect":
79
+ window_fn = lambda x: np.ones(x)
80
+ else:
81
+ raise ValueError("Invalid window type!")
82
+
83
+ if ((stft_pad_size) % stft_chunk_size) == 0:
84
+ print("Using perfect STFT windows")
85
+ self.analysis_window = torch.from_numpy(window_fn(self.nfft)).float()
86
+
87
+ # eg. inverse SFTF
88
+ self.synthesis_window = torch.zeros(stft_pad_size + stft_chunk_size).float()
89
+
90
+ A = self.synthesis_window.shape[0]
91
+ B = self.stft_chunk_size
92
+ N = self.analysis_window.shape[0]
93
+
94
+ assert (A % B) == 0
95
+ for i in range(A):
96
+ num = self.analysis_window[N - A + i]
97
+
98
+ denom = 0
99
+ for k in range(A//B):
100
+ denom += (self.analysis_window[N - A + (i % B) + k * B] ** 2)
101
+
102
+ self.synthesis_window[i] = num / denom
103
+ else:
104
+ print("Using imperfect STFT windows")
105
+ self.analysis_window = torch.from_numpy( window_fn(self.nfft) ).float()
106
+ self.synthesis_window = torch.from_numpy( window_fn(stft_chunk_size + stft_pad_size) ).float()
107
+
108
+ self.istft_lookback = 1 + (self.synthesis_window.shape[0] - 1) // self.stft_chunk_size
109
+
110
+ if local_context_len!=-1:
111
+ local_context_len=local_context_len//stft_chunk_size//lstm_fold_chunk
112
+
113
+ self.model1 = Conv_Emb_Generator(
114
+ model1_block_name,
115
+ model1_block_params,
116
+ spk_dim = speaker_embed,
117
+ latent_dim = latent_dim_model1,
118
+ n_srcs = num_output_channels * num_sources,
119
+ n_fft = self.nfft,
120
+ num_inputs = num_separator_inputs,
121
+ n_layers = num_layers_model1,
122
+ use_first_ln=use_first_ln,
123
+ n_imics=n_imics,
124
+ lstm_fold_chunk=lstm_fold_chunk,
125
+ E=E,
126
+ use_speaker_emb=use_speaker_emb_model1,
127
+ one_emb=one_emb_model1,
128
+ local_context_len=local_context_len
129
+ )
130
+
131
+ self.quantized = False
132
+
133
+ self.use_self_speech_model2=use_self_speech_model2
134
+
135
+ self.model2=TSH(
136
+ model2_block_name,
137
+ model2_block_params,
138
+ spk_dim = speaker_embed,
139
+ latent_dim = latent_dim_model2,
140
+ latent_dim_model1=latent_dim_model1,
141
+ n_srcs = num_output_channels * num_sources,
142
+ n_fft = self.nfft,
143
+ num_inputs = num_separator_inputs,
144
+ n_layers = num_layers_model2,
145
+ use_first_ln=use_first_ln,
146
+ n_imics=n_imics,
147
+ lstm_fold_chunk=lstm_fold_chunk,
148
+ stft_chunk_size=stft_chunk_size,
149
+ use_speaker_emb=use_speaker_emb_model1,
150
+ use_self_speech_model2=use_self_speech_model2
151
+ )
152
+
153
+ self.use_speaker_emb_model1=use_speaker_emb_model1
154
+
155
+ def init_buffers(self, batch_size, device):
156
+ buffers = {}
157
+
158
+ buffers['model1_bufs'] = self.model1.init_buffers(batch_size, device)
159
+
160
+ buffers['model2_bufs'] = self.model2.init_buffers(batch_size, device)
161
+
162
+ buffers['istft_buf'] = torch.zeros(batch_size * self.n_srcs * self.nO,
163
+ self.synthesis_window.shape[0],
164
+ self.istft_lookback, device=device)
165
+
166
+ return buffers
167
+
168
+ # compute STFT
169
+ def extract_features(self, x):
170
+ """
171
+ x: (B, M, T)
172
+ returns: (B, C*M, T, F)
173
+ """
174
+ B, M, T = x.shape
175
+
176
+ x = x.reshape(B*M, T)
177
+ x = torch.stft(x, n_fft = self.nfft, hop_length = self.stft_chunk_size,
178
+ win_length = self.nfft, window=self.analysis_window.to(x.device),
179
+ center=False, normalized=False, return_complex=True)
180
+
181
+ x = torch.view_as_real(x) # [B*M, F, T, 2]
182
+ BM, _F, T, C = x.shape
183
+
184
+ x = x.reshape(B, M, _F, T, C) # [B, M, F, T, 2]
185
+
186
+ x = x.permute(0, 4, 1, 3, 2) # [B, 2, M. T, F]
187
+
188
+ x = x.reshape(B, C*M, T, _F)
189
+
190
+ return x
191
+
192
def synthesis(self, x, input_state):
    """
    Streaming inverse STFT with overlap-add across calls.

    x: (B, S, T, C*F) TF-domain output; the last axis interleaves C=2
       real/imag parts with F frequency bins. S = output streams.
    input_state: dict carrying 'istft_buf', the overlap-add tail kept
       between streaming calls.
    returns: ((B, S, t) time-domain signal, updated input_state)
    """
    istft_buf = input_state['istft_buf']

    x = x.transpose(2, 3) # [B, S, CF, T]

    B, S, CF, T = x.shape
    X = x.reshape(B*S, CF, T)
    # Separate the interleaved real/imag halves and rebuild complex spectra
    X = X.reshape(B*S, 2, -1, T).permute(0, 2, 3, 1) # [BS, F, T, C]
    X = X[..., 0] + 1j * X[..., 1]

    x = torch.fft.irfft(X, dim=1) # [BS, iW, T]
    # Keep only the tail of each inverse frame spanned by the synthesis window
    x = x[:, -self.synthesis_window.shape[0]:] # [BS, oW, T]

    # Apply synthesis window
    x = x * self.synthesis_window.unsqueeze(0).unsqueeze(-1).to(x.device)

    oW = self.synthesis_window.shape[0]

    # Concatenate blocks from previous IFFTs
    x = torch.cat([istft_buf, x], dim=-1)
    # NOTE(review): shape[1] is the window length oW, while the buffer's
    # frame depth lives in shape[-1] (istft_lookback) — this only keeps the
    # intended number of frames if the two are equal; confirm.
    istft_buf = x[..., -istft_buf.shape[1]:] # Update buffer

    # Overlap-add: fold sums overlapping oW-long frames at chunk stride
    x = F.fold(x, output_size=(self.stft_chunk_size * x.shape[-1] + (oW - self.stft_chunk_size), 1),
               kernel_size=(oW, 1), stride=(self.stft_chunk_size, 1)) # [BS, 1, t]

    # Drop warm-up samples at the front and the look-ahead pad at the end
    x = x[:, :, -T * self.stft_chunk_size - self.stft_pad_size: - self.stft_pad_size]
    x = x.reshape(B, S, -1) # [B, S, t]

    input_state['istft_buf'] = istft_buf

    return x, input_state
228
+
229
+
230
def predict_model1(self, x, input_state, speaker_embedding, pad=True):
    """Run the slow model over a time-domain mixture.

    x: (B, M, t) time-domain input (B batch, M channels).
    input_state: streaming buffer dict; 'model1_bufs' is updated in place.
    speaker_embedding: optional conditioning embedding (a singleton axis is
        inserted at dim 2 before it is passed on), or None.
    pad: when True, pad x to a whole number of STFT chunks first.

    Returns (conversation embedding, updated input_state).
    """
    if pad:
        x, _ = mod_pad(x, chunk_size=self.stft_chunk_size,
                       pad=(self.stft_back_pad, self.stft_pad_size))

    # Time-domain to TF-domain
    feats = self.extract_features(x)  # [B, RM, T, F]

    if speaker_embedding is not None:
        speaker_embedding = speaker_embedding.unsqueeze(2)

    conversation_emb, input_state['model1_bufs'] = self.model1(
        feats, speaker_embedding, input_state['model1_bufs'], self.quantized)

    return conversation_emb, input_state
253
+
254
def predict_model2(self, x, conversation_emb, input_state, pad=True):
    """Run the fast model and synthesize its output back to audio.

    x: (B, M, t) time-domain input.
    conversation_emb: conditioning embedding produced by the slow model.
    input_state: streaming buffer dict; 'model2_bufs' is updated in place.
    pad: when True, pad x to a whole number of STFT chunks first.

    Returns ((B, S, t) time-domain output, updated state).
    """
    trim = 0
    if pad:
        x, trim = mod_pad(x, chunk_size=self.stft_chunk_size,
                          pad=(self.stft_back_pad, self.stft_pad_size))

    feats = self.extract_features(x)

    out_tf, input_state['model2_bufs'] = self.model2(
        feats, conversation_emb, input_state['model2_bufs'], self.quantized)

    # TF-domain back to time-domain
    out, next_state = self.synthesis(out_tf, input_state)  # [B, S * M, t]

    # Remove any samples introduced by the chunk padding above
    if trim != 0:
        out = out[:, :, :-trim]

    return out, next_state
278
+
279
+
280
+ def forward(self, inputs, input_state = None, pad=True):
281
+ x = inputs['mixture']
282
+
283
+ start_idx_input=inputs['start_idx']
284
+ end_idx_input=inputs['end_idx']
285
+
286
+ assert ((end_idx_input - start_idx_input) % self.stft_chunk_size) == 0
287
+
288
+ # Snap start and end to chunk
289
+ start_idx_input = (start_idx_input // self.stft_chunk_size) * self.stft_chunk_size
290
+ end_idx_input = (end_idx_input // self.stft_chunk_size) * self.stft_chunk_size
291
+
292
+ B, M, t=x.shape
293
+
294
+ audio_range=torch.tensor([start_idx_input, end_idx_input]).to(x.device)
295
+ audio_range = audio_range.unsqueeze(0).repeat(B, 1)
296
+
297
+ spk_embed = inputs['embed']
298
+ self_speech=None
299
+
300
+ if not self.use_speaker_emb_model1:
301
+ self_speech=inputs['self_speech']
302
+
303
+ combined_audio = torch.cat((x, self_speech), dim=1)
304
+ x=combined_audio
305
+
306
+ if input_state is None:
307
+ input_state = self.init_buffers(x.shape[0], x.device)
308
+
309
+ B, M, t = x.shape
310
+
311
+ # enter slow model
312
+ conversation_emb, input_state = self.predict_model1(x, input_state, spk_embed, pad=pad) # [B, F, T, C]
313
+
314
+ # slice conv embedding and corresponding audio
315
+ B, _F, T, C = conversation_emb.shape
316
+ conversation_emb = conversation_emb.permute(0, 1, 3, 2) # [B, F, C, T]
317
+ conversation_emb = torch.roll(conversation_emb, 1, dims=-1)
318
+ conversation_emb[..., 0] = 0
319
+ conversation_emb = conversation_emb.flatten(0,3).unsqueeze(1) # [*, 1]
320
+ multiplier = torch.tile(conversation_emb, (1, self.lstm_fold_chunk)) # [*, L]
321
+ multiplier = multiplier.reshape(B, _F, C, T, self.lstm_fold_chunk).flatten(3,4) # [B, F, C, T*L]
322
+ multiplier = multiplier.permute(0, 1, 3, 2) # [B, F, T*L, C]
323
+
324
+ slicing_length=end_idx_input-start_idx_input+self.stft_back_pad+self.stft_pad_size
325
+
326
+ padded_start=start_idx_input-self.stft_back_pad
327
+ padded_end=end_idx_input+self.stft_pad_size
328
+
329
+ pad_left=max(-padded_start, 0)
330
+ pad_right=max(padded_end-t, 0)
331
+
332
+ actual_start=max(padded_start, 0)
333
+ actual_end=min(padded_end, t)
334
+
335
+ if self.use_self_speech_model2:
336
+ sliced_x=x[:, :, actual_start:actual_end]
337
+ else:
338
+ x_no_self_speech=inputs["mixture"]
339
+ sliced_x=x_no_self_speech[:, :, actual_start:actual_end]
340
+
341
+ padding = (pad_left, pad_right, 0, 0, 0, 0)
342
+
343
+ sliced_x=F.pad(sliced_x, padding, "constant", 0)
344
+
345
+ converted_start_idx=start_idx_input//self.stft_chunk_size
346
+ converted_end_idx=end_idx_input//self.stft_chunk_size
347
+
348
+ sliced_emb=multiplier[:, :, converted_start_idx:converted_end_idx, :]
349
+
350
+ assert sliced_x.shape[2]==slicing_length
351
+ assert sliced_emb.shape[2]==(slicing_length-self.stft_back_pad-self.stft_pad_size)//self.stft_chunk_size
352
+
353
+ model2_output, input_state = self.predict_model2(sliced_x, sliced_emb, input_state, pad=False)
354
+ model2_output = model2_output.reshape(B, self.n_srcs, self.nO, model2_output.shape[-1])
355
+
356
+ return {'output': model2_output[:, 0], 'next_state': input_state, 'audio_range': audio_range}
357
+
358
+
359
# Import-only module: no standalone entry point is provided.
if __name__ == "__main__":
    pass
src/train_joint.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The main training script for training on synthetic data
3
+ """
4
+
5
+ import torch
6
+ import torch.utils.data
7
+ import torch.nn as nn
8
+
9
+ import argparse
10
+ import json
11
+ import os
12
+ import multiprocessing
13
+ import time
14
+
15
+ import numpy as np
16
+ import src.utils as utils
17
+ from src.training.tain_val import train_epoch, test_epoch
18
+ import shutil
19
+ import sys
20
+
21
+ import wandb
22
+
23
+ VAL_SEED = 0
24
+ CURRENT_EPOCH = 0
25
+
26
def seed_from_epoch(seed):
    """Seed all RNGs with `seed` offset by the current epoch counter.

    Reads the module-level CURRENT_EPOCH so each epoch (and its dataloader
    workers) gets a distinct but reproducible seed.
    """
    utils.seed_all(seed + CURRENT_EPOCH)
+
31
def print_metrics(metrics: list):
    """Print mean input/output SI-SDR and the mean improvement (SI-SDRi).

    metrics: list of dicts, each containing 'input_si_sdr' and 'si_sdr'.
    """
    in_sdr = np.array([m['input_si_sdr'] for m in metrics])
    out_sdr = np.array([m['si_sdr'] for m in metrics])

    print("Average Input SI-SDR: {:03f}, Average Output SI-SDR: {:03f}, Average SI-SDRi: {:03f}".format(
        np.mean(in_sdr), np.mean(out_sdr), np.mean(out_sdr - in_sdr)))
+
37
+
38
def train(args: argparse.Namespace):
    """
    Train the network described by the experiment config in args.config.

    Builds datasets and dataloaders, restores the last (or, with --best,
    the best) checkpoint found in args.run_dir, then runs the epoch loop
    with wandb logging. Checkpoints land in <run_dir>/checkpoints.
    """
    # Fix random seeds
    utils.seed_all(args.seed)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

    # Turn on deterministic algorithms if specified (Note: slower training).
    if torch.cuda.is_available():
        if args.use_nondeterministic_cudnn:
            torch.backends.cudnn.deterministic = False
        else:
            torch.backends.cudnn.deterministic = True

    # Load experiment description
    with open(args.config, 'rb') as f:
        params = json.load(f)

    # Initialize datasets
    data_train = utils.import_attr(params['train_dataset'])(**params['train_data_args'], split='train')
    data_val = utils.import_attr(params['val_dataset'])(**params['val_data_args'], split='val')

    # Set up the device and workers.
    # BUGFIX: use_cuda was hard-coded to True, which crashed on CPU-only
    # hosts when constructing the 'cuda' device.
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    print("Using device {}".format('cuda' if use_cuda else 'cpu'))

    # Set multiprocessing params
    num_workers = min(multiprocessing.cpu_count(), params['num_workers'])
    kwargs = {
        'num_workers': num_workers,
        # Workers are re-seeded from the current epoch so shuffling and
        # augmentation differ per epoch but remain reproducible
        'worker_init_fn': lambda x: seed_from_epoch(args.seed),
        'pin_memory': False
    } if use_cuda else {}

    # Set up data loaders
    train_loader = torch.utils.data.DataLoader(data_train,
                                               batch_size=params['batch_size'],
                                               shuffle=True,
                                               **kwargs)

    # Validation always uses the fixed VAL_SEED for comparable batches
    kwargs['worker_init_fn'] = lambda x: utils.seed_all(VAL_SEED)
    test_loader = torch.utils.data.DataLoader(data_val,
                                              batch_size=params['eval_batch_size'],
                                              **kwargs)

    # Initialize HL module
    hl_module = utils.import_attr(params['pl_module'])(**params['pl_module_args'])
    hl_module.model.to(device)

    # Get run name from run dir
    run_name = os.path.basename(args.run_dir.rstrip('/'))
    checkpoints_dir = os.path.join(args.run_dir, 'checkpoints')

    # Set up checkpoints
    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)

    # Copy json so the run directory is self-describing
    shutil.copyfile(args.config, os.path.join(args.run_dir, 'config.json'))

    # Check if a model state path exists for this model, if it does, load it
    best_path = os.path.join(checkpoints_dir, 'best.pt')
    state_path = os.path.join(checkpoints_dir, 'last.pt')
    if args.best and os.path.exists(best_path):
        print("load best state path .....")
        hl_module.load_state(best_path)
    elif os.path.exists(state_path):
        print("load state path .....")
        hl_module.load_state(state_path)

    start_epoch = hl_module.epoch

    # Project name comes from the config when present
    if "project_name" in params.keys():
        project_name = params["project_name"]
    else:
        project_name = "AcousticBubble"

    # Initialize wandb
    wandb_run = wandb.init(
        project=project_name,
        name=run_name,
        notes='Example of a note',
        tags=['speech', 'audio', 'embedded-systems']
    )

    # Training loop
    try:
        # Go over remaining epochs
        for epoch in range(start_epoch, params['epochs']):
            global CURRENT_EPOCH, VAL_SEED
            CURRENT_EPOCH = epoch
            seed_from_epoch(args.seed)

            hl_module.on_epoch_start()

            current_lr = hl_module.get_current_lr()
            print("CURRENT learning rate: {:0.08f}".format(current_lr))

            print("[TRAINING]")

            t1 = time.time()
            train_loss = train_epoch(hl_module, train_loader, device)
            t2 = time.time()
            print(f"Train epoch time: {t2 - t1:02f}s")

            print("\nTrain set: Average Loss: {:.4f}\n".format(train_loss))

            print()
            if np.isnan(train_loss):
                raise ValueError("Got NAN in training")
            utils.seed_all(VAL_SEED)

            # Run testing step
            print("[TESTING]")

            test_loss = test_epoch(hl_module, test_loader, device)

            print("\nTest set: Average Loss: {:.4f}\n".format(test_loss))

            # Let the module save its best checkpoint, then always dump last
            hl_module.on_epoch_end(best_path, wandb_run)
            hl_module.dump_state(state_path)

            print()
            print("=" * 25, "FINISHED EPOCH", epoch, "=" * 25)
            print()

    except KeyboardInterrupt:
        print("Interrupted")
    except Exception:
        # Top-level boundary: report the failure instead of crashing silently
        import traceback
        traceback.print_exc()
+
176
# CLI entry point: build the argument parser and launch training.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Experiment Params
    parser.add_argument('--config', type=str,
                        help='Path to experiment config')

    parser.add_argument('--run_dir', type=str,
                        help='Path to experiment directory')

    parser.add_argument('--best', action='store_true',
                        help="load from best checkpoint instead of last checkpoint")

    # Randomization Params
    parser.add_argument('--seed', type=int, default=10,
                        help='Random seed for reproducibility')
    parser.add_argument('--use_nondeterministic_cudnn',
                        action='store_true',
                        help="If using cuda, chooses whether or not to use \
                        non-deterministic cudDNN algorithms. Training will be\
                        faster, but the final results may differ slighty.")

    # wandb params
    # NOTE(review): --project_name is parsed but never read by train(),
    # which takes the project name from the config JSON — confirm intent.
    parser.add_argument('--project_name',
                        type=str,
                        default='AcousticBubble',
                        help='Project name that shows up on wandb')
    train(parser.parse_args())
src/training/tain_val.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The main training script for training on synthetic data
3
+ """
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.optim as optim
8
+ import os
9
+ import tqdm
10
+
11
+
12
def to_device(batch, device):
    """Recursively move every tensor in a (possibly nested) batch to `device`.

    Dicts are updated in place; lists and tuples are both rebuilt as lists
    (matching the original behavior); non-tensor leaves pass through.

    FIX: use isinstance instead of `type(x) == T`, so Tensor subclasses
    (e.g. nn.Parameter) and dict subclasses (e.g. OrderedDict) are handled
    instead of being silently left on their original device.
    """
    if isinstance(batch, torch.Tensor):
        return batch.to(device)
    if isinstance(batch, dict):
        for key in batch:
            batch[key] = to_device(batch[key], device)
        return batch
    if isinstance(batch, (list, tuple)):
        return [to_device(item, device) for item in batch]
    return batch
24
+
25
def test_epoch(hl_module, test_loader, device) -> float:
    """
    Evaluate the network for one pass over test_loader.

    Returns the sample-weighted average validation loss.
    """
    hl_module.eval()

    total_loss = 0.0
    total_count = 0

    progress = tqdm.tqdm(total=len(test_loader))

    with torch.no_grad():
        for step, batch in enumerate(test_loader):
            batch = to_device(batch, device)

            loss, batch_size = hl_module.validation_step(batch, step)

            # Weight each batch loss by its number of samples
            total_loss += loss.item() * batch_size
            total_count += batch_size

            progress.set_postfix(loss='%.05f' % (loss.item()))
            progress.update()

    return total_loss / total_count
50
+
51
def train_epoch(hl_module, train_loader, device) -> float:
    """
    Train the network for a single epoch.

    Returns the sample-weighted average training loss.
    """
    # Set the model to training.
    hl_module.train()

    running_loss = 0.0
    seen = 0

    progress = tqdm.tqdm(total=len(train_loader))

    for step, batch in enumerate(train_loader):
        batch = to_device(batch, device)

        # Zero gradients, then run the forward pass
        hl_module.reset_grad()
        loss, batch_size = hl_module.training_step(batch, step)

        # Backpropagate and let the module apply its optimizer step
        loss.backward()
        hl_module.backprop()

        # Accumulate the detached loss, weighted by batch size
        loss = loss.detach()
        running_loss += loss.item() * batch_size
        seen += batch_size

        progress.set_postfix(loss='%.05f' % (loss.item()))
        progress.update()

    return running_loss / seen
src/utils.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import importlib
4
+ import json
5
+
6
+ import librosa
7
+ import soundfile as sf
8
+ import torch
9
+ import torchaudio
10
+ import math
11
+ import torch.nn as nn
12
+
13
+
14
class PositionalEncoding(nn.Module):
    """Absolute sinusoidal positional encoding.

    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    Arguments
    ---------
    input_size: int
        Embedding dimension (must be even).
    max_len : int, optional
        Max length of the input sequences (default 2500).

    Example
    -------
    >>> a = torch.rand((8, 120, 512))
    >>> enc = PositionalEncoding(input_size=a.shape[-1])
    >>> enc(a).shape
    torch.Size([1, 120, 512])
    """

    def __init__(self, input_size, max_len=2500):
        super().__init__()
        if input_size % 2 != 0:
            raise ValueError(f"Cannot use sin/cos positional encoding with odd channels (got channels={input_size})")
        self.max_len = max_len

        # Precompute the (max_len, input_size) table once; it is a buffer,
        # so it moves with the module but is never trained.
        table = torch.zeros(self.max_len, input_size, requires_grad=False)
        positions = torch.arange(0, self.max_len).unsqueeze(1).float()
        inv_freq = torch.exp(
            torch.arange(0, input_size, 2).float() * -(math.log(10000.0) / input_size)
        )
        table[:, 0::2] = torch.sin(positions * inv_freq)
        table[:, 1::2] = torch.cos(positions * inv_freq)
        self.register_buffer("pe", table.unsqueeze(0))

    def forward(self, x):
        """
        Arguments
        ---------
        x : tensor
            Input feature of shape (batch, time, fea).

        Returns the (1, time, fea) encoding for the first x.size(1) steps.
        """
        return self.pe[:, : x.size(1)].clone().detach()
55
+
56
+
57
def count_parameters(model):
    """
    Count the number of parameters in a PyTorch model.

    Also prints the count in millions.

    Parameters:
        model (torch.nn.Module): The PyTorch model.

    Returns:
        int: Number of parameters in the model.
    """
    n_param = sum(p.numel() for p in model.parameters())
    print(f"Model params number {n_param/1e6} M")
    # FIX: the docstring always promised an int, but the function returned
    # None; returning the count is backward-compatible for print-only callers.
    return n_param
69
+
70
+
71
def import_attr(import_path):
    """Resolve a dotted path like "pkg.mod.attr" to the attribute object."""
    module_path, _, attr_name = import_path.rpartition(".")
    return getattr(importlib.import_module(module_path), attr_name)
74
+
75
+
76
class Params:
    """Hyperparameter container loaded from a JSON file.

    Example:
    ```
    params = Params(json_path)
    print(params.learning_rate)
    params.learning_rate = 0.5 # change the value of learning_rate in params
    ```
    """

    def __init__(self, json_path):
        # Initial load is just an update onto an empty namespace
        self.update(json_path)

    def save(self, json_path):
        """Write the current parameters back out as pretty-printed JSON."""
        with open(json_path, "w") as f:
            json.dump(self.__dict__, f, indent=4)

    def update(self, json_path):
        """Loads parameters from json file"""
        with open(json_path) as f:
            self.__dict__.update(json.load(f))

    @property
    def dict(self):
        """Gives dict-like access to Params instance by `params.dict['learning_rate']"""
        return self.__dict__
105
+
106
+
107
def load_net_torch(expriment_config, return_params=False):
    """Build a pl_module from a config JSON with checkpoint/DP loading disabled.

    Returns the module and, when return_params is True, the *raw* config
    dict — note it does NOT reflect the three overrides applied below.
    """
    params = Params(expriment_config)
    # Construct the module without touching other runs: no slow-model
    # checkpoint, no DataParallel, no previous checkpoint.
    params.pl_module_args["slow_model_ckpt"] = None
    params.pl_module_args["use_dp"] = False
    params.pl_module_args["prev_ckpt"] = None
    pl_module = import_attr(params.pl_module)(**params.pl_module_args)

    # Re-read the file so the returned params are the unmodified JSON
    with open(expriment_config) as f:
        params = json.load(f)

    if return_params:
        return pl_module, params
    else:
        return pl_module
121
+
122
+
123
def load_net(expriment_config, return_params=False):
    """Build a pl_module from a config JSON with DataParallel disabled.

    Returns the module and, when return_params is True, the *raw* config
    dict — note it does NOT reflect the use_dp override applied below.
    """
    params = Params(expriment_config)
    params.pl_module_args["use_dp"] = False
    pl_module = import_attr(params.pl_module)(**params.pl_module_args)

    # Re-read the file so the returned params are the unmodified JSON
    with open(expriment_config) as f:
        params = json.load(f)

    if return_params:
        return pl_module, params
    else:
        return pl_module
135
+
136
+
137
def load_pretrained(run_dir, return_params=False, map_location="cpu", use_last=False):
    """Build a module from a run directory and restore its checkpoint.

    @param run_dir: directory containing config.json and checkpoints/
    @param return_params: also return the raw config dict
    @param map_location: device mapping forwarded to load_state
    @param use_last: restore checkpoints/last.pt instead of best.pt
    Raises FileNotFoundError when the requested checkpoint is missing.
    """
    config_path = os.path.join(run_dir, "config.json")

    pl_module, params = load_net(config_path, return_params=True)

    # Pick which checkpoint file to restore
    if use_last:
        name = "last.pt"
    else:
        name = "best.pt"
    ckpt_path = os.path.join(run_dir, f"checkpoints/{name}")

    if not os.path.exists(ckpt_path):
        raise FileNotFoundError(f"Given run ({run_dir}) doesn't have any pretrained checkpoints!")

    print("Loading checkpoint from", ckpt_path)

    # Load checkpoint
    pl_module.load_state(ckpt_path, map_location)
    print("Loaded module at epoch", pl_module.epoch)

    if return_params:
        return pl_module, params
    else:
        return pl_module
163
+
164
+
165
def load_pretrained_with_last(run_dir, return_params=False, map_location="cpu", use_last=False):
    """Build a module from a run directory and restore its checkpoint.

    This was a byte-for-byte duplicate of load_pretrained; it now simply
    delegates to it. Kept so existing callers keep working.

    @param run_dir: directory containing config.json and checkpoints/
    @param return_params: also return the raw config dict
    @param map_location: device mapping forwarded to load_state
    @param use_last: restore checkpoints/last.pt instead of best.pt
    Raises FileNotFoundError when the requested checkpoint is missing.
    """
    return load_pretrained(run_dir, return_params=return_params,
                           map_location=map_location, use_last=use_last)
191
+
192
+
193
def load_pretrained2(run_dir, return_params=False, map_location="cpu"):
    """Build a module from a run directory and restore checkpoints/best.pt.

    NOTE(review): unlike load_pretrained, map_location is accepted but NOT
    forwarded to load_state, and no existence check precedes the load —
    confirm whether that is intentional.
    """
    config_path = os.path.join(run_dir, "config.json")
    pl_module, params = load_net(config_path, return_params=True)

    ckpt_path = os.path.join(run_dir, "checkpoints", "best.pt")
    print("Loading checkpoint from", ckpt_path)

    # Load checkpoint
    pl_module.load_state(ckpt_path)

    if return_params:
        return pl_module, params
    else:
        return pl_module
208
+
209
+
210
def load_torch_pretrained(run_dir, return_params=False, map_location="cpu", model_epoch="best"):
    """Build a module via load_net_torch and restore a named checkpoint.

    @param run_dir: directory containing config.json and checkpoints/
    @param return_params: also return the raw config dict
    @param map_location: device mapping forwarded to load_state
    @param model_epoch: checkpoint stem to load, e.g. "best" or "last"
    Raises FileNotFoundError when the requested checkpoint is missing.
    """
    config_path = os.path.join(run_dir, "config.json")

    print(config_path)
    pl_module, params = load_net_torch(config_path, return_params=True)

    # Resolve the requested checkpoint file
    ckpt_path = os.path.join(run_dir, f"checkpoints/{model_epoch}.pt")

    if not os.path.exists(ckpt_path):
        raise FileNotFoundError(f"Given run ({run_dir}) doesn't have any pretrained checkpoints!")

    print("Loading checkpoint from", ckpt_path)

    # Load checkpoint
    pl_module.load_state(ckpt_path, map_location)
    print("Loaded module at epoch", pl_module.epoch)

    if return_params:
        return pl_module, params
    else:
        return pl_module
233
+
234
+
235
def read_audio_file(file_path, sr):
    """
    Load an audio file at sampling rate `sr` without mono downmixing.

    Returns only the sample array; the rate librosa reports is discarded.
    """
    samples, _ = librosa.core.load(file_path, mono=False, sr=sr)
    return samples
240
+
241
+
242
def read_audio_file_torch(file_path, downsample=1, input_mean=False):
    """Load an audio file as a torch waveform with optional resampling and
    channel selection.

    @param file_path: audio file to read
    @param downsample: integer factor; when > 1, resamples the audio to
        sample_rate // downsample
    @param input_mean: channel policy for multi-channel input:
        True  -> average all channels to mono, kept as shape (1, T)
        "L"   -> keep only the first channel
        "R"   -> keep only the second channel
        False -> keep all channels (default)
    Returns a (channels, T) waveform tensor.
    """
    waveform, sample_rate = torchaudio.load(file_path)
    if downsample > 1:
        waveform = torchaudio.functional.resample(waveform, sample_rate, sample_rate // downsample)

    # `input_mean == True` (not `is True`) is deliberate here only insofar as
    # it distinguishes the bool from the "L"/"R" string options below
    if waveform.shape[0] > 1 and input_mean == True:
        waveform = torch.mean(waveform, dim=0)
        waveform = waveform.unsqueeze(0)

    elif waveform.shape[0] > 1 and input_mean == "L":
        waveform = waveform[0:1, ...]

    elif waveform.shape[0] > 1 and input_mean == "R":
        waveform = waveform[1:2, ...]

    return waveform
258
+
259
+
260
def write_audio_file(file_path, data, sr, subtype="PCM_16"):
    """
    Writes audio file to system memory.
    @param file_path: Path of the file to write to
    @param data: Audio signal to write (n_channels x n_samples); transposed
        to (n_samples x n_channels), the layout soundfile expects
    @param sr: Sampling rate
    @param subtype: soundfile subtype string (default: 16-bit PCM)
    """
    sf.write(file_path, data.T, sr, subtype)
268
+
269
+
270
def read_json(path):
    """Load and return the JSON content stored at `path`."""
    with open(path, "rb") as fp:
        return json.load(fp)
273
+
274
+
275
+ import random
276
+ import numpy as np
277
+
278
+
279
def seed_all(seed):
    """Seed Python's, NumPy's, and torch's RNGs (plus CUDA when available)
    for reproducible runs."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)