swc2 commited on
Commit
7eddfc5
·
1 Parent(s): 6d6a788

update change 2

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ ckpt/
app.py CHANGED
@@ -1,55 +1,85 @@
1
  import gradio as gr
2
- #from inference import InferencePipeline
 
 
3
 
4
- #i = InferencePipeline()
5
- # device = "cuda" if torch.cuda.is_available() else "cpu"
 
6
 
7
- # def convert_audio_to_wav(file_path):
8
- # """Convert any supported format (mp3, etc.) to wav using librosa"""
9
- # output_path = "temp_input.wav"
10
- # audio, sr = librosa.load(file_path, sr=None) # 加载音频文件
11
- # librosa.output.write_wav(output_path, audio, sr) # 转换并保存为 WAV 格式
12
- # return output_path
13
 
14
- def gradio_TSE(audio_file_path):
 
 
 
 
 
 
 
 
 
 
15
  """
16
- Wrapper function to handle Gradio's audio input and pass the file path to the voice conversion function.
17
- Gradio passes audio data as a tuple: (temp file path, sample rate).
18
  """
19
- # Gradio passes audio as (temp file path, sample rate)
20
- #audio_file_path = audio_data[0] # Extract the file path
21
- print(f"Here is the audio_file_path: {audio_file_path}")
22
- #print(f"Here is the audio_file_path[0]: {audio_file_path[0]}")
23
- random_wav = f"/path/to/generated_audio_{int(time.time())}.wav"
24
- #return i.voice_conversion(audio_file_path)
25
- return random_wav
26
-
27
- # Define your Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  with gr.Blocks() as demo:
29
  gr.Markdown("## Target Speaker Extraction Demo")
30
  gr.Markdown(
31
- "This demo isolates the speech signal of a target speaker from a mixture of multiple speakers, "
32
- "with or without noises and reverberations."
33
  )
34
-
35
- # input
36
  with gr.Row():
37
- input_audio = gr.Audio(label="Upload or record your clean audio", type="filepath")
38
- enroll_audio = gr.Audio(label="Upload your enroll (target speaker) audio", type="filepath")
 
 
 
 
 
 
 
 
 
39
 
40
- # output
41
  with gr.Row():
 
42
  noisy_audio_output = gr.Audio(label="Noisy Audio (Processed input audio)", type="filepath")
43
  extracted_audio_output = gr.Audio(label="Extracted target speaker audio", type="filepath")
44
 
45
- # deal
46
  convert_button = gr.Button("Extract")
47
-
48
- # event
49
  convert_button.click(
50
  fn=gradio_TSE,
51
- inputs=[input_audio, enroll_audio],
52
- outputs=[noisy_audio_output, extracted_audio_output]
53
  )
54
 
55
  if __name__ == "__main__":
 
1
  import gradio as gr
2
+ import os
3
+ import soundfile as sf
4
+ import numpy as np
5
 
6
+ # 这是你现有的推理管线
7
+ from decode import InferencePipeline
8
+ from datahandler import AudioMixer, fix_audio_format
9
 
 
 
 
 
 
 
10
 
11
+
12
+ #####################################
13
+ # 这是你的推理 pipeline
14
+ #####################################
15
+ inter = InferencePipeline()
16
+ datamix = AudioMixer()
17
+
18
+ #####################################
19
+ # 这是供 Gradio 点击时调用的函数
20
+ #####################################
21
+ def gradio_TSE(input_audio_path, enroll_audio_path, audio_type):
22
  """
23
+ 如果 audio_type "clean",就调用 data_handler(此处是 produce_mixture_from_clean)
24
+ 把输入先变成 mix;如果是 "mix",则直接使用原始文件。
25
  """
26
+ print(f"User uploaded audio path: {input_audio_path}")
27
+ print(f"User enroll audio path: {enroll_audio_path}")
28
+ print(f"User chose audio_type: {audio_type}")
29
+
30
+ if audio_type == "clean":
31
+ # 先把 clean 转成 mix
32
+ mix_path = datamix.produce_mixture_from_clean(input_audio_path)
33
+ print(f"Converted clean -> mix: {mix_path}")
34
+ else:
35
+ # 如果是已经是混合音频,直接用它
36
+ mix_path = input_audio_path
37
+
38
+ input_wav = fix_audio_format(mix_path)
39
+ mix_wav = "mix.wav"
40
+ sf.write(mix_wav, input_wav, 16000)
41
+ enroll_wav = fix_audio_format(enroll_audio_path)
42
+ eol_wav = "eol.wav"
43
+ sf.write(eol_wav, enroll_wav, 16000)
44
+
45
+ est_path = inter.computer(mix_wav, eol_wav)
46
+ # 接下来走你的推理流程
47
+ return mix_path,est_path
48
+
49
+
50
+ #####################################
51
+ # 搭建 Gradio 界面
52
+ #####################################
53
  with gr.Blocks() as demo:
54
  gr.Markdown("## Target Speaker Extraction Demo")
55
  gr.Markdown(
56
+ "This demo can handle either clean audio (which we'll turn into a mix) or directly a mix audio."
 
57
  )
58
+
 
59
  with gr.Row():
60
+ # 上传或录制的“待处理”音频
61
+ input_audio = gr.Audio(label="Upload/record your audio", type="filepath")
62
+ # 让用户手动指定音频类型
63
+ audio_type = gr.Radio(
64
+ choices=["clean", "mix"],
65
+ value="clean",
66
+ label="Input audio type?"
67
+ )
68
+ with gr.Row():
69
+ # enroll 音频
70
+ enroll_audio = gr.Audio(label="Upload your enroll audio", type="filepath")
71
 
 
72
  with gr.Row():
73
+ # 输出:处理后的 noisy 和 提取的目标说话人
74
  noisy_audio_output = gr.Audio(label="Noisy Audio (Processed input audio)", type="filepath")
75
  extracted_audio_output = gr.Audio(label="Extracted target speaker audio", type="filepath")
76
 
77
+ # 点击按钮触发
78
  convert_button = gr.Button("Extract")
 
 
79
  convert_button.click(
80
  fn=gradio_TSE,
81
+ inputs=[input_audio, enroll_audio, audio_type],
82
+ outputs=[noisy_audio_output, extracted_audio_output]
83
  )
84
 
85
  if __name__ == "__main__":
config/config.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model:
3
+ _target_: model.spex_plus.SpEx_Plus # str, model class name
4
+ L1: 40
5
+ L2: 160
6
+ L3: 320
7
+ N: 256
8
+ B: 8
9
+ O: 256
10
+ P: 512
11
+ Q: 3
12
+ num_spks: 1410 # with speed perturbation 470 -> 1410
13
+ spk_embed_dim: 256
14
+ causal: false
15
+ is_innorm: true
16
+ fusion_type: 'cat' #cat mul film att
17
+
18
+
19
+ test:
20
+ checkpoint: "./ckpt/v2.0.pt.tar"
21
+ gpu: -1
22
+ sample_rate: 16000
23
+
datahandler.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import warnings
4
+ import numpy as np
5
+ import soundfile as sf
6
+ import pyloudnorm
7
+ import glob
8
+ import librosa
9
+
10
+ def fix_audio_format(audio_path, out_sr=16000):
11
+ """
12
+ 将音频读进来(自动识别格式)并强制重采样到 out_sr,转换为单声道。
13
+ 最终返回:
14
+ - data (numpy array): 处理后(单声道、out_sr)的音频数据
15
+ - sr (int): 处理后的采样率 (默认 16k)
16
+ 不写入临时文件,只做内存操作。
17
+ """
18
+ # librosa.load 会自动解析不同格式的音频(wav/mp3/flac等)
19
+ # 并将其重采样到 out_sr, 同时 mono=True 意味着转换为单声道
20
+ data, sr = librosa.load(audio_path, sr=out_sr, mono=True)
21
+
22
+ return data
23
+
24
+ class AudioMixer(object):
25
+ def __init__(
26
+ self,
27
+ sample_rate=16000,
28
+ mean_snr=-7,
29
+ var_snr=25,
30
+ mean_loudness=-24,
31
+ var_loudness=20
32
+ ):
33
+ """
34
+ 初始化一些参数、随机种子和响度计算工具等。
35
+ """
36
+ self.sample_rate = sample_rate
37
+ self.mean_snr = mean_snr
38
+ self.var_snr = var_snr
39
+ self.MEAN_LOUNDNESS = mean_loudness
40
+ self.VAR_LOUNDNESS = var_loudness
41
+
42
+ self.EPS = 1e-10
43
+ self.MAX_AMP = 0.9
44
+
45
+ # pyloudnorm 的 Meter,用于计算音频响度
46
+ self.meter = pyloudnorm.Meter(self.sample_rate)
47
+
48
+ # # 也可固定随机种子,保证每次混合一致(如果想要可复现)
49
+ # self.seed = 1453
50
+ # random.seed(self.seed)
51
+ # np.random.seed(self.seed)
52
+
53
+ def read_wav(self, wav_path):
54
+ """
55
+ 读取音频文件并返回 wave 数据和采样率
56
+ """
57
+ data, sr = sf.read(wav_path, dtype='float32')
58
+ # 如果读到的是多通道,可只取其中一个通道
59
+ if data.ndim > 1:
60
+ data = data[:, 0]
61
+ return data, sr
62
+
63
+ def normalize(self, signal, is_noise=False):
64
+ """
65
+ 对输入的 signal 做响度归一化,并确保不会过载失真。
66
+ """
67
+ c_loudness = self.meter.integrated_loudness(signal)
68
+ if is_noise:
69
+ # 噪声的目标响度可以偏高一些或随便设置
70
+ target_loudness = np.random.normal(self.MEAN_LOUNDNESS + 4, self.VAR_LOUNDNESS**0.5)
71
+ else:
72
+ # mix 或者语音的目标响度
73
+ target_loudness = np.random.normal(self.MEAN_LOUNDNESS, self.VAR_LOUNDNESS**0.5)
74
+
75
+ with warnings.catch_warnings():
76
+ warnings.filterwarnings("error", category=RuntimeWarning)
77
+ signal = pyloudnorm.normalize.loudness(signal, c_loudness, target_loudness)
78
+
79
+ # # 再检查是否会 clipping
80
+ # peak = np.max(np.abs(signal))
81
+ # if peak >= 1.0:
82
+ # signal = signal * self.MAX_AMP / peak
83
+
84
+ return signal
85
+
86
+ def snr_norm(self, signal, noise, is_noise=True):
87
+ """
88
+ 根据预设的 mean_snr、var_snr 来随机决定一个目标 SNR,然后
89
+ 以此对 noise 做缩放,得到与 signal 相匹配的噪声幅度。
90
+ """
91
+ if is_noise:
92
+ desired_snr = np.random.normal(self.mean_snr, self.var_snr**0.5)
93
+ else:
94
+ # 如果你还有别的需求,比如想做正 SNR 范围,可以改这里
95
+ desired_snr = np.random.uniform(2, 10)
96
+
97
+ current_snr = 10 * np.log10(
98
+ np.mean(signal ** 2) / (np.mean(noise ** 2) + self.EPS) + self.EPS
99
+ )
100
+ scale_factor = 10 ** ((current_snr - desired_snr) / 20)
101
+
102
+ scaled_noise = noise * scale_factor
103
+
104
+ # # 防止噪声自身 clipping
105
+ # peak = np.max(np.abs(scaled_noise))
106
+ # if peak >= 1.0:
107
+ # scaled_noise = scaled_noise * self.MAX_AMP / peak
108
+
109
+ return scaled_noise
110
+
111
+ def _mix(self, sources_list):
112
+ """
113
+ 将多路音频进行叠加,防止溢出。
114
+ """
115
+ # 假设 sources_list[0] 是 mix 音频,sources_list[1] 是已拼好长度的 noise
116
+ mix_length = len(sources_list[0])
117
+ mixture = np.zeros(mix_length, dtype=np.float32)
118
+ for s in sources_list:
119
+ mixture += s[:mix_length] # 仅叠加到 mix 的长度
120
+
121
+ # 再做一次峰值校正,避免溢出
122
+ peak = np.max(np.abs(mixture))
123
+ if peak >= 1.0:
124
+ mixture = mixture * self.MAX_AMP / peak
125
+
126
+ return mixture
127
+
128
+ def _prepare_noise_for_mix(self, noise_files, mix_length):
129
+ """
130
+ 传入一组 noise 文件路径,先对它们打乱,再依次读取、拼接。
131
+ 如果总长度还不够覆盖 mix_length,可以再次拼接自己(循环)。
132
+
133
+ - noise_files: 存储多个噪声文件路径的列表
134
+ - mix_length: 需要的总长度(采样点数)
135
+
136
+ 返回: 拼接后的 noise 波形
137
+ """
138
+ # 先随机打乱
139
+ random.shuffle(noise_files)
140
+
141
+ # 依次读取并拼接
142
+ noise_all = []
143
+ total_len = 0
144
+
145
+ # ���一次先拼完所有 noise 文件,如果还不够,就重复拼接
146
+ while total_len < mix_length:
147
+ for nf in noise_files:
148
+ noise_data, _ = self.read_wav(nf)
149
+
150
+ # 可选:对每条 noise 做一次 normalize,提升多样性
151
+ # (或者只在外部做一次统一的 normalize)
152
+ #noise_data = self.normalize(noise_data, is_noise=True)
153
+
154
+ noise_all.append(noise_data)
155
+ total_len += len(noise_data)
156
+
157
+ if total_len >= mix_length:
158
+ break
159
+
160
+ # 如果已经拼完一轮,可能还不够,就继续 while 循环再拼一轮
161
+
162
+ # 拼接后截断到 mix_length
163
+ concatenated_noise = np.concatenate(noise_all)[:mix_length]
164
+ return concatenated_noise
165
+
166
+ def mix_with_noise_folder(self, mix_wave,sr_mix noise_folder):
167
+ """
168
+ 读取一条 mix 文件和一个 noise 文件夹,做如下处理:
169
+ 1. 读取 mix wave,并做响度归一化
170
+ 2. 根据 mix 的长度,在 noise 文件夹中随机打乱全部 wav,依次拼接满足同长度
171
+ 3. 对最终拼好的 noise 做 snr_norm
172
+ 4. 叠加输出
173
+ """
174
+ # 1. 读取 mix
175
+ # mix_wave, sr_mix = self.read_wav(mix_path)
176
+
177
+ # 如果文件夹下找不到任何 noise 文件,就直接返回原音频
178
+ noise_files = sorted(glob.glob(os.path.join(noise_folder, "*.wav")))
179
+ if not noise_files:
180
+ raise RuntimeError(f"噪声文件夹 {noise_folder} 内未发现 .wav 文件")
181
+
182
+ mix_wave = self.normalize(mix_wave, is_noise=False)
183
+ mix_length = len(mix_wave)
184
+
185
+ # 2. 先把 noise 文件拼接到 match mix_length
186
+ # (会将 noise_files 打乱后依次读、拼接)
187
+ noise_ready = self._prepare_noise_for_mix(noise_files, mix_length)
188
+
189
+ # 3. SNR 调整
190
+ noise_ready = self.snr_norm(mix_wave, noise_ready, is_noise=True)
191
+
192
+ # 4. 叠加
193
+ mixture = self._mix([mix_wave, noise_ready])
194
+
195
+ out_noisy = "temp_noisy.wav" # 可以理解为把输入的混合音频直接另存为
196
+
197
+ # 返回混合后的音频以及采样率
198
+ sf.write(out_noisy, mixture, sr_mix)
199
+
200
+ return out_noisy
201
+
202
+
203
+ if __name__ == "__main__":
204
+ # 假设你有一个 mix.wav 以及一个 noise 文件夹(含若干个 .wav 噪声文件)
205
+ mix_path_test = "test_mix.wav"
206
+ mix_wave, sr_mix = self.read_wav(mix_path_test)
207
+ noise_folder_test = "noises/" # 比如里面有 10 条 noise*.wav
208
+
209
+ mixer = AudioMixer()
210
+
211
+ # 执行混合
212
+ mixed_wav, sr = mixer.mix_with_noise_folder(mix_wave, sr_mix, noise_folder_test)
213
+
214
+ # 这里你可以选择把结果写回本地文件,或直接返回 numpy 数组做后续处理
215
+ sf.write("test_output_mixture.wav", mixed_wav, sr)
216
+ print("混合完成,已输出到 test_output_mixture.wav")
decode.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ import os
4
+ import logging
5
+ import numpy as np
6
+ import torch as th
7
+ import soundfile as sf
8
+ import hydra
9
+ from omegaconf import OmegaConf
10
+
11
+
12
+
13
+ # ================ 网络推理类 ================
14
+ class NnetComputer(object):
15
+ def __init__(self, cpt_dir, gpuid, nnet_conf):
16
+ self.device = th.device(f"cuda:{gpuid}") if gpuid >= 0 else th.device("cpu")
17
+ nnet = self._load_nnet(cpt_dir, nnet_conf)
18
+ self.nnet = nnet.to(self.device) if gpuid >= 0 else nnet
19
+ self.nnet.eval()
20
+
21
+ def _load_nnet(self, cpt_dir, model):
22
+ cpt = th.load(cpt_dir, map_location="cpu")
23
+ model.load_state_dict(cpt["model_state_dict"])
24
+
25
+ return model
26
+
27
+ def compute(self, samps, aux_samps, aux_samps_len):
28
+ with th.no_grad():
29
+ raw = th.tensor(samps, dtype=th.float32, device=self.device)
30
+ aux = th.tensor(aux_samps, dtype=th.float32, device=self.device)
31
+ aux_len = th.tensor(aux_samps_len, dtype=th.float32, device=self.device)
32
+ aux = aux.unsqueeze(0)
33
+ print("raw",raw.shape)
34
+ print("aux",aux.shape)
35
+ sps, sps2, sps3, spk_pred = self.nnet(raw, aux, aux_len)
36
+ sp_samps = np.squeeze(sps.detach().cpu().numpy())
37
+ return sp_samps
38
+
39
+ class InferencePipeline:
40
+ """
41
+ 外部只需传入 config,即可完成:
42
+ 1) 模型实例化 (含 hydra.instantiate 逻辑)
43
+ 2) 加载 checkpoint
44
+ 3) 推理
45
+ """
46
+ def __init__(self, config):
47
+ """
48
+ 在构造时就把所有初始化做好,包括:
49
+ - hydra.instantiate(config.model) -> 得到一个 nn.Module
50
+ - 用 NnetComputer(...) 封装
51
+ """
52
+ # 如果 config.model 里含有 _target_ 字段,可以用 hydra.instantiate
53
+ # 注意: hydra.instantiate 需要在这里显式地导入 hydra.utils
54
+
55
+ # 1. 根据 config.model 构建模型
56
+ model_inst = hydra.utils.instantiate(config.model)
57
+
58
+ self.computer_ = NnetComputer(config.test.checkpoint,config.test.gpu, model_inst)
59
+
60
+ def run_inference(self, input_audio_path: str, enroll_audio_path: str) -> str:
61
+ """
62
+ 给定混合音频 + enroll 音频,执行推理并返回输出文件路径。
63
+ """
64
+ # 1. 读取音频
65
+ mix_samps, sr = sf.read(input_audio_path)
66
+ aux_samps, sr2 = sf.read(enroll_audio_path)
67
+
68
+ # 2. 调用底层 compute
69
+ samps = self.computer_.compute(mix_samps, aux_samps, len(aux_samps))
70
+ norm = np.linalg.norm(mix_samps, np.inf)
71
+ samps = samps[:mix_samps.size]
72
+ samps = samps * norm / np.max(np.abs(samps))
73
+
74
+ # 3. 写到临时文件
75
+ out_wav = "temp_extracted.wav"
76
+ sf.write(out_wav, samps, sr)
77
+ return out_wav
78
+
79
+ if __name__ == "__main__":
80
+ cfg = OmegaConf.load("config/config.yaml")
81
+ pipeline = InferencePipeline(cfg)
82
+
83
+ mix_path = "test_output_mixture.wav"
84
+ enroll_path = "test_mix.wav"
85
+ out_wav = pipeline.run_inference(mix_path, enroll_path)
86
+ print("Done:", out_wav)
87
+
88
+
model/__pycache__/cnns.cpython-37.pyc ADDED
Binary file (8.14 kB). View file
 
model/__pycache__/cnns.cpython-38.pyc ADDED
Binary file (8.01 kB). View file
 
model/__pycache__/norm.cpython-37.pyc ADDED
Binary file (3.88 kB). View file
 
model/__pycache__/norm.cpython-38.pyc ADDED
Binary file (3.77 kB). View file
 
model/__pycache__/spex_plus.cpython-37.pyc ADDED
Binary file (5.9 kB). View file
 
model/__pycache__/spex_plus.cpython-38.pyc ADDED
Binary file (5.98 kB). View file
 
{nnet → model}/cnns.py RENAMED
@@ -2,8 +2,9 @@
2
 
3
  import torch as th
4
  import torch.nn as nn
 
5
 
6
- from .norm import ChannelwiseLayerNorm, GlobalLayerNorm
7
 
8
  class Conv1D(nn.Conv1d):
9
  """
@@ -58,12 +59,23 @@ class TCNBlock(nn.Module):
58
  conv_channels=512,
59
  kernel_size=3,
60
  dilation=1,
61
- causal=False):
 
62
  super(TCNBlock, self).__init__()
63
  self.conv1x1 = Conv1D(in_channels, conv_channels, 1)
64
  self.prelu1 = nn.PReLU()
65
- self.norm1 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
66
- ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
 
 
 
 
 
 
 
 
 
 
67
  dconv_pad = (dilation * (kernel_size - 1)) // 2 if not causal else (
68
  dilation * (kernel_size - 1))
69
  self.dconv = nn.Conv1d(
@@ -75,8 +87,8 @@ class TCNBlock(nn.Module):
75
  dilation=dilation,
76
  bias=True)
77
  self.prelu2 = nn.PReLU()
78
- self.norm2 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
79
- ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
80
  self.sconv = nn.Conv1d(conv_channels, in_channels, 1, bias=True)
81
  self.causal = causal
82
  self.dconv_pad = dconv_pad
@@ -108,12 +120,40 @@ class TCNBlock_Spk(nn.Module):
108
  conv_channels=512,
109
  kernel_size=3,
110
  dilation=1,
111
- causal=False):
 
 
112
  super(TCNBlock_Spk, self).__init__()
113
- self.conv1x1 = Conv1D(in_channels+spk_embed_dim, conv_channels, 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  self.prelu1 = nn.PReLU()
115
- self.norm1 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
116
- ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
 
 
 
 
 
 
 
 
 
117
  dconv_pad = (dilation * (kernel_size - 1)) // 2 if not causal else (
118
  dilation * (kernel_size - 1))
119
  self.dconv = nn.Conv1d(
@@ -125,19 +165,80 @@ class TCNBlock_Spk(nn.Module):
125
  dilation=dilation,
126
  bias=True)
127
  self.prelu2 = nn.PReLU()
128
- self.norm2 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
129
- ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
130
  self.sconv = nn.Conv1d(conv_channels, in_channels, 1, bias=True)
131
  self.causal = causal
132
  self.dconv_pad = dconv_pad
133
  self.dilation = dilation
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  def forward(self, x, aux):
136
  # Repeatedly concated speaker embedding aux to each frame of the representation x
137
  T = x.shape[-1]
138
- aux = th.unsqueeze(aux, -1)
139
- aux = aux.repeat(1,1,T)
140
- y = th.cat([x, aux], 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  y = self.conv1x1(y)
142
  y = self.norm1(self.prelu1(y))
143
  y = self.dconv(y)
 
2
 
3
  import torch as th
4
  import torch.nn as nn
5
+ import torch.nn.functional as F
6
 
7
+ from .norm import ChannelwiseLayerNorm, GlobalLayerNorm, CumLN
8
 
9
  class Conv1D(nn.Conv1d):
10
  """
 
59
  conv_channels=512,
60
  kernel_size=3,
61
  dilation=1,
62
+ causal=False,
63
+ norm_type='gLN'):
64
  super(TCNBlock, self).__init__()
65
  self.conv1x1 = Conv1D(in_channels, conv_channels, 1)
66
  self.prelu1 = nn.PReLU()
67
+ # self.norm1 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
68
+ # ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
69
+ if norm_type == 'gLN':
70
+ self.norm1 = GlobalLayerNorm(conv_channels, elementwise_affine=True)
71
+ self.norm2 = GlobalLayerNorm(conv_channels, elementwise_affine=True)
72
+ elif norm_type == 'cLN':
73
+ self.norm1 = ChannelwiseLayerNorm(conv_channels, elementwise_affine=True)
74
+ self.norm2 = ChannelwiseLayerNorm(conv_channels, elementwise_affine=True)
75
+ elif norm_type == 'cgLN':
76
+ self.norm1 = CumLN(conv_channels, elementwise_affine=True)
77
+ self.norm2 = CumLN(conv_channels, elementwise_affine=True)
78
+
79
  dconv_pad = (dilation * (kernel_size - 1)) // 2 if not causal else (
80
  dilation * (kernel_size - 1))
81
  self.dconv = nn.Conv1d(
 
87
  dilation=dilation,
88
  bias=True)
89
  self.prelu2 = nn.PReLU()
90
+ # self.norm2 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
91
+ # ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
92
  self.sconv = nn.Conv1d(conv_channels, in_channels, 1, bias=True)
93
  self.causal = causal
94
  self.dconv_pad = dconv_pad
 
120
  conv_channels=512,
121
  kernel_size=3,
122
  dilation=1,
123
+ causal=False,
124
+ norm_type='gLN',
125
+ fusion_type='cat'):
126
  super(TCNBlock_Spk, self).__init__()
127
+ self.fusion_type = fusion_type
128
+ if fusion_type == 'cat':
129
+ self.conv1x1 = Conv1D(in_channels+spk_embed_dim, conv_channels, 1)
130
+ if fusion_type in ('add', 'mul'):
131
+ self.fusion_linear = nn.Linear(spk_embed_dim, in_channels)
132
+ self.conv1x1 = Conv1D(in_channels, conv_channels, 1)
133
+ if fusion_type == 'film':
134
+ self.fusion_linear_1 = nn.Linear(spk_embed_dim, in_channels)
135
+ self.fusion_linear_2 = nn.Linear(spk_embed_dim, in_channels)
136
+ self.conv1x1 = Conv1D(in_channels, conv_channels, 1)
137
+ if fusion_type == 'att':
138
+ self.fusion_linear = nn.Linear(spk_embed_dim, in_channels)
139
+ self.average = Conv1D(in_channels, in_channels, kernel_size, kernel_size, groups=in_channels)
140
+ self.average.weight = nn.Parameter(th.ones(in_channels, 1, kernel_size) / kernel_size)
141
+ self.average.bias = nn.Parameter(th.zeros(in_channels))
142
+ for p in self.average.parameters():
143
+ p.requires_grad = False
144
+ self.conv1x1 = Conv1D(in_channels, conv_channels, 1)
145
  self.prelu1 = nn.PReLU()
146
+ # self.norm1 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
147
+ # ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
148
+ if norm_type == 'gLN':
149
+ self.norm1 = GlobalLayerNorm(conv_channels, elementwise_affine=True)
150
+ self.norm2 = GlobalLayerNorm(conv_channels, elementwise_affine=True)
151
+ elif norm_type == 'cLN':
152
+ self.norm1 = ChannelwiseLayerNorm(conv_channels, elementwise_affine=True)
153
+ self.norm2 = ChannelwiseLayerNorm(conv_channels, elementwise_affine=True)
154
+ elif norm_type == 'cgLN':
155
+ self.norm1 = CumLN(conv_channels, elementwise_affine=True)
156
+ self.norm2 = CumLN(conv_channels, elementwise_affine=True)
157
  dconv_pad = (dilation * (kernel_size - 1)) // 2 if not causal else (
158
  dilation * (kernel_size - 1))
159
  self.dconv = nn.Conv1d(
 
165
  dilation=dilation,
166
  bias=True)
167
  self.prelu2 = nn.PReLU()
168
+ # self.norm2 = GlobalLayerNorm(conv_channels, elementwise_affine=True) if not causal else (
169
+ # ChannelwiseLayerNorm(conv_channels, elementwise_affine=True))
170
  self.sconv = nn.Conv1d(conv_channels, in_channels, 1, bias=True)
171
  self.causal = causal
172
  self.dconv_pad = dconv_pad
173
  self.dilation = dilation
174
 
175
+ def _concatenation(self, aux, output, L):
176
+ aux_concat = th.unsqueeze(aux, -1)
177
+ aux_concat = aux_concat.repeat(1, 1, L)
178
+ # -> [B, N(embeddings_size), L]
179
+ output = th.cat([output, aux_concat], 1)
180
+ # -> [B, N(input_size + embeddings_size), L]
181
+ return output
182
+
183
+ def _addition(self, aux, output, L, fusion_linear):
184
+ aux_add = fusion_linear(aux)
185
+ # -> [B, N(input_size)]
186
+ aux_add = th.unsqueeze(aux_add, -1)
187
+ aux_add = aux_add.repeat(1, 1, L)
188
+ # -> [B, N(input_size), L]
189
+ output = output + aux_add
190
+ # -> [B, N(input_size, L]
191
+ return output
192
+
193
+ def _multiplication(self, aux, output, L, fusion_linear):
194
+ aux_mul = fusion_linear(aux)
195
+ # -> [B, N(input_size)]
196
+ aux_mul = th.unsqueeze(aux_mul, -1)
197
+ aux_mul = aux_mul.repeat(1, 1, L)
198
+ # -> [B, N(input_size), L]
199
+ output = output * aux_mul
200
+ # -> [B, N(input_size, L]
201
+ return output
202
+
203
+ def _attention(self, aux, output, fusion_linear):
204
+ L = output.shape[-1]
205
+ aux_att = fusion_linear(aux)
206
+ aux_att = th.unsqueeze(aux_att, -1)
207
+ aux_att = aux_att.repeat(1, 1, L)
208
+ att = th.sum(output * aux_att, 1, keepdim=True)
209
+ att = F.softmax(att, -1)
210
+ att = att * aux_att
211
+ return att + aux_att
212
+
213
+ def _film(self, aux, output, L):
214
+ output = self._multiplication(aux, output, L, self.fusion_linear_1)
215
+ # -> [B, N(input_size, L]
216
+ output = self._addition(aux, output, L, self.fusion_linear_2)
217
+ # -> [B, N(input_size, L]
218
+ return output
219
+
220
  def forward(self, x, aux):
221
  # Repeatedly concated speaker embedding aux to each frame of the representation x
222
  T = x.shape[-1]
223
+ if self.fusion_type == 'cat':
224
+ y = self._concatenation(aux, x, T)
225
+ # -> [B, N(input_size + embeddings_size), L]
226
+ if self.fusion_type == 'add':
227
+ y = self._addition(aux, x, T, self.fusion_linear)
228
+ # -> [B, N(input_size), L]
229
+ if self.fusion_type == 'mul':
230
+ y = self._multiplication(aux, x, T, self.fusion_linear)
231
+ # -> [B, N(input_size), L]
232
+ if self.fusion_type == 'film':
233
+ y = self._film(aux, x, T)
234
+ # -> [B, N(input_size), L]
235
+ if self.fusion_type == 'att':
236
+ output_avg = self.average(x)
237
+ att_out = self._attention(aux, output_avg, self.fusion_linear)
238
+ upsampling = nn.Upsample(size=T, mode='nearest')
239
+ att_out = upsampling(att_out)
240
+ y = x * att_out
241
+
242
  y = self.conv1x1(y)
243
  y = self.norm1(self.prelu1(y))
244
  y = self.dconv(y)
{nnet → model}/norm.py RENAMED
@@ -22,6 +22,44 @@ class ChannelwiseLayerNorm(nn.LayerNorm):
22
  x = th.transpose(x, 1, 2)
23
  return x
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  class GlobalLayerNorm(nn.Module):
26
  """
27
  Global layer normalization
@@ -57,3 +95,4 @@ class GlobalLayerNorm(nn.Module):
57
  def extra_repr(self):
58
  return "{normalized_dim}, eps={eps}, " \
59
  "elementwise_affine={elementwise_affine}".format(**self.__dict__)
 
 
22
  x = th.transpose(x, 1, 2)
23
  return x
24
 
25
+
26
+ class CumLN(nn.Module):
27
+ """
28
+ Cumulative Global layer normalization
29
+ Input: 3D tensor with [batch_size(N), channel_size(C), frame_num(T)]
30
+ Output: 3D tensor with same shape
31
+ """
32
+
33
+ def __init__(self, dim, eps=1e-05, elementwise_affine=True):
34
+ super(CumLN, self).__init__()
35
+ self.eps = eps
36
+ self.elementwise_affine = elementwise_affine
37
+ self.normalized_dim = dim
38
+ if elementwise_affine:
39
+ self.beta = nn.Parameter(th.zeros(dim, 1))
40
+ self.gamma = nn.Parameter(th.ones(dim, 1))
41
+ else:
42
+ self.register_parameter("weight", None)
43
+ self.register_parameter("bias", None)
44
+
45
+ def forward(self, x):
46
+ if x.dim() != 3:
47
+ raise RuntimeError("{} requires a 3D tensor input".format(self.__class__.__name__))
48
+ batch, chan, spec_len = x.size()
49
+ cum_sum = th.cumsum(x.sum(1, keepdim=True), dim=-1)
50
+ cum_pow_sum = th.cumsum(x.pow(2).sum(1, keepdim=True), dim=-1) #th.cumsum 后加前 逐元素相加
51
+ cnt = th.arange(start=chan, end=chan * (spec_len + 1), step=chan, dtype=x.dtype, device=x.device).view(1, 1, -1)
52
+ cum_mean = cum_sum / cnt
53
+ cum_var = cum_pow_sum / cnt - cum_mean.pow(2)
54
+ normalized_x = (x - cum_mean) / (cum_var + self.eps).sqrt()
55
+ if self.elementwise_affine:
56
+ normalized_x = self.gamma * normalized_x + self.beta
57
+ return normalized_x
58
+
59
+ def extra_repr(self):
60
+ return "{normalized_dim}, eps={eps}, elementwise_affine={elementwise_affine}".format(**self.__dict__)
61
+
62
+
63
  class GlobalLayerNorm(nn.Module):
64
  """
65
  Global layer normalization
 
95
  def extra_repr(self):
96
  return "{normalized_dim}, eps={eps}, " \
97
  "elementwise_affine={elementwise_affine}".format(**self.__dict__)
98
+
{nnet → model}/spex_plus.py RENAMED
@@ -6,15 +6,9 @@ import torch.nn.functional as F
6
 
7
  from .norm import ChannelwiseLayerNorm, GlobalLayerNorm
8
  from .cnns import Conv1D, ConvTrans1D, TCNBlock, TCNBlock_Spk, ResBlock
 
9
 
10
- import torchaudio
11
- from .ResNet34 import Speaker_Encoder
12
- # from .sunine.trainer.utils import PreEmphasis
13
-
14
-
15
-
16
- # 考虑两种可能,频域就不大可能有所谓的多时间尺度,所以肯定speaker是直接频谱,那speech呢?
17
- # 注意下维度 是 B N T 还是 B T N
18
 
19
  class SpEx_Plus(nn.Module):
20
  def __init__(self,
@@ -28,14 +22,15 @@ class SpEx_Plus(nn.Module):
28
  Q=3,
29
  num_spks=101,
30
  spk_embed_dim=256,
31
- sample_rate = 16000,
32
- n_mels = 80,
33
  causal=False,
 
 
 
34
  ):
35
  super(SpEx_Plus, self).__init__()
 
36
  # n x S => n x N x T, S = 4s*8000 = 32000
37
- self.sample_rate = sample_rate
38
- self.n_mels = n_mels
39
  self.L1 = L1
40
  self.L2 = L2
41
  self.L3 = L3
@@ -43,82 +38,49 @@ class SpEx_Plus(nn.Module):
43
  self.encoder_1d_middle = Conv1D(1, N, L2, stride=L1 // 2, padding=0)
44
  self.encoder_1d_long = Conv1D(1, N, L3, stride=L1 // 2, padding=0)
45
  # before repeat blocks, always cLN
46
- self.ln = ChannelwiseLayerNorm(3*N)
47
- # n x N x T => n x O x T
48
- self.proj = Conv1D(3*N, O, 1)
49
- self.conv_block_1 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1)
50
- self.conv_block_1_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal)
51
- self.conv_block_2 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1)
52
- self.conv_block_2_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal)
53
- self.conv_block_3 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1)
54
- self.conv_block_3_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal)
55
- self.conv_block_4 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1)
56
- self.conv_block_4_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal)
57
- # n x O x T => n x N x T
58
- self.mask1 = Conv1D(O, N, 1)
59
- self.mask2 = Conv1D(O, N, 1)
60
- self.mask3 = Conv1D(O, N, 1)
61
- # using ConvTrans1D: n x N x T => n x 1 x To
62
- # To = (T - 1) * L // 2 + L
63
- #############################################################
64
  self.decoder_1d_short = ConvTrans1D(N, 1, kernel_size=L1, stride=L1 // 2, bias=True)
65
  self.decoder_1d_middle = ConvTrans1D(N, 1, kernel_size=L2, stride=L1 // 2, bias=True)
66
  self.decoder_1d_long = ConvTrans1D(N, 1, kernel_size=L3, stride=L1 // 2, bias=True)
67
  self.num_spks = num_spks
68
- # self.spk_encoder = nn.Sequential(
69
- # ChannelwiseLayerNorm(3*N),
70
- # Conv1D(3*N, O, 1),
71
- # ResBlock(O, O),
72
- # ResBlock(O, P),
73
- # ResBlock(P, P),
74
- # Conv1D(P, spk_embed_dim, 1),
75
- # )
76
-
77
- # self.pred_linear = nn.Linear(spk_embed_dim, num_spks)
78
-
79
- # 改为pretrain
80
- # 考虑两种可能,频域就不大可能有所谓的多时间尺度,所以肯定speaker是直接频谱,那speech呢?
81
- # /work105/youzhenghai/model/resnet_asp_aam_adamw_welr
82
- # import ..sunine/trainer/speaker encoder
83
- # **kwargs 无需关心 找到 self.hparams就行 按照 main_infer改就行
84
- #############################################################
85
-
86
- # # 1. Acoustic Feature
87
- # self.mel_trans = th.nn.Sequential(
88
- # PreEmphasis(),
89
- # torchaudio.transforms.MelSpectrogram(sample_rate=self.sample_rate, n_fft=512,
90
- # win_length=400, hop_length=160, window_fn=th.hamming_window, n_mels=self.n_mels)
91
- # )
92
-
93
- # self.instancenorm = nn.InstanceNorm1d(self.n_mels)
94
-
95
- # # 在调用的地方设置超参数 记得后面写为参数传入
96
- # self.hparams = {'embedding_dim': spk_embed_dim, 'pooling_type': 'ASP' , 'n_mels': self.n_mels}
97
- # # 使用 **self.hparams 调用函数
98
- # self.speaker_encoder = Speaker_Encoder(**self.hparams)
99
- self.speaker_embedding_extracter = Speaker_Model(pooling_type='ASP', spk_embed_dim=spk_embed_dim, sample_rate=self.sample_rate, n_mels=self.n_mels)
100
  self.pred_linear = nn.Linear(spk_embed_dim, num_spks)
101
-
102
- #############################################################
103
-
104
- # # 3. Loss / Classifier
105
- # if not self.hparams.evaluate:
106
- # LossFunction = importlib.import_module('trainer.loss.'+self.hparams.loss_type).__getattribute__('LossFunction')
107
- # self.loss = LossFunction(**dict(self.hparams))
108
-
109
-
110
- def _build_stacks(self, num_blocks, **block_kwargs):
111
- """
112
- Stack B numbers of TCN block, the first TCN block takes the speaker embedding
113
- """
114
- blocks = [
115
- TCNBlock(**block_kwargs, dilation=(2**b))
116
- for b in range(1,num_blocks)
117
- ]
118
- return nn.Sequential(*blocks)
119
- # 注意下维度 是 B N T 还是 B T N
120
-
121
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  def forward(self, x, aux, aux_len):
124
  if x.dim() >= 3:
@@ -128,8 +90,9 @@ class SpEx_Plus(nn.Module):
128
  # when inference, only one utt
129
  if x.dim() == 1:
130
  x = th.unsqueeze(x, 0)
131
-
132
  # n x 1 x S => n x N x T
 
 
133
  w1 = F.relu(self.encoder_1d_short(x))
134
  T = w1.shape[-1]
135
  xlen1 = x.shape[-1]
@@ -137,42 +100,79 @@ class SpEx_Plus(nn.Module):
137
  xlen3 = (T - 1) * (self.L1 // 2) + self.L3
138
  w2 = F.relu(self.encoder_1d_middle(F.pad(x, (0, xlen2 - xlen1), "constant", 0)))
139
  w3 = F.relu(self.encoder_1d_long(F.pad(x, (0, xlen3 - xlen1), "constant", 0)))
140
-
141
  # n x 3N x T
142
- y = self.ln(th.cat([w1, w2, w3], 1))
143
- # n x O x T
144
- y = self.proj(y)
145
-
146
  # speaker encoder (share params from speech encoder)
147
- # aux_w1 = F.relu(self.encoder_1d_short(aux))
148
- # aux_T_shape = aux_w1.shape[-1]
149
- # aux_len1 = aux.shape[-1]
150
- # aux_len2 = (aux_T_shape - 1) * (self.L1 // 2) + self.L2
151
- # aux_len3 = (aux_T_shape - 1) * (self.L1 // 2) + self.L3
152
- # aux_w2 = F.relu(self.encoder_1d_middle(F.pad(aux, (0, aux_len2 - aux_len1), "constant", 0)))
153
- # aux_w3 = F.relu(self.encoder_1d_long(F.pad(aux, (0, aux_len3 - aux_len1), "constant", 0)))
154
 
155
- # spk_encoder + mean pooling
156
- # aux = self.spk_encoder(th.cat([aux_w1, aux_w2, aux_w3], 1))
157
- # aux_T = (aux_len - self.L1) // (self.L1 // 2) + 1
158
- # aux_T = ((aux_T // 3) // 3) // 3
159
- # aux = th.sum(aux, -1)/aux_T.view(-1,1).float()
160
 
161
- # spk_encoder + TAP pooling
162
- aux = self.speaker_embedding_extracter(aux)
 
 
163
 
 
164
 
 
 
 
 
 
165
 
166
- #aux = torch.mean(aux, axis=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
- # aux = aux.cpu().detach().numpy()
 
 
 
 
 
 
 
 
169
 
170
- # 不需要 reshape N * D 是正确的维度
171
- #aux = aux.reshape(-1, self.hparams.nPerSpeaker, self.spk_embed_dim)
172
- # loss, acc = self.loss(x, label)
173
- # return loss.mean(), acc
174
- # 考虑 loss 是否也要
175
 
 
 
 
176
  y = self.conv_block_1(y, aux)
177
  y = self.conv_block_1_other(y)
178
  y = self.conv_block_2(y, aux)
@@ -186,62 +186,35 @@ class SpEx_Plus(nn.Module):
186
  m1 = F.relu(self.mask1(y))
187
  m2 = F.relu(self.mask2(y))
188
  m3 = F.relu(self.mask3(y))
189
- S1 = w1 * m1
190
- S2 = w2 * m2
191
- S3 = w3 * m3
192
-
193
- return self.decoder_1d_short(S1), self.decoder_1d_middle(S2)[:, :xlen1], self.decoder_1d_long(S3)[:, :xlen1], self.pred_linear(aux)
194
-
195
- class PreEmphasis(th.nn.Module):
196
- def __init__(self, coef: float = 0.97):
197
- super().__init__()
198
- self.coef = coef
199
- # make kernel
200
- # In pyth, the convolution operation uses cross-correlation. So, filter is flipped.
201
- self.register_buffer(
202
- 'flipped_filter', th.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
203
- )
204
 
205
- def forward(self, inputs: th.tensor) -> th.tensor:
206
- assert len(inputs.size()) == 2, 'The number of dimensions of inputs tensor must be 2!'
207
- # reflect padding to match lengths of in/out
208
- inputs = inputs.unsqueeze(1)
209
- inputs = F.pad(inputs, (1, 0), 'reflect')
210
- return F.conv1d(inputs, self.flipped_filter).squeeze(1)
211
 
212
 
213
  class Speaker_Model(nn.Module):
214
- #class Speaker_Model(LightningModule):
215
- def __init__(self, pooling_type, spk_embed_dim, sample_rate, n_mels):
216
- super().__init__()
217
- # self.save_hyperparameters()
218
-
219
- self.pooling_type = pooling_type
220
- self.spk_embed_dim = spk_embed_dim
221
- self.sample_rate = sample_rate
222
- self.n_mels = n_mels
223
- sr = self.sample_rate
224
-
225
- self.mel_trans = th.nn.Sequential(
226
- PreEmphasis(),
227
- torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=512,
228
- win_length=sr * 25 // 1000, hop_length=sr * 10 // 1000,
229
- window_fn=th.hamming_window, n_mels=self.n_mels)
230
- )
231
- self.instancenorm = nn.InstanceNorm1d(self.n_mels)
232
-
233
- self.hparams = {'embedding_dim': self.spk_embed_dim, 'pooling_type': self.pooling_type , 'n_mels': self.n_mels}
234
-
235
- self.speaker_encoder = Speaker_Encoder(**dict(self.hparams))
236
-
237
- def extract_speaker_embedding(self, data):
238
- x = data.reshape(-1, data.size()[-1])
239
- x = self.mel_trans(x) + 1e-6
240
- x = x.log()
241
- x = self.instancenorm(x)
242
- x = self.speaker_encoder(x)
243
- return x
244
-
245
- def forward(self, x):
246
- x = self.extract_speaker_embedding(x)
247
- return x
 
6
 
7
  from .norm import ChannelwiseLayerNorm, GlobalLayerNorm
8
  from .cnns import Conv1D, ConvTrans1D, TCNBlock, TCNBlock_Spk, ResBlock
9
+ import warnings
10
 
11
+ # inference aux_len
 
 
 
 
 
 
 
12
 
13
  class SpEx_Plus(nn.Module):
14
  def __init__(self,
 
22
  Q=3,
23
  num_spks=101,
24
  spk_embed_dim=256,
 
 
25
  causal=False,
26
+ norm_type='gLN',
27
+ fusion_type='cat',
28
+ is_innorm=False,
29
  ):
30
  super(SpEx_Plus, self).__init__()
31
+
32
  # n x S => n x N x T, S = 4s*8000 = 32000
33
+
 
34
  self.L1 = L1
35
  self.L2 = L2
36
  self.L3 = L3
 
38
  self.encoder_1d_middle = Conv1D(1, N, L2, stride=L1 // 2, padding=0)
39
  self.encoder_1d_long = Conv1D(1, N, L3, stride=L1 // 2, padding=0)
40
  # before repeat blocks, always cLN
41
+
42
+ self.instancenorm = nn.InstanceNorm1d(N)
43
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  self.decoder_1d_short = ConvTrans1D(N, 1, kernel_size=L1, stride=L1 // 2, bias=True)
45
  self.decoder_1d_middle = ConvTrans1D(N, 1, kernel_size=L2, stride=L1 // 2, bias=True)
46
  self.decoder_1d_long = ConvTrans1D(N, 1, kernel_size=L3, stride=L1 // 2, bias=True)
47
  self.num_spks = num_spks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  self.pred_linear = nn.Linear(spk_embed_dim, num_spks)
49
+ self.is_innorm = is_innorm
50
+
51
+ if causal and norm_type not in ["cgLN", "cLN"]:
52
+ norm_type = "cLN"
53
+ warnings.warn(
54
+ "In causal configuration cumulative layer normalization (cgLN)"
55
+ "or channel-wise layer normalization (chanLN) "
56
+ f"must be used. Changing {norm_type} to cLN"
57
+ )
58
+
59
+ self.speaker_encoder = Speaker_Model(
60
+ L1=L1,
61
+ L2=L2,
62
+ L3=L3,
63
+ N=N,
64
+ O=O,
65
+ P=P,
66
+ spk_embed_dim=spk_embed_dim,
67
+ )
68
+
69
+ self.extractor = Extractor(
70
+ L1=L1,
71
+ L2=L2,
72
+ L3=L3,
73
+ N=N,
74
+ B=B,
75
+ O=O,
76
+ P=P,
77
+ Q=Q,
78
+ num_spks=num_spks,
79
+ spk_embed_dim=spk_embed_dim,
80
+ causal=causal,
81
+ fusion_type=fusion_type,
82
+ norm_type=norm_type,
83
+ )
84
 
85
  def forward(self, x, aux, aux_len):
86
  if x.dim() >= 3:
 
90
  # when inference, only one utt
91
  if x.dim() == 1:
92
  x = th.unsqueeze(x, 0)
 
93
  # n x 1 x S => n x N x T
94
+
95
+
96
  w1 = F.relu(self.encoder_1d_short(x))
97
  T = w1.shape[-1]
98
  xlen1 = x.shape[-1]
 
100
  xlen3 = (T - 1) * (self.L1 // 2) + self.L3
101
  w2 = F.relu(self.encoder_1d_middle(F.pad(x, (0, xlen2 - xlen1), "constant", 0)))
102
  w3 = F.relu(self.encoder_1d_long(F.pad(x, (0, xlen3 - xlen1), "constant", 0)))
 
103
  # n x 3N x T
 
 
 
 
104
  # speaker encoder (share params from speech encoder)
105
+ aux_w1 = F.relu(self.encoder_1d_short(aux))
106
+ aux_T_shape = aux_w1.shape[-1]
107
+ aux_len1 = aux.shape[-1]
108
+ aux_len2 = (aux_T_shape - 1) * (self.L1 // 2) + self.L2
109
+ aux_len3 = (aux_T_shape - 1) * (self.L1 // 2) + self.L3
110
+ aux_w2 = F.relu(self.encoder_1d_middle(F.pad(aux, (0, aux_len2 - aux_len1), "constant", 0)))
111
+ aux_w3 = F.relu(self.encoder_1d_long(F.pad(aux, (0, aux_len3 - aux_len1), "constant", 0)))
112
 
113
+ aux = self.speaker_encoder(th.cat([aux_w1, aux_w2, aux_w3], 1), aux_len)
 
 
 
 
114
 
115
+ if self.is_innorm:
116
+ w1 = self.instancenorm(w1)
117
+ w2 = self.instancenorm(w2)
118
+ w3 = self.instancenorm(w3)
119
 
120
+ m1, m2, m3 = self.extractor(w1, w2, w3, aux)
121
 
122
+ S1 = w1 * m1
123
+ S2 = w2 * m2
124
+ S3 = w3 * m3
125
+
126
+ return self.decoder_1d_short(S1), self.decoder_1d_middle(S2)[:, :xlen1], self.decoder_1d_long(S3)[:, :xlen1], self.pred_linear(aux)
127
 
128
+ class Extractor(nn.Module):
129
+ def __init__(self,
130
+ L1=20,
131
+ L2=80,
132
+ L3=160,
133
+ N=256,
134
+ B=8,
135
+ O=256,
136
+ P=512,
137
+ Q=3,
138
+ num_spks=101,
139
+ spk_embed_dim=256,
140
+ causal=False,
141
+ fusion_type='cat',
142
+ norm_type='gLN',
143
+ ):
144
+ super(Extractor, self).__init__()
145
+ # n x N x T => n x O x T
146
+ self.ln = ChannelwiseLayerNorm(3*N)
147
+ self.proj = Conv1D(3*N, O, 1)
148
+ self.conv_block_1 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1,fusion_type=fusion_type,norm_type=norm_type)
149
+ self.conv_block_1_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal,norm_type=norm_type)
150
+ self.conv_block_2 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1,fusion_type=fusion_type,norm_type=norm_type)
151
+ self.conv_block_2_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal,norm_type=norm_type)
152
+ self.conv_block_3 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1,fusion_type=fusion_type,norm_type=norm_type)
153
+ self.conv_block_3_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal,norm_type=norm_type)
154
+ self.conv_block_4 = TCNBlock_Spk(spk_embed_dim=spk_embed_dim, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal, dilation=1,fusion_type=fusion_type,norm_type=norm_type)
155
+ self.conv_block_4_other = self._build_stacks(num_blocks=B, in_channels=O, conv_channels=P, kernel_size=Q, causal=causal,norm_type=norm_type)
156
+ # n x O x T => n x N x T
157
+ self.mask1 = Conv1D(O, N, 1)
158
+ self.mask2 = Conv1D(O, N, 1)
159
+ self.mask3 = Conv1D(O, N, 1)
160
 
161
+ def _build_stacks(self, num_blocks, **block_kwargs):
162
+ """
163
+ Stack B numbers of TCN block, the first TCN block takes the speaker embedding
164
+ """
165
+ blocks = [
166
+ TCNBlock(**block_kwargs, dilation=(2**b))
167
+ for b in range(1,num_blocks)
168
+ ]
169
+ return nn.Sequential(*blocks)
170
 
171
+ def forward(self, w1, w2, w3, aux):
 
 
 
 
172
 
173
+ y = self.ln(th.cat([w1, w2, w3], 1))
174
+ # n x O x T
175
+ y = self.proj(y)
176
  y = self.conv_block_1(y, aux)
177
  y = self.conv_block_1_other(y)
178
  y = self.conv_block_2(y, aux)
 
186
  m1 = F.relu(self.mask1(y))
187
  m2 = F.relu(self.mask2(y))
188
  m3 = F.relu(self.mask3(y))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
+ return m1, m2, m3
 
 
 
 
 
191
 
192
 
193
  class Speaker_Model(nn.Module):
194
+ def __init__(self,
195
+ L1=20,
196
+ L2=80,
197
+ L3=160,
198
+ N=256,
199
+ O=256,
200
+ P=512,
201
+ spk_embed_dim=256,
202
+ ):
203
+ super(Speaker_Model, self).__init__()
204
+ self.L1 = L1
205
+ self.L2 = L2
206
+ self.L3 = L3
207
+ self.spk_encoder = nn.Sequential(
208
+ ChannelwiseLayerNorm(3*N),
209
+ Conv1D(3*N, O, 1),
210
+ ResBlock(O, O),
211
+ ResBlock(O, P),
212
+ ResBlock(P, P),
213
+ Conv1D(P, spk_embed_dim, 1),
214
+ )
215
+ def forward(self, aux, aux_len):
216
+ aux = self.spk_encoder(aux)
217
+ aux_T = (aux_len - self.L1) // (self.L1 // 2) + 1
218
+ aux_T = ((aux_T // 3) // 3) // 3
219
+ aux = th.sum(aux, -1)/aux_T.view(-1,1).float()
220
+ return aux
 
 
 
 
 
 
 
nnet/ResNet34.py DELETED
@@ -1,213 +0,0 @@
1
- #! /usr/bin/python
2
- # -*- encoding: utf-8 -*-
3
- '''
4
- Fast ResNet
5
- https://arxiv.org/pdf/2003.11982.pdf
6
- '''
7
-
8
- import torch
9
- import torch.nn as nn
10
- import torch.nn.functional as F
11
- from torch.nn import Parameter
12
- try:
13
- from .pooling import *
14
- except:
15
- from pooling import *
16
-
17
- class SEBasicBlock(nn.Module):
18
- expansion = 1
19
-
20
- def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
21
- super(SEBasicBlock, self).__init__()
22
- self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
23
- self.bn1 = nn.BatchNorm2d(planes)
24
- self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
25
- self.bn2 = nn.BatchNorm2d(planes)
26
- self.relu = nn.ReLU(inplace=True)
27
- self.se = SELayer(planes, reduction)
28
- self.downsample = downsample
29
- self.stride = stride
30
-
31
- def forward(self, x):
32
- residual = x
33
-
34
- out = self.conv1(x)
35
- out = self.relu(out)
36
- out = self.bn1(out)
37
-
38
- out = self.conv2(out)
39
- out = self.bn2(out)
40
- out = self.se(out)
41
-
42
- if self.downsample is not None:
43
- residual = self.downsample(x)
44
-
45
- out += residual
46
- out = self.relu(out)
47
- return out
48
-
49
-
50
- class SEBottleneck(nn.Module):
51
- expansion = 4
52
-
53
- def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
54
- super(SEBottleneck, self).__init__()
55
- self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
56
- self.bn1 = nn.BatchNorm2d(planes)
57
- self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
58
- padding=1, bias=False)
59
- self.bn2 = nn.BatchNorm2d(planes)
60
- self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
61
- self.bn3 = nn.BatchNorm2d(planes * 4)
62
- self.relu = nn.ReLU(inplace=True)
63
- self.se = SELayer(planes * 4, reduction)
64
- self.downsample = downsample
65
- self.stride = stride
66
-
67
- def forward(self, x):
68
- residual = x
69
-
70
- out = self.conv1(x)
71
- out = self.bn1(out)
72
- out = self.relu(out)
73
-
74
- out = self.conv2(out)
75
- out = self.bn2(out)
76
- out = self.relu(out)
77
-
78
- out = self.conv3(out)
79
- out = self.bn3(out)
80
- out = self.se(out)
81
-
82
- if self.downsample is not None:
83
- residual = self.downsample(x)
84
-
85
- out += residual
86
- out = self.relu(out)
87
-
88
- return out
89
-
90
-
91
- class SELayer(nn.Module):
92
- def __init__(self, channel, reduction=8):
93
- super(SELayer, self).__init__()
94
- self.avg_pool = nn.AdaptiveAvgPool2d(1)
95
- self.fc = nn.Sequential(
96
- nn.Linear(channel, channel // reduction),
97
- nn.ReLU(inplace=True),
98
- nn.Linear(channel // reduction, channel),
99
- nn.Sigmoid()
100
- )
101
-
102
- def forward(self, x):
103
- b, c, _, _ = x.size()
104
- y = self.avg_pool(x).view(b, c)
105
- y = self.fc(y).view(b, c, 1, 1)
106
- return x * y
107
-
108
-
109
- class ResNetSE(nn.Module):
110
- def __init__(self, block, layers, num_filters, embedding_dim, n_mels=80, pooling_type="TSP", **kwargs):
111
- super(ResNetSE, self).__init__()
112
-
113
- self.inplanes = num_filters[0]
114
- self.conv1 = nn.Conv2d(1, num_filters[0] , kernel_size=3, stride=(1, 1), padding=1,
115
- bias=False)
116
- self.bn1 = nn.BatchNorm2d(num_filters[0])
117
- self.relu = nn.ReLU(inplace=True)
118
-
119
- self.layer1 = self._make_layer(block, num_filters[0], layers[0])
120
- self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2))
121
- self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2))
122
- self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(2, 2))
123
-
124
- out_dim = num_filters[3] * block.expansion * (n_mels//8)
125
-
126
- if pooling_type == "Temporal_Average_Pooling" or pooling_type == "TAP":
127
- self.pooling = Temporal_Average_Pooling()
128
- self.bn2 = nn.BatchNorm1d(out_dim)
129
- self.fc = nn.Linear(out_dim, embedding_dim)
130
- self.bn3 = nn.BatchNorm1d(embedding_dim)
131
-
132
- elif pooling_type == "Temporal_Statistics_Pooling" or pooling_type == "TSP":
133
- self.pooling = Temporal_Statistics_Pooling()
134
- self.bn2 = nn.BatchNorm1d(out_dim * 2)
135
- self.fc = nn.Linear(out_dim * 2, embedding_dim)
136
- self.bn3 = nn.BatchNorm1d(embedding_dim)
137
-
138
- elif pooling_type == "Self_Attentive_Pooling" or pooling_type == "SAP":
139
- self.pooling = Self_Attentive_Pooling(out_dim)
140
- self.bn2 = nn.BatchNorm1d(out_dim)
141
- self.fc = nn.Linear(out_dim, embedding_dim)
142
- self.bn3 = nn.BatchNorm1d(embedding_dim)
143
-
144
- elif pooling_type == "Attentive_Statistics_Pooling" or pooling_type == "ASP":
145
- self.pooling = Attentive_Statistics_Pooling(out_dim)
146
- self.bn2 = nn.BatchNorm1d(out_dim * 2)
147
- self.fc = nn.Linear(out_dim * 2, embedding_dim)
148
- self.bn3 = nn.BatchNorm1d(embedding_dim)
149
-
150
- else:
151
- raise ValueError('{} pooling type is not defined'.format(pooling_type))
152
-
153
-
154
- for m in self.modules():
155
- if isinstance(m, nn.Conv2d):
156
- nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
157
- elif isinstance(m, nn.BatchNorm2d):
158
- nn.init.constant_(m.weight, 1)
159
- nn.init.constant_(m.bias, 0)
160
-
161
- def _make_layer(self, block, planes, blocks, stride=1):
162
- downsample = None
163
- if stride != 1 or self.inplanes != planes * block.expansion:
164
- downsample = nn.Sequential(
165
- nn.Conv2d(self.inplanes, planes * block.expansion,
166
- kernel_size=1, stride=stride, bias=False),
167
- nn.BatchNorm2d(planes * block.expansion),
168
- )
169
-
170
- layers = []
171
- layers.append(block(self.inplanes, planes, stride, downsample))
172
- self.inplanes = planes * block.expansion
173
- for i in range(1, blocks):
174
- layers.append(block(self.inplanes, planes))
175
-
176
- return nn.Sequential(*layers)
177
-
178
- def forward(self, x):
179
- x = x.unsqueeze(1)
180
- x = self.conv1(x)
181
- x = self.bn1(x)
182
- x = self.relu(x)
183
-
184
- x = self.layer1(x)
185
- x = self.layer2(x)
186
- x = self.layer3(x)
187
- x = self.layer4(x)
188
-
189
- x = x.reshape(x.shape[0], -1, x.shape[-1])
190
-
191
- x = self.pooling(x)
192
- x = self.bn2(x)
193
- x = torch.flatten(x, 1)
194
- x = self.fc(x)
195
- x = self.bn3(x)
196
- return x
197
-
198
-
199
- def Speaker_Encoder(embedding_dim=256, **kwargs):
200
- # Number of filters
201
- num_filters = [32, 64, 128, 256]
202
- model = ResNetSE(SEBasicBlock, [3, 4, 6, 3], num_filters, embedding_dim, **kwargs)
203
- return model
204
-
205
- if __name__ == '__main__':
206
- model = Speaker_Encoder()
207
- total = sum([param.nelement() for param in model.parameters()])
208
- print(total/1e6)
209
- data = torch.randn(10, 80, 100)
210
- out = model(data)
211
- print(data.shape)
212
- print(out.shape)
213
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
nnet/__init__.py DELETED
File without changes
nnet/pooling.py DELETED
@@ -1,100 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
- from speechbrain.lobes.models.ECAPA_TDNN import AttentiveStatisticsPooling
5
-
6
- class Temporal_Average_Pooling(nn.Module):
7
- def __init__(self, **kwargs):
8
- """TAP
9
- Paper: Multi-Task Learning with High-Order Statistics for X-vector based Text-Independent Speaker Verification
10
- Link: https://arxiv.org/pdf/1903.12058.pdf
11
- """
12
- super(Temporal_Average_Pooling, self).__init__()
13
-
14
- def forward(self, x):
15
- """Computes Temporal Average Pooling Module
16
- Args:
17
- x (torch.Tensor): Input tensor (#batch, channels, frames).
18
- Returns:
19
- torch.Tensor: Output tensor (#batch, channels)
20
- """
21
- x = torch.mean(x, axis=2)
22
- return x
23
-
24
-
25
- class Temporal_Statistics_Pooling(nn.Module):
26
- def __init__(self, **kwargs):
27
- """TSP
28
- Paper: X-vectors: Robust DNN Embeddings for Speaker Recognition
29
- Link: http://www.danielpovey.com/files/2018_icassp_xvectors.pdf
30
- """
31
- super(Temporal_Statistics_Pooling, self).__init__()
32
-
33
- def forward(self, x):
34
- """Computes Temporal Statistics Pooling Module
35
- Args:
36
- x (torch.Tensor): Input tensor (#batch, channels, frames).
37
- Returns:
38
- torch.Tensor: Output tensor (#batch, channels*2)
39
- """
40
- mean = torch.mean(x, axis=2)
41
- var = torch.var(x, axis=2)
42
- x = torch.cat((mean, var), axis=1)
43
- return x
44
-
45
-
46
- ''' Self attentive weighted mean pooling.
47
- '''
48
- class Self_Attentive_Pooling(nn.Module):
49
- def __init__(self, dim, **kwargs):
50
- # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
51
- # attention dim = 128
52
- super(Self_Attentive_Pooling, self).__init__()
53
- self.linear1 = nn.Conv1d(dim, dim, kernel_size=1) # equals W and b in the paper
54
- self.linear2 = nn.Conv1d(dim, dim, kernel_size=1) # equals V and k in the paper
55
-
56
- def forward(self, x):
57
- # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
58
- alpha = torch.tanh(self.linear1(x))
59
- alpha = torch.softmax(self.linear2(alpha), dim=2)
60
- mean = torch.sum(alpha * x, dim=2)
61
- return mean
62
-
63
-
64
- ''' Attentive weighted mean and standard deviation pooling.
65
- '''
66
- class Attentive_Statistics_Pooling(nn.Module):
67
- def __init__(self, dim, **kwargs):
68
- # Use AttentiveStatisticsPooling and BatchNorm1d from speechbrain
69
- super(Attentive_Statistics_Pooling, self).__init__()
70
- self.pooling = AttentiveStatisticsPooling(dim)
71
-
72
- def forward(self, x):
73
- x = self.pooling(x)
74
- return x
75
-
76
- # class Attentive_Statistics_Pooling(nn.Module):
77
- # def __init__(self, dim, **kwargs):
78
- # # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
79
- # # attention dim = 128
80
- # super(Attentive_Statistics_Pooling, self).__init__()
81
- # self.linear1 = nn.Conv1d(dim, dim, kernel_size=1) # equals W and b in the paper
82
- # self.linear2 = nn.Conv1d(dim, dim, kernel_size=1) # equals V and k in the paper
83
- #
84
- # def forward(self, x):
85
- # # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
86
- # alpha = torch.tanh(self.linear1(x))
87
- # alpha = torch.softmax(self.linear2(alpha), dim=2)
88
- # mean = torch.sum(alpha * x, dim=2)
89
- # residuals = torch.sum(alpha * x ** 2, dim=2) - mean ** 2
90
- # std = torch.sqrt(residuals.clamp(min=1e-9))
91
- # return torch.cat([mean, std], dim=1)
92
-
93
-
94
-
95
- if __name__ == "__main__":
96
- data = torch.randn(10, 128, 100)
97
- pooling = Self_Attentive_Pooling(128)
98
- out = pooling(data)
99
- print(data.shape)
100
- print(out.shape)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
nnet/speaker_encoder.py DELETED
@@ -1,47 +0,0 @@
1
- import torch
2
- import torchaudio
3
- import torch.nn as nn
4
- from torch.nn import functional as F
5
- from .ResNet34 import Speaker_Encoder
6
-
7
-
8
- class Speaker_Model(torch.nn.Module):
9
- #class Speaker_Model(LightningModule):
10
- def __init__(self, pooling_type, spk_embed_dim, sample_rate, n_mels):
11
- super().__init__()
12
- # self.save_hyperparameters()
13
-
14
- self.pooling_type = pooling_type
15
- self.spk_embed_dim = spk_embed_dim
16
- self.sample_rate = sample_rate
17
- self.n_mels = n_mels
18
- sr = self.sample_rate
19
-
20
- self.mel_trans = torch.nn.Sequential(
21
- PreEmphasis(),
22
- torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=512,
23
- win_length=sr * 25 // 1000, hop_length=sr * 10 // 1000,
24
- window_fn=torch.hamming_window, n_mels=self.n_mels)
25
- )
26
- self.instancenorm = nn.InstanceNorm1d(self.n_mels)
27
-
28
- self.hparams = {'embedding_dim': self.spk_embed_dim, 'pooling_type': self.pooling_type , 'n_mels': self.n_mels}
29
-
30
- self.speaker_encoder = Speaker_Encoder(**dict(self.hparams))
31
-
32
- class PreEmphasis(torch.nn.Module):
33
- def __init__(self, coef: float = 0.97):
34
- super().__init__()
35
- self.coef = coef
36
- # make kernel
37
- # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped.
38
- self.register_buffer(
39
- 'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
40
- )
41
-
42
- def forward(self, inputs: torch.tensor) -> torch.tensor:
43
- assert len(inputs.size()) == 2, 'The number of dimensions of inputs tensor must be 2!'
44
- # reflect padding to match lengths of in/out
45
- inputs = inputs.unsqueeze(1)
46
- inputs = F.pad(inputs, (1, 0), 'reflect')
47
- return F.conv1d(inputs, self.flipped_filter).squeeze(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
noises/00840.wav ADDED
Binary file (963 kB)

noises/022928.wav ADDED
Binary file (961 kB)

noises/04338.wav ADDED
Binary file (959 kB)

noises/046324.wav ADDED
Binary file (961 kB)

noises/093004.wav ADDED
Binary file (960 kB)

noises/11129.wav ADDED
Binary file (963 kB)

noises/133254.wav ADDED
Binary file (960 kB)

noises/30100.wav ADDED
Binary file (959 kB)

noises/30135.wav ADDED
Binary file (959 kB)

noises/30437.wav ADDED
Binary file (963 kB)

noises/30603.wav ADDED
Binary file (959 kB)
requirements.txt CHANGED
@@ -1,2 +1,7 @@
- soundfile
+ soundfile==0.12.1
  gradio
+ hydra-core==1.3.2
+ torch==1.11.0
+ pyloudnorm
+ numpy==1.24.4
+ librosa
temp_extracted.wav ADDED
Binary file (180 kB)

test_mix.wav ADDED
Binary file (180 kB)

test_output_mixture.wav ADDED
Binary file (180 kB)
utils/__init__.py DELETED
File without changes
utils/audio.py DELETED
@@ -1,124 +0,0 @@
- #!/usr/bin/env python
-
- import os
- import numpy as np
- import soundfile as sf
-
-
- def write_wav(fname, samps, sample_rate=16000, normalize=True):
-     """
-     Write wav files in float32, supports single/multi-channel
-     """
-     # WHAM and WHAMR mixture/clean data are float32, so scipy.io.wavfile
-     # (int16) cannot be used to read and write them. soundfile is used
-     # instead; it also reads int16 reference speech and outputs floats.
-     fdir = os.path.dirname(fname)
-     if fdir and not os.path.exists(fdir):
-         os.makedirs(fdir)
-     sf.write(fname, samps, sample_rate, subtype='FLOAT')
-
-
- def read_wav(fname, normalize=True, return_rate=False):
-     """
-     Read wave files (supports multi-channel)
-     """
-     samps, samp_rate = sf.read(fname)
-     if return_rate:
-         return samp_rate, samps
-     return samps
-
-
- def parse_scripts(scp_path, value_processor=lambda x: x, num_tokens=2):
-     """
-     Parse Kaldi's script (.scp) file.
-     If num_tokens >= 2, the function checks the token count per line.
-     """
-     scp_dict = dict()
-     line = 0
-     with open(scp_path, "r") as f:
-         for raw_line in f:
-             scp_tokens = raw_line.strip().split()
-             line += 1
-             if (num_tokens >= 2 and len(scp_tokens) != num_tokens) or len(scp_tokens) < 2:
-                 raise RuntimeError(
-                     "For {}, format error in line[{:d}]: {}".format(
-                         scp_path, line, raw_line))
-             if num_tokens == 2:
-                 key, value = scp_tokens
-             else:
-                 key, value = scp_tokens[0], scp_tokens[1:]
-             if key in scp_dict:
-                 raise ValueError("Duplicated key \'{0}\' exists in {1}".format(
-                     key, scp_path))
-             scp_dict[key] = value_processor(value)
-     return scp_dict
-
-
- class Reader(object):
-     """
-     Basic Reader Class
-     """
-     def __init__(self, scp_path, value_processor=lambda x: x):
-         self.index_dict = parse_scripts(
-             scp_path, value_processor=value_processor, num_tokens=2)
-         self.index_keys = list(self.index_dict.keys())
-
-     def _load(self, key):
-         # return path
-         return self.index_dict[key]
-
-     # number of utterances
-     def __len__(self):
-         return len(self.index_dict)
-
-     # avoid key errors
-     def __contains__(self, key):
-         return key in self.index_dict
-
-     # sequential index
-     def __iter__(self):
-         for key in self.index_keys:
-             yield key, self._load(key)
-
-     # random access, supports str/int as index
-     def __getitem__(self, index):
-         if type(index) not in [int, str]:
-             raise IndexError("Unsupported index type: {}".format(type(index)))
-         if type(index) == int:
-             # from int index to key
-             num_utts = len(self.index_keys)
-             if index >= num_utts or index < 0:
-                 raise KeyError(
-                     "Integer index out of range, {:d} vs {:d}".format(
-                         index, num_utts))
-             index = self.index_keys[index]
-         if index not in self.index_dict:
-             raise KeyError("Missing utterance {}!".format(index))
-         return self._load(index)
-
-
- class WaveReader(Reader):
-     """
-     Sequential/Random Reader for single-channel wave.
-     Format of wav.scp follows Kaldi's definition:
-         key1 /path/to/wav
-         ...
-     """
-     def __init__(self, wav_scp, sample_rate=None, normalize=True):
-         super(WaveReader, self).__init__(wav_scp)
-         self.samp_rate = sample_rate
-         self.normalize = normalize
-
-     def _load(self, key):
-         # return C x N or N
-         samp_rate, samps = read_wav(
-             self.index_dict[key], normalize=self.normalize, return_rate=True)
-         # if a sample rate is given, check it
-         if self.samp_rate is not None and samp_rate != self.samp_rate:
-             raise RuntimeError("SampleRate mismatch: {:d} vs {:d}".format(
-                 samp_rate, self.samp_rate))
-         return samps
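
For reference, wav.scp follows Kaldi's `key /path/to/wav` convention, and parse_scripts turns it into a dict. A tiny sketch of the round trip (keys and paths are made up; assumes the deleted utils.audio module is importable):

```python
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".scp", delete=False) as f:
    f.write("utt1 /data/utt1.wav\n")
    f.write("utt2 /data/utt2.wav\n")
    scp_path = f.name

table = parse_scripts(scp_path)  # from the deleted utils.audio
print(table)  # {'utt1': '/data/utt1.wav', 'utt2': '/data/utt2.wav'}
```
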
utils/dataset copy.py DELETED
@@ -1,284 +0,0 @@
- #!/usr/bin/env python
-
- import random
- import torch as th
- import numpy as np
-
- from torch.utils.data.dataloader import default_collate
- import torch.utils.data as dat
- from torch.nn.utils.rnn import pad_sequence
-
- from .audio import WaveReader
-
- import soundfile as sf
-
- # random_seed = 1453
- # random.seed(random_seed)
-
-
- def make_dataloader(train=True,
-                     utt_scp_file=None,
-                     spk_list=None,
-                     sample_rate=16000,
-                     num_workers=4,
-                     chunk_size=32000,
-                     batch_size=16):
-     dataset = Dataset(utt_scp_file=utt_scp_file,
-                       spk_list=spk_list,
-                       chunk_size=chunk_size,
-                       sample_rate=sample_rate)
-     return DataLoader(dataset,
-                       train=train,
-                       chunk_size=chunk_size,
-                       batch_size=batch_size,
-                       num_workers=num_workers)
-
-
- class Dataset(object):
-     """
-     Per-Utterance Loader
-     """
-     def __init__(self, utt_scp_file="", spk_list=None, chunk_size=32000, sample_rate=8000):
-         self.sample_rate = sample_rate
-         self.spk_list = self._load_spk(spk_list)
-
-         self.seg_least = int(chunk_size // 2)
-
-         # self.mix = WaveReader(mix_scp, sample_rate=sample_rate)
-         # self.ref = WaveReader(ref_scp, sample_rate=sample_rate)
-         # self.aux = WaveReader(aux_scp, sample_rate=sample_rate)
-
-         with open(utt_scp_file, 'r') as f:
-             lines = f.readlines()
-         self.data = []
-         for line in lines:
-             parts = line.strip().split()
-             sentence_id = parts[0]
-             sentence_path = parts[1]
-             data_len = parts[2]
-             spk_id = (sentence_id.split('-')[0])[1:5]
-             self.data.append((sentence_id, spk_id, sentence_path, data_len))
-
-         if not self.data:
-             raise ValueError("No valid lines found in the input file.")
-         self.total_lines = len(self.data)
-
-     def _load_spk(self, spk_list_path):
-         if spk_list_path is None:
-             return []
-         lines = open(spk_list_path).readlines()
-         new_lines = []
-         for line in lines:
-             new_lines.append(line.strip())
-         return new_lines
-
-     def __len__(self):
-         return len(self.data)
-
-     def _get_segment_start_stop(self, seg_len, length):
-         if seg_len is not None:
-             start = random.randint(0, length - seg_len)
-             stop = start + seg_len
-         else:
-             start = 0
-             stop = None
-         return start, stop
-
-     def _mix(self, sources_list):
-         # if self.seg_len:
-         #     mix_length = self.seg_len
-         # else:
-         #     mix_length = self.common_length
-         mix_length = self.common_length
-         mixture = np.zeros(mix_length)
-         for i, _ in enumerate(sources_list):
-             mixture += sources_list[i]
-         return mixture
-
-     def __getitem__(self, idx):
-         source_id, source_spk, source_path, all_source_length = self.data[idx]
-         all_source_length = int(all_source_length)
-         spk_idx = self.spk_list.index(source_spk)
-
-         # pick an interfering utterance from a different speaker
-         other_counter = 0
-         while True:
-             random_idx = np.random.randint(0, self.total_lines)
-             if self.data[random_idx][1] != source_spk:
-                 other_id, other_spk, other_path, other_length = self.data[random_idx]
-                 other_length = int(other_length)
-                 if other_length > self.seg_least:
-                     break
-             other_counter += 1
-             if other_counter >= self.total_lines:
-                 raise ValueError("All data are too short to mix")
-
-         # pick an enrollment utterance from the same speaker
-         enroll_counter = 0
-         while True:
-             random_idx = np.random.randint(0, self.total_lines)
-             if self.data[random_idx][1] == source_spk:
-                 enroll_id, enroll_spk, enroll_path, all_enroll_length = self.data[random_idx]
-                 all_enroll_length = int(all_enroll_length)
-                 if all_enroll_length > self.seg_least:
-                     break
-             enroll_counter += 1
-             if enroll_counter >= self.total_lines:
-                 raise ValueError("All data are too short to enroll")
-
-         # lengths = [all_source_length, other_length]
-
-         # crop the longer utterance down to the shorter one's length
-         if all_source_length >= other_length:
-             self.common_length = other_length
-             start, stop = self._get_segment_start_stop(other_length, all_source_length)
-             source_tmp, _ = sf.read(source_path, dtype="float32", start=start, stop=stop)
-             other_tmp, _ = sf.read(other_path, dtype="float32")
-         else:
-             self.common_length = all_source_length
-             start, stop = self._get_segment_start_stop(all_source_length, other_length)
-             source_tmp, _ = sf.read(source_path, dtype="float32")
-             other_tmp, _ = sf.read(other_path, dtype="float32", start=start, stop=stop)
-
-         # randomly pick one channel from each multi-channel recording
-         source = source_tmp[:, np.random.randint(0, source_tmp.shape[1])]
-         other = other_tmp[:, np.random.randint(0, other_tmp.shape[1])]
-
-         mixture = self._mix([source, other])
-         mixture = mixture.astype(np.float32)
-
-         enroll_tmp, _ = sf.read(enroll_path, dtype="float32")
-         enroll = enroll_tmp[:, np.random.randint(0, enroll_tmp.shape[1])]
-
-         return {
-             "mix": mixture,
-             "ref": source,
-             "aux": enroll,
-             "aux_len": len(enroll),
-             "spk_idx": spk_idx
-         }
-
-
- class ChunkSplitter(object):
-     """
-     Split utterance into small chunks
-     """
-     def __init__(self, chunk_size, train=True, least=16000):
-         self.chunk_size = chunk_size
-         self.least = least
-         self.train = train
-
-     def _make_chunk(self, eg, s):
-         """
-         Make a chunk instance, which contains:
-             "mix": ndarray,
-             "ref": [ndarray...]
-         """
-         chunk = dict()
-         chunk["mix"] = eg["mix"][s:s + self.chunk_size]
-         chunk["ref"] = eg["ref"][s:s + self.chunk_size]
-         chunk["aux"] = eg["aux"]
-         chunk["aux_len"] = eg["aux_len"]
-         chunk["valid_len"] = int(self.chunk_size)
-         chunk["spk_idx"] = eg["spk_idx"]
-         return chunk
-
-     def split(self, eg):
-         N = eg["mix"].size
-         # too short: throw it away
-         if N < self.least:
-             return []
-         chunks = []
-         # pad with zeros
-         if N < self.chunk_size:
-             P = self.chunk_size - N
-             chunk = dict()
-             chunk["mix"] = np.pad(eg["mix"], (0, P), "constant")
-             chunk["ref"] = np.pad(eg["ref"], (0, P), "constant")
-             chunk["aux"] = eg["aux"]
-             chunk["aux_len"] = eg["aux_len"]
-             chunk["valid_len"] = int(N)
-             chunk["spk_idx"] = eg["spk_idx"]
-             chunks.append(chunk)
-         else:
-             # randomly select the start point for training
-             s = random.randint(0, N % self.least) if self.train else 0
-             while True:
-                 if s + self.chunk_size > N:
-                     break
-                 chunk = self._make_chunk(eg, s)
-                 chunks.append(chunk)
-                 s += self.least
-         return chunks
-
-
- class DataLoader(object):
-     """
-     Online dataloader at chunk level
-     """
-     def __init__(self,
-                  dataset,
-                  num_workers=4,
-                  chunk_size=32000,
-                  batch_size=16,
-                  train=True):
-         self.batch_size = batch_size
-         self.train = train
-         self.splitter = ChunkSplitter(chunk_size,
-                                       train=train,
-                                       least=chunk_size // 2)
-         # just return a batch of egs, supports multiple workers
-         self.eg_loader = dat.DataLoader(dataset,
-                                         batch_size=batch_size // 2,
-                                         num_workers=num_workers,
-                                         shuffle=train,
-                                         collate_fn=self._collate)
-
-     def _collate(self, batch):
-         """
-         Split utterances online
-         """
-         chunk = []
-         for eg in batch:
-             chunk += self.splitter.split(eg)
-         return chunk
-
-     def _pad_aux(self, chunk_list):
-         lens_list = []
-         for chunk_item in chunk_list:
-             lens_list.append(chunk_item['aux_len'])
-         max_len = np.max(lens_list)
-
-         for idx in range(len(chunk_list)):
-             P = max_len - len(chunk_list[idx]["aux"])
-             chunk_list[idx]["aux"] = np.pad(chunk_list[idx]["aux"], (0, P), "constant")
-         return chunk_list
-
-     def _merge(self, chunk_list):
-         """
-         Merge chunk list into mini-batches
-         """
-         N = len(chunk_list)
-         if self.train:
-             random.shuffle(chunk_list)
-         blist = []
-         for s in range(0, N - self.batch_size + 1, self.batch_size):
-             # padding aux info
-             # self._pad_aux(chunk_list[s:s + self.batch_size])
-             batch = default_collate(self._pad_aux(chunk_list[s:s + self.batch_size]))
-             blist.append(batch)
-         rn = N % self.batch_size
-         return blist, chunk_list[-rn:] if rn else []
-
-     def __iter__(self):
-         chunk_list = []
-         for chunks in self.eg_loader:
-             chunk_list += chunks
-             batch, chunk_list = self._merge(chunk_list)
-             for obj in batch:
-                 yield obj
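
ChunkSplitter's policy above: drop utterances shorter than `least`, zero-pad anything shorter than `chunk_size` into a single chunk, otherwise slide a window with hop `least` (training additionally randomizes the start). A standalone re-implementation of the non-random path, just to illustrate the resulting (start, valid_len) pairs:

```python
def split_lengths(n, chunk_size=32000, least=16000):
    """Mirror ChunkSplitter.split for train=False, returning (start, valid_len)."""
    if n < least:
        return []            # too short: discarded
    if n < chunk_size:
        return [(0, n)]      # one zero-padded chunk with valid length n
    return [(s, chunk_size) for s in range(0, n - chunk_size + 1, least)]


print(split_lengths(12000))  # []
print(split_lengths(20000))  # [(0, 20000)]
print(split_lengths(80000))  # [(0, 32000), (16000, 32000), (32000, 32000), (48000, 32000)]
```
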
utils/dataset.py DELETED
@@ -1,402 +0,0 @@
- #!/usr/bin/env python
-
- import random
- import torch as th
- import numpy as np
-
- from torch.utils.data.dataloader import default_collate
- import torch.utils.data as dat
- from torch.nn.utils.rnn import pad_sequence
-
- from .audio import WaveReader
-
- import soundfile as sf
-
- # random_seed = 1453
- # random.seed(random_seed)
-
- # "aux_len": all_enroll_length,
-
- EPS = 1e-10
-
-
- def make_dataloader(train=True,
-                     mix_scp_file=None,
-                     enroll_scp_file=None,
-                     noise_scp_file=None,
-                     spk_list=None,
-                     sample_rate=16000,
-                     num_workers=4,
-                     chunk_size=32000,
-                     batch_size=16):
-     dataset = Dataset(mix_scp_file=mix_scp_file,
-                       enroll_scp_file=enroll_scp_file,
-                       noise_scp_file=noise_scp_file,
-                       spk_list=spk_list,
-                       chunk_size=chunk_size,
-                       sample_rate=sample_rate)
-     return DataLoader(dataset,
-                       train=train,
-                       chunk_size=chunk_size,
-                       batch_size=batch_size,
-                       num_workers=num_workers)
-
-
- class Dataset(object):
-     """
-     Per-Utterance Loader
-     """
-     def __init__(self, mix_scp_file="", enroll_scp_file="", noise_scp_file="", spk_list=None, chunk_size=32000, sample_rate=8000):
-         self.sample_rate = sample_rate
-         self.spk_list = self._load_spk(spk_list)
-
-         self.seg_least = int(chunk_size // 2)
-
-         with open(mix_scp_file, 'r') as f:
-             lines = f.readlines()
-         self.data = []
-         for line in lines:
-             parts = line.strip().split()
-             sentence_id = parts[0]
-             sentence_path = parts[1]
-             data_len = parts[2]
-             spk_id = (sentence_id.split('-')[0])[1:5]
-             self.data.append((sentence_id, spk_id, sentence_path, data_len))
-
-         with open(enroll_scp_file, 'r') as f:
-             enroll_lines = f.readlines()
-         self.enroll_data = []
-         for line in enroll_lines:
-             parts = line.strip().split()
-             sentence_id = parts[0]
-             sentence_path = parts[1]
-             data_len = parts[2]
-             spk_id = (sentence_id.split('-')[0])[1:5]
-             self.enroll_data.append((sentence_id, spk_id, sentence_path, data_len))
-
-         with open(noise_scp_file, 'r') as f:
-             noise_lines = f.readlines()
-         self.noise_data = []
-         for line in noise_lines:
-             parts = line.strip().split()
-             sentence_id = parts[0]
-             sentence_path = parts[1]
-             data_len = parts[2]
-             # spk_id = (sentence_id.split('-')[0])[1:5]
-             self.noise_data.append((sentence_id, sentence_path, data_len))
-
-         self.total_lines = len(self.data)
-         self.total_enroll = self._enroll_data_len()
-         self.total_noise = self._noise_data_len()
-
-         if not self.data:
-             raise ValueError("No valid lines found in the input file.")
-
-     def _load_spk(self, spk_list_path):
-         if spk_list_path is None:
-             return []
-         lines = open(spk_list_path).readlines()
-         new_lines = []
-         for line in lines:
-             new_lines.append(line.strip())
-         return new_lines
-
-     def __len__(self):
-         return len(self.data)
-
-     def _enroll_data_len(self):
-         return len(self.enroll_data)
-
-     def _noise_data_len(self):
-         return len(self.noise_data)
-
-     def _get_segment_start_stop(self, seg_len, length):
-         if seg_len is not None:
-             start = random.randint(0, length - seg_len)
-             stop = start + seg_len
-         else:
-             start = 0
-             stop = None
-         return start, stop
-
-     def _mix(self, sources_list):
-         # if self.seg_len:
-         #     mix_length = self.seg_len
-         # else:
-         #     mix_length = self.common_length
-         mix_length = self.common_length
-         mixture = np.zeros(mix_length)
-         for i, _ in enumerate(sources_list):
-             mixture += sources_list[i]
-         return mixture
-
-     def __getitem__(self, idx):
-         source_id, source_spk, source_path, all_source_length = self.data[idx]
-         all_source_length = int(all_source_length)
-         spk_idx = self.spk_list.index(source_spk)
-
-         # pick an interfering utterance from a different speaker
-         other_counter = 0
-         while True:
-             random_idx = np.random.randint(0, self.total_lines)
-             if self.data[random_idx][1] != source_spk:
-                 other_id, other_spk, other_path, other_length = self.data[random_idx]
-                 other_length = int(other_length)
-                 if other_length > self.seg_least:
-                     break
-             other_counter += 1
-             if other_counter >= self.total_lines:
-                 raise ValueError("All data are too short to mix")
-
-         # crop the longer utterance down to the shorter one's length
-         if all_source_length >= other_length:
-             self.common_length = other_length
-             start, stop = self._get_segment_start_stop(self.common_length, all_source_length)
-             source_tmp, _ = sf.read(source_path, dtype="float32", start=start, stop=stop)
-             other_tmp, _ = sf.read(other_path, dtype="float32")
-         else:
-             self.common_length = all_source_length
-             start, stop = self._get_segment_start_stop(self.common_length, other_length)
-             source_tmp, _ = sf.read(source_path, dtype="float32")
-             other_tmp, _ = sf.read(other_path, dtype="float32", start=start, stop=stop)
-
-         # pick a noise clip at least as long as the common length
-         noise_counter = 0
-         while True:
-             random_idx = np.random.randint(0, self.total_noise)
-             noise_id, noise_path, all_noise_length = self.noise_data[random_idx]
-             all_noise_length = int(all_noise_length)
-             if all_noise_length >= self.common_length:
-                 break
-             noise_counter += 1
-             if noise_counter >= self.total_noise:
-                 raise ValueError("No noise clip is long enough to use")
-
-         # pick an enrollment utterance from the same speaker
-         enroll_counter = 0
-         while True:
-             random_idx = np.random.randint(0, self.total_enroll)
-             if self.enroll_data[random_idx][1] == source_spk:
-                 enroll_id, enroll_spk, enroll_path, all_enroll_length = self.enroll_data[random_idx]
-                 all_enroll_length = int(all_enroll_length)
-                 break
-             enroll_counter += 1
-             if enroll_counter >= self.total_enroll:
-                 raise ValueError("No utterance can serve as the enrollment")
-
-         # randomly pick one channel from each multi-channel recording
-         source = source_tmp[:, np.random.randint(0, source_tmp.shape[1])]
-         other = other_tmp[:, np.random.randint(0, other_tmp.shape[1])]
-
-         noise_start, noise_stop = self._get_segment_start_stop(self.common_length, all_noise_length)
-         noise, _ = sf.read(noise_path, dtype="float32", start=noise_start, stop=noise_stop)  # single channel?
-         # noise = noise_tmp[:, np.random.randint(0, noise_tmp.shape[1])]
-         # other_noise = self._mix([other, noise])
-         desired_snr = np.random.uniform(-4, 4)  # target SNR in dB
-         current_snr = 10 * np.log10(np.mean(source ** 2) / (np.mean(noise ** 2) + EPS) + EPS)
-         # scale the noise amplitude so the source/noise SNR becomes desired_snr
-         scale_factor = 10 ** ((current_snr - desired_snr) / 20)
-         scaled_noise = noise * scale_factor
-
-         # sanity check (unused): SNR after scaling should equal desired_snr
-         snr = 10 * np.log10(np.mean(source ** 2) / (np.mean(scaled_noise ** 2) + EPS) + EPS)
-         mixture = self._mix([source, other, scaled_noise])
-
-         mixture = mixture.astype(np.float32)
-
-         enroll_tmp, _ = sf.read(enroll_path, dtype="float32")
-         enroll = enroll_tmp[:, np.random.randint(0, enroll_tmp.shape[1])]
-
-         return {
-             "mix": mixture,
-             "ref": source,
-             "aux": enroll,
-             "aux_len": all_enroll_length,
-             "spk_idx": spk_idx
-         }
-
-
- class ChunkSplitter(object):
-     """
-     Split utterance into small chunks
-     """
-     def __init__(self, chunk_size, train=True, least=16000):
-         self.chunk_size = chunk_size
-         self.least = least
-         self.train = train
-
-     def _make_chunk(self, eg, s):
-         """
-         Make a chunk instance, which contains:
-             "mix": ndarray,
-             "ref": [ndarray...]
-         """
-         chunk = dict()
-         chunk["mix"] = eg["mix"][s:s + self.chunk_size]
-         chunk["ref"] = eg["ref"][s:s + self.chunk_size]
-         chunk["aux"] = eg["aux"]
-         chunk["aux_len"] = eg["aux_len"]
-         chunk["valid_len"] = int(self.chunk_size)
-         chunk["spk_idx"] = eg["spk_idx"]
-         return chunk
-
-     def split(self, eg):
-         N = eg["mix"].size
-         # too short: throw it away
-         if N < self.least:
-             return []
-         chunks = []
-         # pad with zeros
-         if N < self.chunk_size:
-             P = self.chunk_size - N
-             chunk = dict()
-             chunk["mix"] = np.pad(eg["mix"], (0, P), "constant")
-             chunk["ref"] = np.pad(eg["ref"], (0, P), "constant")
-             chunk["aux"] = eg["aux"]
-             chunk["aux_len"] = eg["aux_len"]
-             chunk["valid_len"] = int(N)
-             chunk["spk_idx"] = eg["spk_idx"]
-             chunks.append(chunk)
-         # else:
-         #     # randomly select the start point for training
-         #     s = random.randint(0, N % self.least) if self.train else 0
-         #     while True:
-         #         if s + self.chunk_size > N:
-         #             break
-         #         chunk = self._make_chunk(eg, s)
-         #         chunks.append(chunk)
-         #         s += self.least
-         #     return chunks
-         else:
-             if self.train:
-                 # randomly select a single start point for training
-                 s = random.randint(0, N - self.chunk_size)
-                 chunk = self._make_chunk(eg, s)
-                 chunks.append(chunk)
-             else:
-                 s = 0
-                 while True:
-                     if s + self.chunk_size > N:
-                         break
-                     chunk = self._make_chunk(eg, s)
-                     chunks.append(chunk)
-                     s += self.least
-         return chunks
-
-
- class DataLoader(object):
-     """
-     Online dataloader at chunk level
-     """
-     def __init__(self,
-                  dataset,
-                  num_workers=4,
-                  chunk_size=32000,
-                  batch_size=16,
-                  train=True):
-         self.batch_size = batch_size
-         self.train = train
-         self.splitter = ChunkSplitter(chunk_size,
-                                       train=train,
-                                       least=chunk_size // 2)
-         # just return a batch of egs, supports multiple workers
-         self.eg_loader = dat.DataLoader(dataset,
-                                         batch_size=batch_size // 2,
-                                         num_workers=num_workers,
-                                         shuffle=train,
-                                         collate_fn=self._collate)
-
-     def _collate(self, batch):
-         """
-         Split utterances online
-         """
-         chunk = []
-         for eg in batch:
-             chunk += self.splitter.split(eg)
-         return chunk
-
-     def _pad_aux(self, chunk_list):
-         lens_list = []
-         for chunk_item in chunk_list:
-             lens_list.append(chunk_item['aux_len'])
-         max_len = np.max(lens_list)
-         # pad with zeros
-         for idx in range(len(chunk_list)):
-             P = max_len - len(chunk_list[idx]["aux"])
-             chunk_list[idx]["aux"] = np.pad(chunk_list[idx]["aux"], (0, P), "constant")
-         # # circular padding
-         # for idx in range(len(chunk_list)):
-         #     P = max_len - len(chunk_list[idx]["aux"])
-         #     original_aux_len = len(chunk_list[idx]["aux"])
-         #     # pad by cycling through the original sentence's content
-         #     for i in range(P):
-         #         chunk_list[idx]["aux"].append(chunk_list[idx]["aux"][i % original_aux_len])
-         return chunk_list
-
-     def _merge(self, chunk_list):
-         """
-         Merge chunk list into mini-batches
-         """
-         N = len(chunk_list)
-         if self.train:
-             random.shuffle(chunk_list)
-         blist = []
-         for s in range(0, N - self.batch_size + 1, self.batch_size):
-             # padding aux info
-             # self._pad_aux(chunk_list[s:s + self.batch_size])
-             batch = default_collate(self._pad_aux(chunk_list[s:s + self.batch_size]))
-             blist.append(batch)
-         rn = N % self.batch_size
-         return blist, chunk_list[-rn:] if rn else []
-
-     def __iter__(self):
-         chunk_list = []
-         for chunks in self.eg_loader:
-             chunk_list += chunks
-             batch, chunk_list = self._merge(chunk_list)
-             for obj in batch:
-                 yield obj
-
-
- # def snr_xy(x, y):
- #     return 10 * np.log10(np.mean(x ** 2) / (np.mean(y ** 2) + EPS) + EPS)
-
- # def main(args):
- #     wham_noise_dir = args.wham_dir
- #     # Get train dir
- #     subdir = os.path.join(wham_noise_dir, 'tr')
- #     # List files in that dir
- #     sound_paths = glob.glob(os.path.join(subdir, '**/*.wav'),
- #                             recursive=True)
- #     # Avoid running this script if it has already been run
- #     if len(sound_paths) == 60000:
- #         print("It appears that augmented files have already been generated.\n"
- #               "Skipping data augmentation.")
- #         return
- #     elif len(sound_paths) != 20000:
- #         print("It appears that augmented files have not been generated properly.\n"
- #               "Resuming augmentation.")
- #         originals = [x for x in sound_paths if 'sp' not in x]
- #         to_be_removed_08 = [x.replace('sp08', '') for x in sound_paths if 'sp08' in x]
- #         to_be_removed_12 = [x.replace('sp12', '') for x in sound_paths if 'sp12' in x]
- #         sound_paths_08 = list(set(originals) - set(to_be_removed_08))
- #         sound_paths_12 = list(set(originals) - set(to_be_removed_12))
- #         augment_noise(sound_paths_08, 0.8)
- #         augment_noise(sound_paths_12, 1.2)
- #     else:
- #         print(f'Augmenting {subdir} files')
- #         # Transform audio speed
- #         augment_noise(sound_paths, 0.8)
- #         augment_noise(sound_paths, 1.2)
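
The noise-scaling step in `__getitem__` uses a standard identity: multiplying the noise amplitude by 10 ** ((current_snr - desired_snr) / 20) shifts the source-to-noise ratio by exactly (desired_snr - current_snr) dB, landing on the target. A self-contained check with random stand-in signals:

```python
import numpy as np

EPS = 1e-10
rng = np.random.default_rng(0)
source = rng.standard_normal(16000)
noise = 0.1 * rng.standard_normal(16000)


def snr_db(x, n):
    return 10 * np.log10(np.mean(x ** 2) / (np.mean(n ** 2) + EPS) + EPS)


desired_snr = -2.0
current_snr = snr_db(source, noise)
scaled_noise = noise * 10 ** ((current_snr - desired_snr) / 20)
print(round(snr_db(source, scaled_noise), 3))  # -2.0
```
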
utils/load_obj.py DELETED
@@ -1,18 +0,0 @@
- #!/usr/bin/env python
-
- import torch as th
-
-
- def load_obj(obj, device):
-     """
-     Move every tensor inside obj to the given device
-     """
-     def cuda(obj):
-         return obj.to(device) if isinstance(obj, th.Tensor) else obj
-
-     if isinstance(obj, dict):
-         return {key: load_obj(obj[key], device) for key in obj}
-     elif isinstance(obj, list):
-         return [load_obj(val, device) for val in obj]
-     else:
-         return cuda(obj)
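
load_obj recurses through dicts and lists and moves only the torch.Tensor leaves to the target device, leaving other values (ints, lists of ints) untouched. Hypothetical usage with a batch shaped like the ones the dataloaders above produce (assumes the deleted module is importable):

```python
import torch as th

batch = {"mix": th.zeros(2, 32000), "aux_len": [16000, 16000], "spk_idx": 3}
device = th.device("cuda" if th.cuda.is_available() else "cpu")
batch = load_obj(batch, device)  # from the deleted utils.load_obj
print(batch["mix"].device, batch["spk_idx"])  # e.g. cuda:0 3
```
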
utils/logger.py DELETED
@@ -1,22 +0,0 @@
- #!/usr/bin/env python
-
- import logging
-
-
- def get_logger(
-         name,
-         format_str="%(asctime)s [%(pathname)s:%(lineno)s - %(levelname)s ] %(message)s",
-         date_format="%Y-%m-%d %H:%M:%S",
-         file=False):
-     """
-     Get a Python logger instance; name doubles as the log-file path when file=True
-     """
-     logger = logging.getLogger(name)
-     logger.setLevel(logging.INFO)
-     # file or console
-     handler = logging.StreamHandler() if not file else logging.FileHandler(name)
-     handler.setLevel(logging.INFO)
-     formatter = logging.Formatter(fmt=format_str, datefmt=date_format)
-     handler.setFormatter(formatter)
-     logger.addHandler(handler)
-     return logger
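
Hypothetical usage of get_logger (assumes the deleted utils.logger is importable); note that with file=True the logger name doubles as the log-file path:

```python
console = get_logger(__name__)                # logs to the console
console.info("training started")

filelog = get_logger("train.log", file=True)  # logs to the file train.log
filelog.info("epoch 1 done")
```
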
utils/sisdr.py DELETED
@@ -1,23 +0,0 @@
- #!/usr/bin/env python
-
- import numpy as np
-
-
- def sisdr(x, s, remove_dc=True):
-     """
-     Compute SI-SDR
-     x: extracted signal
-     s: reference signal (ground truth)
-     """
-     def vec_l2norm(x):
-         return np.linalg.norm(x, 2)
-
-     if remove_dc:
-         x_zm = x - np.mean(x)
-         s_zm = s - np.mean(s)
-         t = np.inner(x_zm, s_zm) * s_zm / vec_l2norm(s_zm) ** 2
-         n = x_zm - t
-     else:
-         t = np.inner(x, s) * s / vec_l2norm(s) ** 2
-         n = x - t
-     return 20 * np.log10(vec_l2norm(t) / vec_l2norm(n))
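
A sanity check of sisdr: for x = s + e with independent noise e, the projection t approximately recovers s and the residual n approximately recovers e, so the score tracks the signal-to-noise power ratio (assumes the deleted utils.sisdr is importable):

```python
import numpy as np

rng = np.random.default_rng(0)
s = rng.standard_normal(16000)
print(round(sisdr(s + 0.10 * rng.standard_normal(16000), s), 1))  # ~ 20.0 dB
print(round(sisdr(s + 0.01 * rng.standard_normal(16000), s), 1))  # ~ 40.0 dB
```
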
utils/timer.py DELETED
@@ -1,17 +0,0 @@
- #!/usr/bin/env python
-
- import time
-
-
- class Timer(object):
-     """
-     A timer to record the elapsed time
-     """
-     def __init__(self):
-         self.reset()
-
-     def reset(self):
-         self.start = time.time()
-
-     def elapsed(self):
-         # elapsed time in minutes
-         return (time.time() - self.start) / 60
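
Hypothetical usage of Timer (assumes the deleted utils.timer is importable); elapsed() reports minutes, not seconds:

```python
timer = Timer()
# ... run an epoch ...
print(f"epoch took {timer.elapsed():.2f} min")
```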