swc2 committed on
Commit
ab3af29
·
1 Parent(s): 8c575ce
Files changed (2) hide show
  1. datahandler.py +22 -65
  2. decode.py +4 -22
datahandler.py CHANGED
@@ -30,9 +30,7 @@ class AudioMixer(object):
30
  mean_loudness=-24,
31
  var_loudness=20
32
  ):
33
- """
34
- 初始化一些参数、随机种子和响度计算工具等。
35
- """
36
  self.sample_rate = sample_rate
37
  self.mean_snr = mean_snr
38
  self.var_snr = var_snr
@@ -42,41 +40,37 @@ class AudioMixer(object):
42
  self.EPS = 1e-10
43
  self.MAX_AMP = 0.9
44
 
45
- # pyloudnorm 的 Meter,用于计算音频响度
46
  self.meter = pyloudnorm.Meter(self.sample_rate)
47
 
48
- # # 也可固定随机种子,保证每次混合一致(如果想要可复现)
49
  # self.seed = 1453
50
  # random.seed(self.seed)
51
  # np.random.seed(self.seed)
52
 
53
  def read_wav(self, wav_path):
54
- """
55
- 读取音频文件并返回 wave 数据和采样率
56
- """
57
  data, sr = sf.read(wav_path, dtype='float32')
58
- # 如果读到的是多通道,可只取其中一个通道
59
  if data.ndim > 1:
60
  data = data[:, 0]
61
  return data, sr
62
 
63
  def normalize(self, signal, is_noise=False):
64
- """
65
- 对输入的 signal 做响度归一化,并确保不会过载失真。
66
- """
67
  c_loudness = self.meter.integrated_loudness(signal)
68
  if is_noise:
69
- # 噪声的目标响度可以偏高一些或随便设置
70
  target_loudness = np.random.normal(self.MEAN_LOUNDNESS + 4, self.VAR_LOUNDNESS**0.5)
71
  else:
72
- # mix 或者语音的目标响度
73
  target_loudness = np.random.normal(self.MEAN_LOUNDNESS, self.VAR_LOUNDNESS**0.5)
74
 
75
  with warnings.catch_warnings():
76
  warnings.filterwarnings("error", category=RuntimeWarning)
77
  signal = pyloudnorm.normalize.loudness(signal, c_loudness, target_loudness)
78
 
79
- # # 再检查是否会 clipping
80
  # peak = np.max(np.abs(signal))
81
  # if peak >= 1.0:
82
  # signal = signal * self.MAX_AMP / peak
@@ -84,14 +78,11 @@ class AudioMixer(object):
84
  return signal
85
 
86
  def snr_norm(self, signal, noise, is_noise=True):
87
- """
88
- 根据预设的 mean_snr、var_snr 来随机决定一个目标 SNR,然后
89
- 以此对 noise 做缩放,得到与 signal 相匹配的噪声幅度。
90
- """
91
  if is_noise:
92
  desired_snr = np.random.normal(self.mean_snr, self.var_snr**0.5)
93
  else:
94
- # 如果你还有别的需求,比如想做正 SNR 范围,可以改这里
95
  desired_snr = np.random.uniform(2, 10)
96
 
97
  current_snr = 10 * np.log10(
@@ -101,7 +92,6 @@ class AudioMixer(object):
101
 
102
  scaled_noise = noise * scale_factor
103
 
104
- # # 防止噪声自身 clipping
105
  # peak = np.max(np.abs(scaled_noise))
106
  # if peak >= 1.0:
107
  # scaled_noise = scaled_noise * self.MAX_AMP / peak
@@ -109,16 +99,14 @@ class AudioMixer(object):
109
  return scaled_noise
110
 
111
  def _mix(self, sources_list):
112
- """
113
- 将多路音频进行叠加,防止溢出。
114
- """
115
- # 假设 sources_list[0] 是 mix 音频,sources_list[1] 是已拼好长度的 noise
116
  mix_length = len(sources_list[0])
117
  mixture = np.zeros(mix_length, dtype=np.float32)
118
  for s in sources_list:
119
  mixture += s[:mix_length] # 仅叠加到 mix 的长度
120
 
121
- # 再做一次峰值校正,避免溢出
122
  peak = np.max(np.abs(mixture))
123
  if peak >= 1.0:
124
  mixture = mixture * self.MAX_AMP / peak
@@ -126,30 +114,16 @@ class AudioMixer(object):
126
  return mixture
127
 
128
  def _prepare_noise_for_mix(self, noise_files, mix_length):
129
- """
130
- 传入一组 noise 文件路径,先对它们打乱,再依次读取、拼接。
131
- 如果总长度还不够覆盖 mix_length,可以再次拼接自己(循环)。
132
-
133
- - noise_files: 存储多个噪声文件路径的列表
134
- - mix_length: 需要的总长度(采样点数)
135
-
136
- 返回: 拼接后的 noise 波形
137
- """
138
- # 先随机打乱
139
  random.shuffle(noise_files)
140
 
141
- # 依次读取并拼接
142
  noise_all = []
143
  total_len = 0
144
 
145
- # 第一次先拼完所有 noise 文件,如果还不够,就重复拼接
146
  while total_len < mix_length:
147
  for nf in noise_files:
148
  noise_data, _ = self.read_wav(nf)
149
 
150
- # 可选:对每条 noise 做一次 normalize,提升多样性
151
- # (或者只在外部做一次统一的 normalize)
152
- #noise_data = self.normalize(noise_data, is_noise=True)
153
 
154
  noise_all.append(noise_data)
155
  total_len += len(noise_data)
@@ -157,24 +131,12 @@ class AudioMixer(object):
157
  if total_len >= mix_length:
158
  break
159
 
160
- # 如果已经拼完一轮,可能还不够,就继续 while 循环再拼一轮
161
-
162
- # 拼接后截断到 mix_length
163
  concatenated_noise = np.concatenate(noise_all)[:mix_length]
164
  return concatenated_noise
165
 
166
  def mix_with_noise_folder(self, mix_wave,sr_mix,noise_folder):
167
- """
168
- 读取一条 mix 文件和一个 noise 文件夹,做如下处理:
169
- 1. 读取 mix wave,并做响度归一化
170
- 2. 根据 mix 的长度,在 noise 文件夹中随机打乱全部 wav,依次拼接满足同长度
171
- 3. 对最终拼好的 noise 做 snr_norm
172
- 4. 叠加输出
173
- """
174
- # 1. 读取 mix
175
- # mix_wave, sr_mix = self.read_wav(mix_path)
176
-
177
- # 如果文件夹下找不到任何 noise 文件,就直接返回原音频
178
  noise_files = sorted(glob.glob(os.path.join(noise_folder, "*.wav")))
179
  if not noise_files:
180
  raise RuntimeError(f"噪声文件夹 {noise_folder} 内未发现 .wav 文件")
@@ -182,35 +144,30 @@ class AudioMixer(object):
182
  mix_wave = self.normalize(mix_wave, is_noise=False)
183
  mix_length = len(mix_wave)
184
 
185
- # 2. 先把 noise 文件拼接到 match mix_length
186
- # (会将 noise_files 打乱后依次读、拼接)
187
  noise_ready = self._prepare_noise_for_mix(noise_files, mix_length)
188
 
189
- # 3. SNR 调整
190
  noise_ready = self.snr_norm(mix_wave, noise_ready, is_noise=True)
191
 
192
- # 4. 叠加
193
  mixture = self._mix([mix_wave, noise_ready])
194
 
195
- out_noisy = "temp_noisy.wav" # 可以理解为把输入的混合音频直接另存为
196
 
197
- # 返回混合后的音频以及采样率
198
  sf.write(out_noisy, mixture, sr_mix)
199
 
200
  return out_noisy
201
 
202
 
203
  if __name__ == "__main__":
204
- # 假设你有一个 mix.wav 以及一个 noise 文件夹(含若干个 .wav 噪声文件)
205
  mix_path_test = "test_mix.wav"
206
  mix_wave, sr_mix = self.read_wav(mix_path_test)
207
- noise_folder_test = "noises/" # 比如里面有 10 条 noise*.wav
208
 
209
  mixer = AudioMixer()
210
 
211
- # 执行混合
212
  mixed_wav, sr = mixer.mix_with_noise_folder(mix_wave, sr_mix, noise_folder_test)
213
 
214
- # 这里你可以选择把结果写回本地文件,或直接返回 numpy 数组做后续处理
215
  sf.write("test_output_mixture.wav", mixed_wav, sr)
216
  print("混合完成,已输出到 test_output_mixture.wav")
 
30
  mean_loudness=-24,
31
  var_loudness=20
32
  ):
33
+
 
 
34
  self.sample_rate = sample_rate
35
  self.mean_snr = mean_snr
36
  self.var_snr = var_snr
 
40
  self.EPS = 1e-10
41
  self.MAX_AMP = 0.9
42
 
43
+
44
  self.meter = pyloudnorm.Meter(self.sample_rate)
45
 
46
+
47
  # self.seed = 1453
48
  # random.seed(self.seed)
49
  # np.random.seed(self.seed)
50
 
51
  def read_wav(self, wav_path):
52
+
 
 
53
  data, sr = sf.read(wav_path, dtype='float32')
54
+
55
  if data.ndim > 1:
56
  data = data[:, 0]
57
  return data, sr
58
 
59
  def normalize(self, signal, is_noise=False):
60
+
 
 
61
  c_loudness = self.meter.integrated_loudness(signal)
62
  if is_noise:
63
+
64
  target_loudness = np.random.normal(self.MEAN_LOUNDNESS + 4, self.VAR_LOUNDNESS**0.5)
65
  else:
66
+
67
  target_loudness = np.random.normal(self.MEAN_LOUNDNESS, self.VAR_LOUNDNESS**0.5)
68
 
69
  with warnings.catch_warnings():
70
  warnings.filterwarnings("error", category=RuntimeWarning)
71
  signal = pyloudnorm.normalize.loudness(signal, c_loudness, target_loudness)
72
 
73
+
74
  # peak = np.max(np.abs(signal))
75
  # if peak >= 1.0:
76
  # signal = signal * self.MAX_AMP / peak
 
78
  return signal
79
 
80
  def snr_norm(self, signal, noise, is_noise=True):
81
+
 
 
 
82
  if is_noise:
83
  desired_snr = np.random.normal(self.mean_snr, self.var_snr**0.5)
84
  else:
85
+
86
  desired_snr = np.random.uniform(2, 10)
87
 
88
  current_snr = 10 * np.log10(
 
92
 
93
  scaled_noise = noise * scale_factor
94
 
 
95
  # peak = np.max(np.abs(scaled_noise))
96
  # if peak >= 1.0:
97
  # scaled_noise = scaled_noise * self.MAX_AMP / peak
 
99
  return scaled_noise
100
 
101
  def _mix(self, sources_list):
102
+
103
+
 
 
104
  mix_length = len(sources_list[0])
105
  mixture = np.zeros(mix_length, dtype=np.float32)
106
  for s in sources_list:
107
  mixture += s[:mix_length] # 仅叠加到 mix 的长度
108
 
109
+
110
  peak = np.max(np.abs(mixture))
111
  if peak >= 1.0:
112
  mixture = mixture * self.MAX_AMP / peak
 
114
  return mixture
115
 
116
  def _prepare_noise_for_mix(self, noise_files, mix_length):
117
+
 
 
 
 
 
 
 
 
 
118
  random.shuffle(noise_files)
119
 
 
120
  noise_all = []
121
  total_len = 0
122
 
 
123
  while total_len < mix_length:
124
  for nf in noise_files:
125
  noise_data, _ = self.read_wav(nf)
126
 
 
 
 
127
 
128
  noise_all.append(noise_data)
129
  total_len += len(noise_data)
 
131
  if total_len >= mix_length:
132
  break
133
 
 
 
 
134
  concatenated_noise = np.concatenate(noise_all)[:mix_length]
135
  return concatenated_noise
136
 
137
  def mix_with_noise_folder(self, mix_wave,sr_mix,noise_folder):
138
+
139
+
 
 
 
 
 
 
 
 
 
140
  noise_files = sorted(glob.glob(os.path.join(noise_folder, "*.wav")))
141
  if not noise_files:
142
  raise RuntimeError(f"噪声文件夹 {noise_folder} 内未发现 .wav 文件")
 
144
  mix_wave = self.normalize(mix_wave, is_noise=False)
145
  mix_length = len(mix_wave)
146
 
147
+
 
148
  noise_ready = self._prepare_noise_for_mix(noise_files, mix_length)
149
 
 
150
  noise_ready = self.snr_norm(mix_wave, noise_ready, is_noise=True)
151
 
 
152
  mixture = self._mix([mix_wave, noise_ready])
153
 
154
+ out_noisy = "temp_noisy.wav"
155
 
 
156
  sf.write(out_noisy, mixture, sr_mix)
157
 
158
  return out_noisy
159
 
160
 
161
  if __name__ == "__main__":
162
+
163
  mix_path_test = "test_mix.wav"
164
  mix_wave, sr_mix = self.read_wav(mix_path_test)
165
+ noise_folder_test = "noises/"
166
 
167
  mixer = AudioMixer()
168
 
169
+
170
  mixed_wav, sr = mixer.mix_with_noise_folder(mix_wave, sr_mix, noise_folder_test)
171
 
 
172
  sf.write("test_output_mixture.wav", mixed_wav, sr)
173
  print("混合完成,已输出到 test_output_mixture.wav")
decode.py CHANGED
@@ -10,7 +10,6 @@ from omegaconf import OmegaConf
10
 
11
 
12
 
13
- # ================ 网络推理类 ================
14
  class NnetComputer(object):
15
  def __init__(self, cpt_dir, gpuid, nnet_conf):
16
  self.device = th.device(f"cuda:{gpuid}") if gpuid >= 0 else th.device("cpu")
@@ -37,41 +36,24 @@ class NnetComputer(object):
37
  return sp_samps
38
 
39
  class InferencePipeline:
40
- """
41
- 外部只需传入 config,即可完成:
42
- 1) 模型实例化 (含 hydra.instantiate 逻辑)
43
- 2) 加载 checkpoint
44
- 3) 推理
45
- """
46
  def __init__(self, config):
47
- """
48
- 在构造时就把所有初始化做好,包括:
49
- - hydra.instantiate(config.model) -> 得到一个 nn.Module
50
- - 用 NnetComputer(...) 封装
51
- """
52
- # 如果 config.model 里含有 _target_ 字段,可以用 hydra.instantiate
53
- # 注意: hydra.instantiate 需要在这里显式地导入 hydra.utils
54
-
55
- # 1. 根据 config.model 构建模型
56
  model_inst = hydra.utils.instantiate(config.model)
57
 
58
  self.computer_ = NnetComputer(config.test.checkpoint,config.test.gpu, model_inst)
59
 
60
  def run_inference(self, input_audio_path: str, enroll_audio_path: str) -> str:
61
- """
62
- 给定混合音频 + enroll 音频,执行推理并返回输出文件路径。
63
- """
64
- # 1. 读取音频
65
  mix_samps, sr = sf.read(input_audio_path)
66
  aux_samps, sr2 = sf.read(enroll_audio_path)
67
 
68
- # 2. 调用底层 compute
69
  samps = self.computer_.compute(mix_samps, aux_samps, len(aux_samps))
70
  norm = np.linalg.norm(mix_samps, np.inf)
71
  samps = samps[:mix_samps.size]
72
  samps = samps * norm / np.max(np.abs(samps))
73
 
74
- # 3. 写到临时文件
75
  out_wav = "temp_extracted.wav"
76
  sf.write(out_wav, samps, sr)
77
  return out_wav
 
10
 
11
 
12
 
 
13
  class NnetComputer(object):
14
  def __init__(self, cpt_dir, gpuid, nnet_conf):
15
  self.device = th.device(f"cuda:{gpuid}") if gpuid >= 0 else th.device("cpu")
 
36
  return sp_samps
37
 
38
  class InferencePipeline:
39
+
 
 
 
 
 
40
  def __init__(self, config):
41
+
 
 
 
 
 
 
 
 
42
  model_inst = hydra.utils.instantiate(config.model)
43
 
44
  self.computer_ = NnetComputer(config.test.checkpoint,config.test.gpu, model_inst)
45
 
46
  def run_inference(self, input_audio_path: str, enroll_audio_path: str) -> str:
47
+
 
 
 
48
  mix_samps, sr = sf.read(input_audio_path)
49
  aux_samps, sr2 = sf.read(enroll_audio_path)
50
 
 
51
  samps = self.computer_.compute(mix_samps, aux_samps, len(aux_samps))
52
  norm = np.linalg.norm(mix_samps, np.inf)
53
  samps = samps[:mix_samps.size]
54
  samps = samps * norm / np.max(np.abs(samps))
55
 
56
+
57
  out_wav = "temp_extracted.wav"
58
  sf.write(out_wav, samps, sr)
59
  return out_wav