Spaces:

chenxie95
/

xlance-msr

Running

App Files Files Community

ModistAndrew committed on Nov 14, 2025

Commit

7673750

1 Parent(s): b1462ff

difficult aug and configs

Browse files

Files changed (5) hide show

configs/bsrestore/vox_hard.yaml +142 -0
configs/bsrestore/vox_hard_gan.yaml +165 -0
configs/bsrestore/vox_mix.yaml +9 -2
configs/bsrestore/vox_mix_gan.yaml +168 -0
data/augment.py +83 -89

configs/bsrestore/vox_hard.yaml ADDED Viewed

	@@ -0,0 +1,142 @@

+project_name: "bsrestore"
+exp_name: "vox_hard_large"
+model:
+  name: "BSRoFormer"
+  params:
+    dim: 256
+    depth: 12
+    stereo: true
+    num_stems: 1
+    time_transformer_depth: 1
+    freq_transformer_depth: 1
+    linear_transformer_depth: 0
+    freqs_per_bands: !!python/tuple
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 128
+      - 129
+    dim_head: 64
+    heads: 8
+    attn_dropout: 0.1
+    ff_dropout: 0.1
+    flash_attn: true
+    dim_freqs_in: 1025
+    stft_n_fft: 2048
+    stft_hop_length: 512
+    stft_win_length: 2048
+    stft_normalized: false
+    mask_estimator_depth: 2
+    multi_stft_resolution_loss_weight: 1.0
+    multi_stft_resolutions_window_sizes: !!python/tuple
+    - 4096
+    - 2048
+    - 1024
+    - 512
+    - 256
+    multi_stft_hop_size: 147
+    multi_stft_normalized: False
+    mlp_expansion_factor: 4
+    use_torch_checkpoint: False # Greatly reduces GPU memory consumption during training (not fully tested)
+    skip_connection: False # Enables skip connections between transformer blocks; can help with gradient problems and may speed up training
+data:
+  sample_rate: 48000
+  clip_duration: 10.0
+  train_dataset:
+    target_stem: "Voc"
+    root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems"
+    apply_augmentation: True
+    snr_range: [0.0, 10.0]
+  train_dataset1:
+    target_stem: "vox"
+    root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/moisesdb_raw"
+    apply_augmentation: True
+    snr_range: [0.0, 10.0]
+    moisesdb: True
+  val_dataset:
+    target_stem: "Voc"
+    root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems_valid"
+    apply_augmentation: True
+    snr_range: [0.0, 10.0]
+  dataloader_params:
+    batch_size: 4
+    num_workers: 8
+optimizer_g:
+  lr: 0.0005
+  betas: [0.8, 0.99]
+scheduler:
+  warm_up_steps: 10000
+trainer:
+  max_steps: 1000000
+  log_every_n_steps: 100
+  checkpoint_save_interval: 10000
+  limit_train_batches: 2000
+  devices: [0]
+  precision: 16-mixed
+  save_dir: logs/
+checkpoint:
+  path: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/jinxuanzhu/MSRKit/checkpoints/BS-Rofo-SW-Fixed.ckpt"
+  type: "roformer"

configs/bsrestore/vox_hard_gan.yaml ADDED Viewed

	@@ -0,0 +1,165 @@

+project_name: "bsrestore"
+exp_name: "vox_hard_large_gan"
+model:
+  name: "BSRoFormer"
+  params:
+    dim: 256
+    depth: 12
+    stereo: true
+    num_stems: 1
+    time_transformer_depth: 1
+    freq_transformer_depth: 1
+    linear_transformer_depth: 0
+    freqs_per_bands: !!python/tuple
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 128
+      - 129
+    dim_head: 64
+    heads: 8
+    attn_dropout: 0.1
+    ff_dropout: 0.1
+    flash_attn: true
+    dim_freqs_in: 1025
+    stft_n_fft: 2048
+    stft_hop_length: 512
+    stft_win_length: 2048
+    stft_normalized: false
+    mask_estimator_depth: 2
+    multi_stft_resolution_loss_weight: 1.0
+    multi_stft_resolutions_window_sizes: !!python/tuple
+    - 4096
+    - 2048
+    - 1024
+    - 512
+    - 256
+    multi_stft_hop_size: 147
+    multi_stft_normalized: False
+    mlp_expansion_factor: 4
+    use_torch_checkpoint: False # Greatly reduces GPU memory consumption during training (not fully tested)
+    skip_connection: False # Enables skip connections between transformer blocks; can help with gradient problems and may speed up training
+discriminators:
+  - name: "MultiFrequencyDiscriminator"
+    params:
+      nch: 1
+      window_sizes: [2048, 1024, 512]
+      sample_rate: 48000
+      norm: True
+data:
+  sample_rate: 48000
+  clip_duration: 10.0
+  train_dataset:
+    target_stem: "Voc"
+    root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems"
+    apply_augmentation: True
+    snr_range: [0.0, 10.0]
+  train_dataset1:
+    target_stem: "vox"
+    root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/moisesdb_raw"
+    apply_augmentation: True
+    snr_range: [0.0, 10.0]
+    moisesdb: True
+  val_dataset:
+    target_stem: "Voc"
+    root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems_valid"
+    apply_augmentation: True
+    snr_range: [0.0, 10.0]
+  dataloader_params:
+    batch_size: 4
+    num_workers: 8
+optimizer_g:
+  lr: 0.0002
+  betas: [0.8, 0.99]
+optimizer_d:
+  lr: 0.0002
+  betas: [0.8, 0.99]
+scheduler:
+  warm_up_steps: 10000
+losses:
+  gan_type: 'lsgan'
+  lambda_recon: 100.0
+  lambda_feat: 2.0
+  lambda_gan: 1.0
+  reconstruction_loss:
+    sample_rate: 48000
+    n_fft: [1024, 2048, 512]
+    hop_length: [256, 512, 128]
+    n_mels: [80, 160, 40]
+trainer:
+  max_steps: 1000000
+  log_every_n_steps: 100
+  checkpoint_save_interval: 10000
+  limit_train_batches: 2000
+  devices: [0]
+  precision: 16-mixed
+  save_dir: logs/
+checkpoint:
+  path: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/jinxuanzhu/MSRKit/checkpoints/BS-Rofo-SW-Fixed.ckpt"
+  type: "roformer"

configs/bsrestore/vox_mix.yaml CHANGED Viewed

@@ -1,5 +1,5 @@
 project_name: "bsrestore"
-exp_name: "vox_mix"
 model:
   name: "BSRoFormer"
@@ -100,13 +100,20 @@ model:
 data:
   sample_rate: 48000
-  clip_duration: 3.0
   train_dataset:
     target_stem: "Voc"
     root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems"
     apply_augmentation: True
     snr_range: [0.0, 10.0]
     output_mixture: True
   val_dataset:
     target_stem: "Voc"
     root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems_valid"

 project_name: "bsrestore"
+exp_name: "vox_mix_large"
 model:
   name: "BSRoFormer"
 data:
   sample_rate: 48000
+  clip_duration: 10.0
   train_dataset:
     target_stem: "Voc"
     root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems"
     apply_augmentation: True
     snr_range: [0.0, 10.0]
     output_mixture: True
+  train_dataset1:
+    target_stem: "vox"
+    root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/moisesdb_raw"
+    apply_augmentation: True
+    snr_range: [0.0, 10.0]
+    output_mixture: True
+    moisesdb: True
   val_dataset:
     target_stem: "Voc"
     root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems_valid"

configs/bsrestore/vox_mix_gan.yaml ADDED Viewed

	@@ -0,0 +1,168 @@

+project_name: "bsrestore"
+exp_name: "vox_mix_large_gan"
+model:
+  name: "BSRoFormer"
+  params:
+    dim: 256
+    depth: 12
+    stereo: true
+    num_stems: 1
+    time_transformer_depth: 1
+    freq_transformer_depth: 1
+    linear_transformer_depth: 0
+    freqs_per_bands: !!python/tuple
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 2
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 4
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 12
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 24
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 48
+      - 128
+      - 129
+    dim_head: 64
+    heads: 8
+    attn_dropout: 0.1
+    ff_dropout: 0.1
+    flash_attn: true
+    dim_freqs_in: 1025
+    stft_n_fft: 2048
+    stft_hop_length: 512
+    stft_win_length: 2048
+    stft_normalized: false
+    mask_estimator_depth: 2
+    multi_stft_resolution_loss_weight: 1.0
+    multi_stft_resolutions_window_sizes: !!python/tuple
+    - 4096
+    - 2048
+    - 1024
+    - 512
+    - 256
+    multi_stft_hop_size: 147
+    multi_stft_normalized: False
+    mlp_expansion_factor: 4
+    use_torch_checkpoint: False # Greatly reduces GPU memory consumption during training (not fully tested)
+    skip_connection: False # Enables skip connections between transformer blocks; can help with gradient problems and may speed up training
+discriminators:
+  - name: "MultiFrequencyDiscriminator"
+    params:
+      nch: 1
+      window_sizes: [2048, 1024, 512]
+      sample_rate: 48000
+      norm: True
+data:
+  sample_rate: 48000
+  clip_duration: 10.0
+  train_dataset:
+    target_stem: "Voc"
+    root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems"
+    apply_augmentation: True
+    snr_range: [0.0, 10.0]
+    output_mixture: True
+  train_dataset1:
+    target_stem: "vox"
+    root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/moisesdb_raw"
+    apply_augmentation: True
+    snr_range: [0.0, 10.0]
+    output_mixture: True
+    moisesdb: True
+  val_dataset:
+    target_stem: "Voc"
+    root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems_valid"
+    apply_augmentation: True
+    snr_range: [0.0, 10.0]
+    output_mixture: True
+  dataloader_params:
+    batch_size: 4
+    num_workers: 8
+optimizer_g:
+  lr: 0.0002
+  betas: [0.8, 0.99]
+optimizer_d:
+  lr: 0.0002
+  betas: [0.8, 0.99]
+scheduler:
+  warm_up_steps: 10000
+losses:
+  gan_type: 'lsgan'
+  lambda_recon: 100.0
+  lambda_feat: 2.0
+  lambda_gan: 1.0
+  reconstruction_loss:
+    sample_rate: 48000
+    n_fft: [1024, 2048, 512]
+    hop_length: [256, 512, 128]
+    n_mels: [80, 160, 40]
+trainer:
+  max_steps: 1000000
+  log_every_n_steps: 100
+  checkpoint_save_interval: 10000
+  limit_train_batches: 2000
+  devices: [0]
+  precision: 16-mixed
+  save_dir: logs/
+checkpoint:
+  path: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/jinxuanzhu/MSRKit/checkpoints/BS-Rofo-SW-Fixed.ckpt"
+  type: "roformer"

data/augment.py CHANGED Viewed

@@ -2,7 +2,7 @@ import numpy as np
 from data.eq_utils import apply_random_eq
 from pedalboard import Pedalboard, Resample, Compressor, Distortion, Reverb, Limiter, MP3Compressor, HighpassFilter, LowpassFilter
 import torch
-from scipy.signal import butter, lfilter
 try:
     import pyroomacoustics as pra
 except Exception as e:
@@ -25,78 +25,38 @@ def calculate_rms(audio: np.ndarray) -> float:
     return np.sqrt(np.mean(audio**2))
 def apply_fm_effect(audio: np.ndarray, sample_rate: int) -> np.ndarray:
-    """
-    应用 FM 电台模拟效果：低通滤波 (带宽限制) + 噪声叠加。
-    """
-    # 1. 随机带宽限制参数 (Cutoff Freq)
-    # 模拟接收不良的信号，截止频率在 8kHz 到 14kHz 之间
     cutoff_freq = np.random.uniform(8000, 14000)
-    order = 5 # 滤波器阶数，越高衰减越陡峭
-    # 2. 噪声参数
-    # 噪声幅度，模拟信号弱时的嘶嘶声
-    noise_level = np.random.uniform(0.0005, 0.005) # 噪声电平，需根据您的数据进行调整
-    # --- 低通滤波 (带宽限制) ---
     def butter_lowpass(cutoff, fs, order=5):
         nyq = 0.5 * fs
         normal_cutoff = cutoff / nyq
         b, a = butter(order, normal_cutoff, btype='low', analog=False)
         return b, a
     b, a = butter_lowpass(cutoff_freq, sample_rate, order=order)
-    # 注意：lfilter 默认只处理一维数组。如果 audio 是多通道 (C, L)，需要逐通道处理。
-    if audio.ndim == 2:
-        # (C, L) 格式
-        filtered_audio = np.array([lfilter(b, a, channel) for channel in audio])
-    else:
-        # (L,) 格式
-        filtered_audio = lfilter(b, a, audio)
-    # --- 噪声叠加 ---
-    # 生成白噪音，并乘以噪声电平
     noise = np.random.normal(0, 1, filtered_audio.shape) * noise_level
-    # 叠加
     fm_audio = filtered_audio + noise
-    # 确保幅度不会溢出，但由于噪声幅度小，通常不会成为问题
-    np.clip(fm_audio, -1.0, 1.0, out=fm_audio)
     return fm_audio
 def apply_random_room_reverb(audio, sr):
-    # audio 为 (C, L)，若是 (L,) 则 reshape
-    if audio.ndim == 1:
-        audio = audio[None, :]  # -> (1, L)
     C, L = audio.shape
-    # 随机房间大小 (更大 → 更多混响尾巴)
     room_dim = np.random.uniform(3, 9, size=3)
-    # 随机选择麦克风&声源位置
     room = pra.ShoeBox(room_dim, fs=sr, max_order=np.random.randint(4, 7), absorption=np.random.uniform(0.2, 0.7))
     mic_loc = np.array([
     np.random.uniform(0.5, room_dim[0]-0.5),
     np.random.uniform(0.5, room_dim[1]-0.5),
-    np.random.uniform(1.0, 2.0),  # 麦克风高度 ~ 人耳高度
     ])
     source_loc = np.array([
     np.random.uniform(0.5, room_dim[0]-0.5),
     np.random.uniform(0.5, room_dim[1]-0.5),
-    np.random.uniform(1.0, 2.0),  # 声源高度不必和人同高，但保持现实
     ])
     room.add_microphone(mic_loc)
-    room.add_source(source_loc, signal=audio.mean(axis=0))  # 用 mean 保持左右一致的空间信息
     room.compute_rir()
     WET_LEVEL = np.random.uniform(0.1, 0.6)
     DRY_LEVEL = np.random.uniform(0.5, 1.0)
     wet_audio = np.vstack([
@@ -104,14 +64,69 @@ def apply_random_room_reverb(audio, sr):
         for ch in range(C)
     ])
     wet_norm = np.max(np.abs(wet_audio)) + 1e-8
-    # 最终输出 = 干声 * Dry 比例 + 归一化湿声 * Wet 比例
     out = (audio * DRY_LEVEL) + (wet_audio * (WET_LEVEL / wet_norm))
     max_out = np.max(np.abs(out)) + 1e-8
     out_normalized = out / max_out
     return out_normalized
 class MasteringEnhancer:
     def __init__(self):
         pass
@@ -119,15 +134,12 @@ class MasteringEnhancer:
     def __call__(self, audio: np.ndarray, sr: int):
         board = Pedalboard()
-        # 1) 高频空气感（温和提升）
         if np.random.rand() < 0.5:
             board.append(LowpassFilter(np.random.uniform(14000, 19000)))
-        # 2) 低频收紧（避免boom）
         if np.random.rand() < 0.5:
             board.append(HighpassFilter(np.random.uniform(20, 60)))
-        # 3) 轻柔总线压缩（Glue）
         if np.random.rand() < 0.7:
             board.append(Compressor(
                 threshold_db=np.random.uniform(-12, -6),
@@ -136,12 +148,9 @@ class MasteringEnhancer:
                 release_ms=np.random.uniform(100, 300)
             ))
-        # 4) Tape 饱和感（质感 & 谐波）
         if np.random.rand() < 0.6:
-            # 使用一个很小的 drive_db (例如 0.5 到 2.0 dB) 来模拟轻微的饱和
             board.append(Distortion(drive_db=np.random.uniform(0.5, 2.0)))
-        # 5) 最后一层安全限制（保护不削顶）
         board.append(Limiter(threshold_db=np.random.uniform(-3, -0.1)))
         return board(audio, sample_rate=sr)
@@ -207,16 +216,16 @@ class MixtureAugmentation:
         self.encodec_model = EncodecModel.encodec_model_48khz()
         self.encodec_model.eval()
         self.encodec_available = True
-        self.encodec_bandwidths = [6.0, 12.0, 24.0]
-        self.p_encodec = 0.2
-        self.p_mp3 = 0.3
-        self.p_fm = 0.2
-        self.p_room = 0.3
-        self.p_limiter = 0.4
-        self.p_resample = 0.3
         self.is_cuda_initialized = False
         self.mastering = MasteringEnhancer()
-        self.p_mastering = 0.3
     def apply(self, audio: np.ndarray, sample_rate: int = 44100) -> np.ndarray:
         if np.max(np.abs(audio)) == 0:
@@ -231,49 +240,34 @@ class MixtureAugmentation:
         audio = audio / normalize_scale
         board = Pedalboard()
-        if np.random.rand() < self.p_limiter:
-            board.append(Limiter(
-                threshold_db=np.random.uniform(-10, 0),
-                release_ms=np.random.uniform(50, 200)
-            ))
         if np.random.rand() < self.p_resample:
             board.append(Resample(target_sample_rate=np.random.randint(16000, 44100)))
         if np.random.rand() < self.p_mastering:
             audio = self.mastering(audio, sample_rate)
-        # Encodec Part
         if np.random.rand() < self.p_encodec:
             device = 'cpu'
-            # device = 'cuda' if torch.cuda.is_available() else 'cpu'
-            if device == 'cuda' and not self.is_cuda_initialized:
-                self.encodec_model = self.encodec_model.to(device)
-                self.is_cuda_initialized = True
             model = self.encodec_model
-            # print(" DEBUG:Using Encodec augmentation")
             target_bw = np.random.choice(self.encodec_bandwidths)
             model.set_target_bandwidth(target_bw)
             wav_tensor = torch.from_numpy(audio).float().to(device)
             wav_processed = convert_audio(wav_tensor, sample_rate, model.sample_rate, model.channels)
             wav_input = wav_processed.unsqueeze(0)
             with torch.no_grad():
-                # 编码 -> 解码 (引入神经失真)
                 reconstructed_tensor = model(wav_input).squeeze(0)
-                # 将结果转回 numpy
                 audio = reconstructed_tensor.cpu().numpy()
-                # 重要：更新 sample_rate 以便后续的 Pedalboard 步骤使用 Encodec 的采样率
                 sample_rate = model.sample_rate
-        # MP3 Part
-        elif np.random.rand() < self.p_mp3:
-            board.append(MP3Compressor(vbr_quality=np.random.uniform(1.0, 9.0)))
-        # FM part
-        elif np.random.rand() < self.p_fm:
-            audio = apply_fm_effect(audio, sample_rate)
-        # Room part
-        elif np.random.rand() < self.p_room:
-            audio = apply_random_room_reverb(audio, sample_rate)
         if len(board) > 0:
             audio = board(audio, sample_rate=sample_rate)

 from data.eq_utils import apply_random_eq
 from pedalboard import Pedalboard, Resample, Compressor, Distortion, Reverb, Limiter, MP3Compressor, HighpassFilter, LowpassFilter
 import torch
+from scipy.signal import butter, lfilter, sosfilt
 try:
     import pyroomacoustics as pra
 except Exception as e:
     return np.sqrt(np.mean(audio**2))
 def apply_fm_effect(audio: np.ndarray, sample_rate: int) -> np.ndarray:
     cutoff_freq = np.random.uniform(8000, 14000)
+    order = 5
+    noise_level = np.random.uniform(0.0005, 0.005)
     def butter_lowpass(cutoff, fs, order=5):
         nyq = 0.5 * fs
         normal_cutoff = cutoff / nyq
         b, a = butter(order, normal_cutoff, btype='low', analog=False)
         return b, a
     b, a = butter_lowpass(cutoff_freq, sample_rate, order=order)
+    filtered_audio = np.array([lfilter(b, a, channel) for channel in audio])
     noise = np.random.normal(0, 1, filtered_audio.shape) * noise_level
     fm_audio = filtered_audio + noise
+    np.clip(fm_audio, -1.0, 1.0, out=fm_audio)
     return fm_audio
 def apply_random_room_reverb(audio, sr):
     C, L = audio.shape
     room_dim = np.random.uniform(3, 9, size=3)
     room = pra.ShoeBox(room_dim, fs=sr, max_order=np.random.randint(4, 7), absorption=np.random.uniform(0.2, 0.7))
     mic_loc = np.array([
     np.random.uniform(0.5, room_dim[0]-0.5),
     np.random.uniform(0.5, room_dim[1]-0.5),
+    np.random.uniform(1.0, 2.0),
     ])
     source_loc = np.array([
     np.random.uniform(0.5, room_dim[0]-0.5),
     np.random.uniform(0.5, room_dim[1]-0.5),
+    np.random.uniform(1.0, 2.0),
     ])
     room.add_microphone(mic_loc)
+    room.add_source(source_loc, signal=audio.mean(axis=0))
     room.compute_rir()
     WET_LEVEL = np.random.uniform(0.1, 0.6)
     DRY_LEVEL = np.random.uniform(0.5, 1.0)
     wet_audio = np.vstack([
         for ch in range(C)
     ])
     wet_norm = np.max(np.abs(wet_audio)) + 1e-8
     out = (audio * DRY_LEVEL) + (wet_audio * (WET_LEVEL / wet_norm))
     max_out = np.max(np.abs(out)) + 1e-8
     out_normalized = out / max_out
     return out_normalized
def apply_live_dt4_simple(audio: np.ndarray, sample_rate: int, snr_db: float = 20.0) -> np.ndarray:
    """Chain three degradations: room reverb, phone band-pass, added noise.

    Args:
        audio: Input samples; passed straight to ``apply_random_room_reverb``.
        sample_rate: Sampling rate in Hz, forwarded to every stage.
        snr_db: Signal-to-noise ratio handed to the noise stage.

    Returns:
        The output of the final (noise) stage.
    """
    reverberated = apply_random_room_reverb(audio, sample_rate)
    band_limited = _apply_phone_filter(reverberated, sample_rate)
    return _add_environmental_noise(band_limited, sample_rate, snr_db)
+def _apply_phone_filter(audio: np.ndarray, sample_rate: int) -> np.ndarray:
+    lowcut = 300.0
+    highcut = 3400.0
+    nyq = 0.5 * sample_rate
+    low = lowcut / nyq
+    high = highcut / nyq
+    sos = butter(4, [low, high], btype='band', output='sos')
+    filtered = np.array([sosfilt(sos, channel) for channel in audio])
+    return filtered
def _add_environmental_noise(audio: np.ndarray, sample_rate: int, snr_db: float) -> np.ndarray:
    """Mix synthetic background noise into ``audio`` at ``snr_db`` dB SNR.

    One noise track is generated and, for multichannel input, repeated on
    every channel. It is scaled so that mean signal power over mean noise
    power equals ``10 ** (snr_db / 10)``. If the mix peaks above 1.0 it is
    divided by its peak.

    Args:
        audio: Samples shaped ``(C, L)``.
        sample_rate: Sampling rate in Hz, forwarded to the noise generator.
        snr_db: Target signal-to-noise ratio in decibels.

    Returns:
        The noisy mix, same shape as ``audio``.
    """
    num_channels, num_samples = audio.shape
    noise = _generate_noise(num_samples, sample_rate)
    if num_channels > 1:
        noise = np.tile(noise, (num_channels, 1))

    signal_power = np.mean(audio ** 2)
    noise_power = np.mean(noise ** 2)
    if noise_power > 0:
        wanted_noise_power = signal_power / (10 ** (snr_db / 10))
        gain = np.sqrt(wanted_noise_power / noise_power)
        noise = noise * gain

    mixed = audio + noise
    peak = np.max(np.abs(mixed))
    # Only attenuate; a mix that already fits in [-1, 1] is returned as is.
    return mixed / peak if peak > 1.0 else mixed
+def _generate_noise(length: int, sample_rate: int) -> np.ndarray:
+    t = np.arange(length) / sample_rate
+    noise = np.random.normal(0, 1, length)
+    low_freq = np.random.uniform(50, 120)
+    noise += 0.3 * np.sin(2 * np.pi * low_freq * t)
+    mid_freq = np.random.uniform(200, 800)
+    noise += 0.2 * np.sin(2 * np.pi * mid_freq * t + np.random.uniform(0, 2*np.pi))
+    b = [0.1, 0.2, 0.4, 0.2, 0.1]
+    noise = lfilter(b, 1, noise)
+    return noise
 class MasteringEnhancer:
     def __init__(self):
         pass
     def __call__(self, audio: np.ndarray, sr: int):
         board = Pedalboard()
         if np.random.rand() < 0.5:
             board.append(LowpassFilter(np.random.uniform(14000, 19000)))
         if np.random.rand() < 0.5:
             board.append(HighpassFilter(np.random.uniform(20, 60)))
         if np.random.rand() < 0.7:
             board.append(Compressor(
                 threshold_db=np.random.uniform(-12, -6),
                 release_ms=np.random.uniform(100, 300)
             ))
         if np.random.rand() < 0.6:
             board.append(Distortion(drive_db=np.random.uniform(0.5, 2.0)))
         board.append(Limiter(threshold_db=np.random.uniform(-3, -0.1)))
         return board(audio, sample_rate=sr)
         self.encodec_model = EncodecModel.encodec_model_48khz()
         self.encodec_model.eval()
         self.encodec_available = True
+        self.encodec_bandwidths = [3.0, 6.0, 12.0, 24.0]
+        self.p_resample = 0.5
+        self.p_mastering = 0.5
+        self.p_mp3 = 0.5
+        self.p_fm = 0.5
+        self.p_live = 0.5
+        self.p_encodec = 0.5
         self.is_cuda_initialized = False
         self.mastering = MasteringEnhancer()
     def apply(self, audio: np.ndarray, sample_rate: int = 44100) -> np.ndarray:
         if np.max(np.abs(audio)) == 0:
         audio = audio / normalize_scale
         board = Pedalboard()
         if np.random.rand() < self.p_resample:
             board.append(Resample(target_sample_rate=np.random.randint(16000, 44100)))
         if np.random.rand() < self.p_mastering:
             audio = self.mastering(audio, sample_rate)
+        if np.random.rand() < self.p_mp3:
+            board.append(MP3Compressor(vbr_quality=np.random.uniform(1.0, 9.0)))
+        if np.random.rand() < self.p_fm:
+            audio = apply_fm_effect(audio, sample_rate)
+        if np.random.rand() < self.p_live:
+            audio = apply_live_dt4_simple(audio, sample_rate)
         if np.random.rand() < self.p_encodec:
             device = 'cpu'
             model = self.encodec_model
             target_bw = np.random.choice(self.encodec_bandwidths)
             model.set_target_bandwidth(target_bw)
             wav_tensor = torch.from_numpy(audio).float().to(device)
             wav_processed = convert_audio(wav_tensor, sample_rate, model.sample_rate, model.channels)
             wav_input = wav_processed.unsqueeze(0)
             with torch.no_grad():
                 reconstructed_tensor = model(wav_input).squeeze(0)
                 audio = reconstructed_tensor.cpu().numpy()
                 sample_rate = model.sample_rate
         if len(board) > 0:
             audio = board(audio, sample_rate=sample_rate)