ModistAndrew committed on
Commit
7673750
·
1 Parent(s): b1462ff

difficult aug and configs

Browse files
configs/bsrestore/vox_hard.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ project_name: "bsrestore"
2
+ exp_name: "vox_hard_large"
3
+
4
+ model:
5
+ name: "BSRoFormer"
6
+ params:
7
+ dim: 256
8
+ depth: 12
9
+ stereo: true
10
+ num_stems: 1
11
+ time_transformer_depth: 1
12
+ freq_transformer_depth: 1
13
+ linear_transformer_depth: 0
14
+ freqs_per_bands: !!python/tuple
15
+ - 2
16
+ - 2
17
+ - 2
18
+ - 2
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 4
40
+ - 4
41
+ - 4
42
+ - 4
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 12
52
+ - 12
53
+ - 12
54
+ - 12
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 24
60
+ - 24
61
+ - 24
62
+ - 24
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 48
68
+ - 48
69
+ - 48
70
+ - 48
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 128
76
+ - 129
77
+ dim_head: 64
78
+ heads: 8
79
+ attn_dropout: 0.1
80
+ ff_dropout: 0.1
81
+ flash_attn: true
82
+ dim_freqs_in: 1025
83
+ stft_n_fft: 2048
84
+ stft_hop_length: 512
85
+ stft_win_length: 2048
86
+ stft_normalized: false
87
+ mask_estimator_depth: 2
88
+ multi_stft_resolution_loss_weight: 1.0
89
+ multi_stft_resolutions_window_sizes: !!python/tuple
90
+ - 4096
91
+ - 2048
92
+ - 1024
93
+ - 512
94
+ - 256
95
+ multi_stft_hop_size: 147
96
+ multi_stft_normalized: False
97
+ mlp_expansion_factor: 4
98
+ use_torch_checkpoint: False # Greatly reduces GPU memory consumption during training (not fully tested)
99
+ skip_connection: False # Enable skip connections between transformer blocks; can mitigate gradient problems and may speed up training
100
+
101
+ data:
102
+ sample_rate: 48000
103
+ clip_duration: 10.0
104
+ train_dataset:
105
+ target_stem: "Voc"
106
+ root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems"
107
+ apply_augmentation: True
108
+ snr_range: [0.0, 10.0]
109
+ train_dataset1:
110
+ target_stem: "vox"
111
+ root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/moisesdb_raw"
112
+ apply_augmentation: True
113
+ snr_range: [0.0, 10.0]
114
+ moisesdb: True
115
+ val_dataset:
116
+ target_stem: "Voc"
117
+ root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems_valid"
118
+ apply_augmentation: True
119
+ snr_range: [0.0, 10.0]
120
+ dataloader_params:
121
+ batch_size: 4
122
+ num_workers: 8
123
+
124
+ optimizer_g:
125
+ lr: 0.0005
126
+ betas: [0.8, 0.99]
127
+
128
+ scheduler:
129
+ warm_up_steps: 10000
130
+
131
+ trainer:
132
+ max_steps: 1000000
133
+ log_every_n_steps: 100
134
+ checkpoint_save_interval: 10000
135
+ limit_train_batches: 2000
136
+ devices: [0]
137
+ precision: 16-mixed
138
+ save_dir: logs/
139
+
140
+ checkpoint:
141
+ path: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/jinxuanzhu/MSRKit/checkpoints/BS-Rofo-SW-Fixed.ckpt"
142
+ type: "roformer"
configs/bsrestore/vox_hard_gan.yaml ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ project_name: "bsrestore"
2
+ exp_name: "vox_hard_large_gan"
3
+
4
+ model:
5
+ name: "BSRoFormer"
6
+ params:
7
+ dim: 256
8
+ depth: 12
9
+ stereo: true
10
+ num_stems: 1
11
+ time_transformer_depth: 1
12
+ freq_transformer_depth: 1
13
+ linear_transformer_depth: 0
14
+ freqs_per_bands: !!python/tuple
15
+ - 2
16
+ - 2
17
+ - 2
18
+ - 2
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 4
40
+ - 4
41
+ - 4
42
+ - 4
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 12
52
+ - 12
53
+ - 12
54
+ - 12
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 24
60
+ - 24
61
+ - 24
62
+ - 24
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 48
68
+ - 48
69
+ - 48
70
+ - 48
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 128
76
+ - 129
77
+ dim_head: 64
78
+ heads: 8
79
+ attn_dropout: 0.1
80
+ ff_dropout: 0.1
81
+ flash_attn: true
82
+ dim_freqs_in: 1025
83
+ stft_n_fft: 2048
84
+ stft_hop_length: 512
85
+ stft_win_length: 2048
86
+ stft_normalized: false
87
+ mask_estimator_depth: 2
88
+ multi_stft_resolution_loss_weight: 1.0
89
+ multi_stft_resolutions_window_sizes: !!python/tuple
90
+ - 4096
91
+ - 2048
92
+ - 1024
93
+ - 512
94
+ - 256
95
+ multi_stft_hop_size: 147
96
+ multi_stft_normalized: False
97
+ mlp_expansion_factor: 4
98
+ use_torch_checkpoint: False # Greatly reduces GPU memory consumption during training (not fully tested)
99
+ skip_connection: False # Enable skip connections between transformer blocks; can mitigate gradient problems and may speed up training
100
+
101
+ discriminators:
102
+ - name: "MultiFrequencyDiscriminator"
103
+ params:
104
+ nch: 1
105
+ window_sizes: [2048, 1024, 512]
106
+ sample_rate: 48000
107
+ norm: True
108
+
109
+ data:
110
+ sample_rate: 48000
111
+ clip_duration: 10.0
112
+ train_dataset:
113
+ target_stem: "Voc"
114
+ root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems"
115
+ apply_augmentation: True
116
+ snr_range: [0.0, 10.0]
117
+ train_dataset1:
118
+ target_stem: "vox"
119
+ root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/moisesdb_raw"
120
+ apply_augmentation: True
121
+ snr_range: [0.0, 10.0]
122
+ moisesdb: True
123
+ val_dataset:
124
+ target_stem: "Voc"
125
+ root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems_valid"
126
+ apply_augmentation: True
127
+ snr_range: [0.0, 10.0]
128
+ dataloader_params:
129
+ batch_size: 4
130
+ num_workers: 8
131
+
132
+ optimizer_g:
133
+ lr: 0.0002
134
+ betas: [0.8, 0.99]
135
+
136
+ optimizer_d:
137
+ lr: 0.0002
138
+ betas: [0.8, 0.99]
139
+
140
+ scheduler:
141
+ warm_up_steps: 10000
142
+
143
+ losses:
144
+ gan_type: 'lsgan'
145
+ lambda_recon: 100.0
146
+ lambda_feat: 2.0
147
+ lambda_gan: 1.0
148
+ reconstruction_loss:
149
+ sample_rate: 48000
150
+ n_fft: [1024, 2048, 512]
151
+ hop_length: [256, 512, 128]
152
+ n_mels: [80, 160, 40]
153
+
154
+ trainer:
155
+ max_steps: 1000000
156
+ log_every_n_steps: 100
157
+ checkpoint_save_interval: 10000
158
+ limit_train_batches: 2000
159
+ devices: [0]
160
+ precision: 16-mixed
161
+ save_dir: logs/
162
+
163
+ checkpoint:
164
+ path: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/jinxuanzhu/MSRKit/checkpoints/BS-Rofo-SW-Fixed.ckpt"
165
+ type: "roformer"
configs/bsrestore/vox_mix.yaml CHANGED
@@ -1,5 +1,5 @@
1
  project_name: "bsrestore"
2
- exp_name: "vox_mix"
3
 
4
  model:
5
  name: "BSRoFormer"
@@ -100,13 +100,20 @@ model:
100
 
101
  data:
102
  sample_rate: 48000
103
- clip_duration: 3.0
104
  train_dataset:
105
  target_stem: "Voc"
106
  root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems"
107
  apply_augmentation: True
108
  snr_range: [0.0, 10.0]
109
  output_mixture: True
 
 
 
 
 
 
 
110
  val_dataset:
111
  target_stem: "Voc"
112
  root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems_valid"
 
1
  project_name: "bsrestore"
2
+ exp_name: "vox_mix_large"
3
 
4
  model:
5
  name: "BSRoFormer"
 
100
 
101
  data:
102
  sample_rate: 48000
103
+ clip_duration: 10.0
104
  train_dataset:
105
  target_stem: "Voc"
106
  root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems"
107
  apply_augmentation: True
108
  snr_range: [0.0, 10.0]
109
  output_mixture: True
110
+ train_dataset1:
111
+ target_stem: "vox"
112
+ root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/moisesdb_raw"
113
+ apply_augmentation: True
114
+ snr_range: [0.0, 10.0]
115
+ output_mixture: True
116
+ moisesdb: True
117
  val_dataset:
118
  target_stem: "Voc"
119
  root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems_valid"
configs/bsrestore/vox_mix_gan.yaml ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ project_name: "bsrestore"
2
+ exp_name: "vox_mix_large_gan"
3
+
4
+ model:
5
+ name: "BSRoFormer"
6
+ params:
7
+ dim: 256
8
+ depth: 12
9
+ stereo: true
10
+ num_stems: 1
11
+ time_transformer_depth: 1
12
+ freq_transformer_depth: 1
13
+ linear_transformer_depth: 0
14
+ freqs_per_bands: !!python/tuple
15
+ - 2
16
+ - 2
17
+ - 2
18
+ - 2
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 4
40
+ - 4
41
+ - 4
42
+ - 4
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 12
52
+ - 12
53
+ - 12
54
+ - 12
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 24
60
+ - 24
61
+ - 24
62
+ - 24
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 48
68
+ - 48
69
+ - 48
70
+ - 48
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 128
76
+ - 129
77
+ dim_head: 64
78
+ heads: 8
79
+ attn_dropout: 0.1
80
+ ff_dropout: 0.1
81
+ flash_attn: true
82
+ dim_freqs_in: 1025
83
+ stft_n_fft: 2048
84
+ stft_hop_length: 512
85
+ stft_win_length: 2048
86
+ stft_normalized: false
87
+ mask_estimator_depth: 2
88
+ multi_stft_resolution_loss_weight: 1.0
89
+ multi_stft_resolutions_window_sizes: !!python/tuple
90
+ - 4096
91
+ - 2048
92
+ - 1024
93
+ - 512
94
+ - 256
95
+ multi_stft_hop_size: 147
96
+ multi_stft_normalized: False
97
+ mlp_expansion_factor: 4
98
+ use_torch_checkpoint: False # Greatly reduces GPU memory consumption during training (not fully tested)
99
+ skip_connection: False # Enable skip connections between transformer blocks; can mitigate gradient problems and may speed up training
100
+
101
+ discriminators:
102
+ - name: "MultiFrequencyDiscriminator"
103
+ params:
104
+ nch: 1
105
+ window_sizes: [2048, 1024, 512]
106
+ sample_rate: 48000
107
+ norm: True
108
+
109
+ data:
110
+ sample_rate: 48000
111
+ clip_duration: 10.0
112
+ train_dataset:
113
+ target_stem: "Voc"
114
+ root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems"
115
+ apply_augmentation: True
116
+ snr_range: [0.0, 10.0]
117
+ output_mixture: True
118
+ train_dataset1:
119
+ target_stem: "vox"
120
+ root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/moisesdb_raw"
121
+ apply_augmentation: True
122
+ snr_range: [0.0, 10.0]
123
+ output_mixture: True
124
+ moisesdb: True
125
+ val_dataset:
126
+ target_stem: "Voc"
127
+ root_directory: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/data/RawStems_valid"
128
+ apply_augmentation: True
129
+ snr_range: [0.0, 10.0]
130
+ output_mixture: True
131
+ dataloader_params:
132
+ batch_size: 4
133
+ num_workers: 8
134
+
135
+ optimizer_g:
136
+ lr: 0.0002
137
+ betas: [0.8, 0.99]
138
+
139
+ optimizer_d:
140
+ lr: 0.0002
141
+ betas: [0.8, 0.99]
142
+
143
+ scheduler:
144
+ warm_up_steps: 10000
145
+
146
+ losses:
147
+ gan_type: 'lsgan'
148
+ lambda_recon: 100.0
149
+ lambda_feat: 2.0
150
+ lambda_gan: 1.0
151
+ reconstruction_loss:
152
+ sample_rate: 48000
153
+ n_fft: [1024, 2048, 512]
154
+ hop_length: [256, 512, 128]
155
+ n_mels: [80, 160, 40]
156
+
157
+ trainer:
158
+ max_steps: 1000000
159
+ log_every_n_steps: 100
160
+ checkpoint_save_interval: 10000
161
+ limit_train_batches: 2000
162
+ devices: [0]
163
+ precision: 16-mixed
164
+ save_dir: logs/
165
+
166
+ checkpoint:
167
+ path: "/inspire/hdd/project/multilingualspeechrecognition/chenxie-25019/jinxuanzhu/MSRKit/checkpoints/BS-Rofo-SW-Fixed.ckpt"
168
+ type: "roformer"
data/augment.py CHANGED
@@ -2,7 +2,7 @@ import numpy as np
2
  from data.eq_utils import apply_random_eq
3
  from pedalboard import Pedalboard, Resample, Compressor, Distortion, Reverb, Limiter, MP3Compressor, HighpassFilter, LowpassFilter
4
  import torch
5
- from scipy.signal import butter, lfilter
6
  try:
7
  import pyroomacoustics as pra
8
  except Exception as e:
@@ -25,78 +25,38 @@ def calculate_rms(audio: np.ndarray) -> float:
25
  return np.sqrt(np.mean(audio**2))
26
 
27
  def apply_fm_effect(audio: np.ndarray, sample_rate: int) -> np.ndarray:
28
- """
29
- 应用 FM 电台模拟效果:低通滤波 (带宽限制) + 噪声叠加。
30
- """
31
-
32
- # 1. 随机带宽限制参数 (Cutoff Freq)
33
- # 模拟接收不良的信号,截止频率在 8kHz 到 14kHz 之间
34
  cutoff_freq = np.random.uniform(8000, 14000)
35
- order = 5 # 滤波器阶数,越高衰减越陡峭
36
-
37
- # 2. 噪声参数
38
- # 噪声幅度,模拟信号弱时的嘶嘶声
39
- noise_level = np.random.uniform(0.0005, 0.005) # 噪声电平,需根据您的数据进行调整
40
-
41
- # --- 低通滤波 (带宽限制) ---
42
  def butter_lowpass(cutoff, fs, order=5):
43
  nyq = 0.5 * fs
44
  normal_cutoff = cutoff / nyq
45
  b, a = butter(order, normal_cutoff, btype='low', analog=False)
46
  return b, a
47
-
48
  b, a = butter_lowpass(cutoff_freq, sample_rate, order=order)
49
-
50
- # 注意:lfilter 默认只处理一维数组。如果 audio 是多通道 (C, L),需要逐通道处理。
51
- if audio.ndim == 2:
52
- # (C, L) 格式
53
- filtered_audio = np.array([lfilter(b, a, channel) for channel in audio])
54
- else:
55
- # (L,) 格式
56
- filtered_audio = lfilter(b, a, audio)
57
-
58
- # --- 噪声叠加 ---
59
-
60
- # 生成白噪音,并乘以噪声电平
61
  noise = np.random.normal(0, 1, filtered_audio.shape) * noise_level
62
-
63
- # 叠加
64
  fm_audio = filtered_audio + noise
65
-
66
- # 确保幅度不会溢出,但由于噪声幅度小,通常不会成为问题
67
- np.clip(fm_audio, -1.0, 1.0, out=fm_audio)
68
-
69
  return fm_audio
70
 
71
  def apply_random_room_reverb(audio, sr):
72
- # audio 为 (C, L),若是 (L,) 则 reshape
73
- if audio.ndim == 1:
74
- audio = audio[None, :] # -> (1, L)
75
-
76
  C, L = audio.shape
77
-
78
- # 随机房间大小 (更大 → 更多混响尾巴)
79
  room_dim = np.random.uniform(3, 9, size=3)
80
-
81
- # 随机选择麦克风&声源位置
82
  room = pra.ShoeBox(room_dim, fs=sr, max_order=np.random.randint(4, 7), absorption=np.random.uniform(0.2, 0.7))
83
-
84
  mic_loc = np.array([
85
  np.random.uniform(0.5, room_dim[0]-0.5),
86
  np.random.uniform(0.5, room_dim[1]-0.5),
87
- np.random.uniform(1.0, 2.0), # 麦克风高度 ~ 人耳高度
88
  ])
89
-
90
  source_loc = np.array([
91
  np.random.uniform(0.5, room_dim[0]-0.5),
92
  np.random.uniform(0.5, room_dim[1]-0.5),
93
- np.random.uniform(1.0, 2.0), # 声源高度不必和人同高,但保持现实
94
  ])
95
  room.add_microphone(mic_loc)
96
- room.add_source(source_loc, signal=audio.mean(axis=0)) # 用 mean 保持左右一致的空间信息
97
-
98
  room.compute_rir()
99
-
100
  WET_LEVEL = np.random.uniform(0.1, 0.6)
101
  DRY_LEVEL = np.random.uniform(0.5, 1.0)
102
  wet_audio = np.vstack([
@@ -104,14 +64,69 @@ def apply_random_room_reverb(audio, sr):
104
  for ch in range(C)
105
  ])
106
  wet_norm = np.max(np.abs(wet_audio)) + 1e-8
107
-
108
- # 最终输出 = 干声 * Dry 比例 + 归一化湿声 * Wet 比例
109
  out = (audio * DRY_LEVEL) + (wet_audio * (WET_LEVEL / wet_norm))
110
  max_out = np.max(np.abs(out)) + 1e-8
111
  out_normalized = out / max_out
112
-
113
  return out_normalized
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  class MasteringEnhancer:
116
  def __init__(self):
117
  pass
@@ -119,15 +134,12 @@ class MasteringEnhancer:
119
  def __call__(self, audio: np.ndarray, sr: int):
120
  board = Pedalboard()
121
 
122
- # 1) 高频空气感(温和提升)
123
  if np.random.rand() < 0.5:
124
  board.append(LowpassFilter(np.random.uniform(14000, 19000)))
125
 
126
- # 2) 低频收紧(避免boom)
127
  if np.random.rand() < 0.5:
128
  board.append(HighpassFilter(np.random.uniform(20, 60)))
129
 
130
- # 3) 轻柔总线压缩(Glue)
131
  if np.random.rand() < 0.7:
132
  board.append(Compressor(
133
  threshold_db=np.random.uniform(-12, -6),
@@ -136,12 +148,9 @@ class MasteringEnhancer:
136
  release_ms=np.random.uniform(100, 300)
137
  ))
138
 
139
- # 4) Tape 饱和感(质感 & 谐波)
140
  if np.random.rand() < 0.6:
141
- # 使用一个很小的 drive_db (例如 0.5 到 2.0 dB) 来模拟轻微的饱和
142
  board.append(Distortion(drive_db=np.random.uniform(0.5, 2.0)))
143
 
144
- # 5) 最后一层安全限制(保护不削顶)
145
  board.append(Limiter(threshold_db=np.random.uniform(-3, -0.1)))
146
 
147
  return board(audio, sample_rate=sr)
@@ -207,16 +216,16 @@ class MixtureAugmentation:
207
  self.encodec_model = EncodecModel.encodec_model_48khz()
208
  self.encodec_model.eval()
209
  self.encodec_available = True
210
- self.encodec_bandwidths = [6.0, 12.0, 24.0]
211
- self.p_encodec = 0.2
212
- self.p_mp3 = 0.3
213
- self.p_fm = 0.2
214
- self.p_room = 0.3
215
- self.p_limiter = 0.4
216
- self.p_resample = 0.3
217
  self.is_cuda_initialized = False
218
  self.mastering = MasteringEnhancer()
219
- self.p_mastering = 0.3
220
 
221
  def apply(self, audio: np.ndarray, sample_rate: int = 44100) -> np.ndarray:
222
  if np.max(np.abs(audio)) == 0:
@@ -231,49 +240,34 @@ class MixtureAugmentation:
231
  audio = audio / normalize_scale
232
 
233
  board = Pedalboard()
234
-
235
- if np.random.rand() < self.p_limiter:
236
- board.append(Limiter(
237
- threshold_db=np.random.uniform(-10, 0),
238
- release_ms=np.random.uniform(50, 200)
239
- ))
240
 
241
  if np.random.rand() < self.p_resample:
242
  board.append(Resample(target_sample_rate=np.random.randint(16000, 44100)))
243
 
244
  if np.random.rand() < self.p_mastering:
245
  audio = self.mastering(audio, sample_rate)
 
 
 
 
 
 
 
 
 
246
 
247
- # Encodec Part
248
  if np.random.rand() < self.p_encodec:
249
  device = 'cpu'
250
- # device = 'cuda' if torch.cuda.is_available() else 'cpu'
251
- if device == 'cuda' and not self.is_cuda_initialized:
252
- self.encodec_model = self.encodec_model.to(device)
253
- self.is_cuda_initialized = True
254
  model = self.encodec_model
255
- # print(" DEBUG:Using Encodec augmentation")
256
  target_bw = np.random.choice(self.encodec_bandwidths)
257
  model.set_target_bandwidth(target_bw)
258
  wav_tensor = torch.from_numpy(audio).float().to(device)
259
  wav_processed = convert_audio(wav_tensor, sample_rate, model.sample_rate, model.channels)
260
  wav_input = wav_processed.unsqueeze(0)
261
  with torch.no_grad():
262
- # 编码 -> 解码 (引入神经失真)
263
  reconstructed_tensor = model(wav_input).squeeze(0)
264
- # 将结果转回 numpy
265
  audio = reconstructed_tensor.cpu().numpy()
266
- # 重要:更新 sample_rate 以便后续的 Pedalboard 步骤使用 Encodec 的采样率
267
  sample_rate = model.sample_rate
268
- # MP3 Part
269
- elif np.random.rand() < self.p_mp3:
270
- board.append(MP3Compressor(vbr_quality=np.random.uniform(1.0, 9.0)))
271
- # FM part
272
- elif np.random.rand() < self.p_fm:
273
- audio = apply_fm_effect(audio, sample_rate)
274
- # Room part
275
- elif np.random.rand() < self.p_room:
276
- audio = apply_random_room_reverb(audio, sample_rate)
277
 
278
  if len(board) > 0:
279
  audio = board(audio, sample_rate=sample_rate)
 
2
  from data.eq_utils import apply_random_eq
3
  from pedalboard import Pedalboard, Resample, Compressor, Distortion, Reverb, Limiter, MP3Compressor, HighpassFilter, LowpassFilter
4
  import torch
5
+ from scipy.signal import butter, lfilter, sosfilt
6
  try:
7
  import pyroomacoustics as pra
8
  except Exception as e:
 
25
  return np.sqrt(np.mean(audio**2))
26
 
27
  def apply_fm_effect(audio: np.ndarray, sample_rate: int) -> np.ndarray:
 
 
 
 
 
 
28
  cutoff_freq = np.random.uniform(8000, 14000)
29
+ order = 5
30
+ noise_level = np.random.uniform(0.0005, 0.005)
 
 
 
 
 
31
  def butter_lowpass(cutoff, fs, order=5):
32
  nyq = 0.5 * fs
33
  normal_cutoff = cutoff / nyq
34
  b, a = butter(order, normal_cutoff, btype='low', analog=False)
35
  return b, a
 
36
  b, a = butter_lowpass(cutoff_freq, sample_rate, order=order)
37
+ filtered_audio = np.array([lfilter(b, a, channel) for channel in audio])
 
 
 
 
 
 
 
 
 
 
 
38
  noise = np.random.normal(0, 1, filtered_audio.shape) * noise_level
 
 
39
  fm_audio = filtered_audio + noise
40
+ np.clip(fm_audio, -1.0, 1.0, out=fm_audio)
 
 
 
41
  return fm_audio
42
 
43
  def apply_random_room_reverb(audio, sr):
 
 
 
 
44
  C, L = audio.shape
 
 
45
  room_dim = np.random.uniform(3, 9, size=3)
 
 
46
  room = pra.ShoeBox(room_dim, fs=sr, max_order=np.random.randint(4, 7), absorption=np.random.uniform(0.2, 0.7))
 
47
  mic_loc = np.array([
48
  np.random.uniform(0.5, room_dim[0]-0.5),
49
  np.random.uniform(0.5, room_dim[1]-0.5),
50
+ np.random.uniform(1.0, 2.0),
51
  ])
 
52
  source_loc = np.array([
53
  np.random.uniform(0.5, room_dim[0]-0.5),
54
  np.random.uniform(0.5, room_dim[1]-0.5),
55
+ np.random.uniform(1.0, 2.0),
56
  ])
57
  room.add_microphone(mic_loc)
58
+ room.add_source(source_loc, signal=audio.mean(axis=0))
 
59
  room.compute_rir()
 
60
  WET_LEVEL = np.random.uniform(0.1, 0.6)
61
  DRY_LEVEL = np.random.uniform(0.5, 1.0)
62
  wet_audio = np.vstack([
 
64
  for ch in range(C)
65
  ])
66
  wet_norm = np.max(np.abs(wet_audio)) + 1e-8
 
 
67
  out = (audio * DRY_LEVEL) + (wet_audio * (WET_LEVEL / wet_norm))
68
  max_out = np.max(np.abs(out)) + 1e-8
69
  out_normalized = out / max_out
 
70
  return out_normalized
71
 
72
def apply_live_dt4_simple(audio: np.ndarray, sample_rate: int, snr_db: float = 20.0) -> np.ndarray:
    """Run the three-stage "live" degradation chain on ``audio``.

    The stages run in order: ``apply_random_room_reverb``, then
    ``_apply_phone_filter``, then ``_add_environmental_noise`` at ``snr_db``.

    Args:
        audio: Input samples; the reverb stage unpacks ``audio.shape`` into
            two values, so a 2-D array is required.
        sample_rate: Sampling rate in Hz, forwarded to every stage.
        snr_db: Signal-to-noise ratio (dB) used by the noise stage.

    Returns:
        The degraded audio returned by the final stage.
    """
    reverberated = apply_random_room_reverb(audio, sample_rate)
    band_limited = _apply_phone_filter(reverberated, sample_rate)
    return _add_environmental_noise(band_limited, sample_rate, snr_db)
77
+
78
+ def _apply_phone_filter(audio: np.ndarray, sample_rate: int) -> np.ndarray:
79
+ lowcut = 300.0
80
+ highcut = 3400.0
81
+
82
+ nyq = 0.5 * sample_rate
83
+ low = lowcut / nyq
84
+ high = highcut / nyq
85
+ sos = butter(4, [low, high], btype='band', output='sos')
86
+
87
+ filtered = np.array([sosfilt(sos, channel) for channel in audio])
88
+ return filtered
89
+
90
def _add_environmental_noise(audio: np.ndarray, sample_rate: int, snr_db: float) -> np.ndarray:
    """Mix synthetic background noise into ``audio`` at the requested SNR.

    One noise track from ``_generate_noise`` is shared by all channels. It is
    scaled so that mean signal power over mean noise power equals ``snr_db``.
    The mix is divided by its peak only when that peak exceeds 1.0.

    Args:
        audio: 2-D array unpacked as ``(channels, samples)``.
        sample_rate: Sampling rate in Hz, forwarded to ``_generate_noise``.
        snr_db: Target signal-to-noise ratio in dB.

    Returns:
        The noisy mixture.
    """
    num_channels, num_samples = audio.shape

    ambience = _generate_noise(num_samples, sample_rate)
    if num_channels > 1:
        # Every channel receives the same noise track.
        ambience = np.tile(ambience, (num_channels, 1))

    speech_power = np.mean(audio ** 2)
    ambience_power = np.mean(ambience ** 2)

    if ambience_power > 0:
        wanted_power = speech_power / (10 ** (snr_db / 10))
        gain = np.sqrt(wanted_power / ambience_power)
        ambience = ambience * gain

    noisy = audio + ambience

    peak = np.max(np.abs(noisy))
    return noisy / peak if peak > 1.0 else noisy
113
+
114
+ def _generate_noise(length: int, sample_rate: int) -> np.ndarray:
115
+ t = np.arange(length) / sample_rate
116
+
117
+ noise = np.random.normal(0, 1, length)
118
+
119
+ low_freq = np.random.uniform(50, 120)
120
+ noise += 0.3 * np.sin(2 * np.pi * low_freq * t)
121
+
122
+ mid_freq = np.random.uniform(200, 800)
123
+ noise += 0.2 * np.sin(2 * np.pi * mid_freq * t + np.random.uniform(0, 2*np.pi))
124
+
125
+ b = [0.1, 0.2, 0.4, 0.2, 0.1]
126
+ noise = lfilter(b, 1, noise)
127
+
128
+ return noise
129
+
130
  class MasteringEnhancer:
131
  def __init__(self):
132
  pass
 
134
  def __call__(self, audio: np.ndarray, sr: int):
135
  board = Pedalboard()
136
 
 
137
  if np.random.rand() < 0.5:
138
  board.append(LowpassFilter(np.random.uniform(14000, 19000)))
139
 
 
140
  if np.random.rand() < 0.5:
141
  board.append(HighpassFilter(np.random.uniform(20, 60)))
142
 
 
143
  if np.random.rand() < 0.7:
144
  board.append(Compressor(
145
  threshold_db=np.random.uniform(-12, -6),
 
148
  release_ms=np.random.uniform(100, 300)
149
  ))
150
 
 
151
  if np.random.rand() < 0.6:
 
152
  board.append(Distortion(drive_db=np.random.uniform(0.5, 2.0)))
153
 
 
154
  board.append(Limiter(threshold_db=np.random.uniform(-3, -0.1)))
155
 
156
  return board(audio, sample_rate=sr)
 
216
  self.encodec_model = EncodecModel.encodec_model_48khz()
217
  self.encodec_model.eval()
218
  self.encodec_available = True
219
+ self.encodec_bandwidths = [3.0, 6.0, 12.0, 24.0]
220
+ self.p_resample = 0.5
221
+ self.p_mastering = 0.5
222
+ self.p_mp3 = 0.5
223
+ self.p_fm = 0.5
224
+ self.p_live = 0.5
225
+ self.p_encodec = 0.5
226
  self.is_cuda_initialized = False
227
  self.mastering = MasteringEnhancer()
228
+
229
 
230
  def apply(self, audio: np.ndarray, sample_rate: int = 44100) -> np.ndarray:
231
  if np.max(np.abs(audio)) == 0:
 
240
  audio = audio / normalize_scale
241
 
242
  board = Pedalboard()
 
 
 
 
 
 
243
 
244
  if np.random.rand() < self.p_resample:
245
  board.append(Resample(target_sample_rate=np.random.randint(16000, 44100)))
246
 
247
  if np.random.rand() < self.p_mastering:
248
  audio = self.mastering(audio, sample_rate)
249
+
250
+ if np.random.rand() < self.p_mp3:
251
+ board.append(MP3Compressor(vbr_quality=np.random.uniform(1.0, 9.0)))
252
+
253
+ if np.random.rand() < self.p_fm:
254
+ audio = apply_fm_effect(audio, sample_rate)
255
+
256
+ if np.random.rand() < self.p_live:
257
+ audio = apply_live_dt4_simple(audio, sample_rate)
258
 
 
259
  if np.random.rand() < self.p_encodec:
260
  device = 'cpu'
 
 
 
 
261
  model = self.encodec_model
 
262
  target_bw = np.random.choice(self.encodec_bandwidths)
263
  model.set_target_bandwidth(target_bw)
264
  wav_tensor = torch.from_numpy(audio).float().to(device)
265
  wav_processed = convert_audio(wav_tensor, sample_rate, model.sample_rate, model.channels)
266
  wav_input = wav_processed.unsqueeze(0)
267
  with torch.no_grad():
 
268
  reconstructed_tensor = model(wav_input).squeeze(0)
 
269
  audio = reconstructed_tensor.cpu().numpy()
 
270
  sample_rate = model.sample_rate
 
 
 
 
 
 
 
 
 
271
 
272
  if len(board) > 0:
273
  audio = board(audio, sample_rate=sample_rate)