Pj12 committed on
Commit
1eadc4f
·
verified ·
1 Parent(s): 4781718

Upload preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +279 -0
preprocess.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import time
4
+ from scipy import signal
5
+ from scipy.io import wavfile
6
+ import numpy as np
7
+ import concurrent.futures
8
+ from tqdm import tqdm
9
+ import json
10
+ from distutils.util import strtobool
11
+ import librosa
12
+ import multiprocessing
13
+ import noisereduce as nr
14
+
15
# Put the current working directory on the import path; the project-local
# ``rvc`` imports that follow are presumably resolved relative to it.
now_directory = os.getcwd()
sys.path.append(now_directory)
17
+
18
+ from rvc.lib.utils import load_audio
19
+ from rvc.train.preprocess.slicer import Slicer
20
+
21
# Remove colab logs
import logging

# Raise the listed numba loggers to WARNING so their lower-level records
# are no longer emitted.
for _numba_logger_name in (
    "numba.core.byteflow",
    "numba.core.ssa",
    "numba.core.interpreter",
):
    logging.getLogger(_numba_logger_name).setLevel(logging.WARNING)
del _numba_logger_name  # do not leave the loop variable behind as a module global
27
+
28
# Constants
OVERLAP = 0.3  # overlap between consecutive chunks; multiplied by the sample rate, so in seconds
MAX_AMPLITUDE = 0.9  # peak level targeted by the peak-normalized component
ALPHA = 0.75  # blend weight of the peak-normalized signal; 1 - ALPHA keeps the raw signal
HIGH_PASS_CUTOFF = 48  # cutoff handed to signal.butter, in the units of ``fs``
SAMPLE_RATE_16K = 16000  # sample rate of the resampled copy of every clip


class PreProcess:
    """Filter, normalize, slice and export audio files for one experiment.

    Every exported clip is written twice: at the working sample rate under
    ``<exp_dir>/sliced_audios`` and resampled to ``SAMPLE_RATE_16K`` under
    ``<exp_dir>/sliced_audios_16k``.
    """

    def __init__(self, sr: int, exp_dir: str, per: float):
        """Build the slicer and high-pass filter and create the output folders.

        Args:
            sr: Working sample rate used for loading, filtering and export.
            exp_dir: Experiment directory; both output folders are created
                inside it when missing.
            per: Length, in seconds, of the fixed-size chunks cut from each
                slice. Only used when ``process_audio`` is asked to cut.
        """
        self.slicer = Slicer(
            sr=sr,
            threshold=-42,
            min_length=1500,
            min_interval=400,
            hop_size=15,
            max_sil_kept=500,
        )
        self.sr = sr
        self.b_high, self.a_high = signal.butter(
            N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr
        )
        self.per = per
        self.exp_dir = exp_dir
        self.device = "cpu"
        self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios")
        self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k")
        os.makedirs(self.gt_wavs_dir, exist_ok=True)
        os.makedirs(self.wavs16k_dir, exist_ok=True)

    def _normalize_audio(self, audio: np.ndarray):
        """Blend a peak-normalized copy of ``audio`` with the raw signal.

        Returns:
            ``None`` when the absolute peak exceeds 2.5 (the clip is
            rejected); otherwise the blended array. A clip whose peak is
            exactly zero is returned as ``(1 - ALPHA) * audio`` (still all
            zeros) rather than being divided by zero.
        """
        tmp_max = np.abs(audio).max()
        if tmp_max > 2.5:
            return None
        if tmp_max == 0:
            # Silent clip: dividing by the peak would turn every sample into NaN.
            return (1 - ALPHA) * audio
        return (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio

    def process_audio_segment(
        self,
        normalized_audio: np.ndarray,
        sid: int,
        idx0: int,
        idx1: int,
    ):
        """Write one clip at the working rate and again at ``SAMPLE_RATE_16K``.

        Args:
            normalized_audio: Samples to export, or ``None`` for a clip that
                was rejected upstream; in that case only a "filtered" line
                is printed and nothing is written.
            sid: Speaker id, first component of the output file name.
            idx0: Index of the source file, second component of the name.
            idx1: Index of the clip within the source file.
        """
        if normalized_audio is None:
            print(f"{sid}-{idx0}-{idx1}-filtered")
            return
        wavfile.write(
            os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{idx1}.wav"),
            self.sr,
            normalized_audio.astype(np.float32),
        )
        audio_16k = librosa.resample(
            normalized_audio, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K
        )
        wavfile.write(
            os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{idx1}.wav"),
            SAMPLE_RATE_16K,
            audio_16k.astype(np.float32),
        )

    def _iter_chunks(self, audio_segment):
        """Yield overlapping ``per``-second chunks of one slice, then its tail.

        Consecutive chunks start ``per - OVERLAP`` seconds apart. Once the
        remainder is no longer than ``per + OVERLAP`` seconds it is yielded
        whole and iteration stops.

        Raises:
            ValueError: If ``per`` is not greater than ``OVERLAP``; the start
                offset would then never advance and the loop would not end.
        """
        if self.per <= OVERLAP:
            raise ValueError(
                f"Chunk length must be greater than {OVERLAP} seconds, got {self.per}."
            )
        stride = self.sr * (self.per - OVERLAP)
        tail_limit = (self.per + OVERLAP) * self.sr
        chunk_len = int(self.per * self.sr)
        i = 0
        while True:
            start = int(stride * i)
            i += 1
            if len(audio_segment[start:]) > tail_limit:
                yield audio_segment[start : start + chunk_len]
            else:
                yield audio_segment[start:]
                return

    def process_audio(
        self,
        path: str,
        idx0: int,
        sid: int,
        cut_preprocess: bool,
        process_effects: bool,
        noise_reduction: bool,
        reduction_strength: float,
    ):
        """Load one file, optionally clean it up, and export its clip(s).

        Args:
            path: Audio file to load.
            idx0: Index of this file, used in output file names.
            sid: Speaker id, used in output file names.
            cut_preprocess: When true, slice the audio and export fixed-size
                overlapping chunks; otherwise export the whole file as one
                clip.
            process_effects: When true, apply the high-pass filter and the
                amplitude normalization.
            noise_reduction: When true, run ``noisereduce`` on the audio.
            reduction_strength: ``prop_decrease`` passed to ``noisereduce``.

        Returns:
            Duration of the loaded audio in seconds, or ``0`` when loading
            failed. Any exception is printed and swallowed so that one bad
            file does not abort the whole run.
        """
        audio_length = 0
        try:
            audio = load_audio(path, self.sr)
            audio_length = librosa.get_duration(y=audio, sr=self.sr)
            if process_effects:
                audio = signal.lfilter(self.b_high, self.a_high, audio)
                audio = self._normalize_audio(audio)
                if audio is None:
                    # Rejected by normalization: report it as filtered here
                    # instead of passing None on to noise reduction/slicing.
                    self.process_audio_segment(None, sid, idx0, 0)
                    return audio_length
            if noise_reduction:
                audio = nr.reduce_noise(
                    y=audio, sr=self.sr, prop_decrease=reduction_strength
                )
            idx1 = 0
            if cut_preprocess:
                for audio_segment in self.slicer.slice(audio):
                    for chunk in self._iter_chunks(audio_segment):
                        self.process_audio_segment(chunk, sid, idx0, idx1)
                        idx1 += 1
            else:
                self.process_audio_segment(audio, sid, idx0, idx1)
        except Exception as error:
            print(f"Error processing audio: {error}")
        return audio_length
147
+
148
+
149
def format_duration(seconds):
    """Render a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are truncated; the hour field grows beyond two digits
    when needed.
    """
    whole_hours, within_hour = divmod(seconds, 3600)
    whole_minutes = within_hour // 60
    leftover_seconds = seconds % 60
    return (
        f"{int(whole_hours):02}:{int(whole_minutes):02}:{int(leftover_seconds):02}"
    )
154
+
155
+
156
def save_dataset_duration(file_path, dataset_duration):
    """Record the total dataset duration in a JSON file, keeping other keys.

    The file is read if it exists, the ``total_dataset_duration`` (formatted
    ``HH:MM:SS``) and ``total_seconds`` keys are set or overwritten, and the
    result is written back with 4-space indentation.

    Args:
        file_path: Path of the JSON file to update or create.
        dataset_duration: Total duration in seconds.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = {}
    except json.JSONDecodeError as error:
        # An empty or corrupt file must not abort the run after all audio
        # has already been processed; start over from an empty document.
        print(f"Could not parse {file_path} ({error}); recreating it.")
        data = {}

    if not isinstance(data, dict):
        # Only a JSON object can take the keys below.
        print(f"{file_path} does not contain a JSON object; recreating it.")
        data = {}

    formatted_duration = format_duration(dataset_duration)
    new_data = {
        "total_dataset_duration": formatted_duration,
        "total_seconds": dataset_duration,
    }
    data.update(new_data)

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
172
+
173
+
174
def process_audio_wrapper(args):
    """Unpack one task tuple and run ``process_audio`` on it.

    ``args`` is a 6-tuple ``(pp, file, cut_preprocess, process_effects,
    noise_reduction, reduction_strength)`` where ``file`` is itself a
    ``(file_path, idx0, sid)`` triple. Returns whatever
    ``pp.process_audio`` returns.
    """
    (
        preprocessor,
        file_entry,
        do_cut,
        do_effects,
        do_denoise,
        denoise_strength,
    ) = args
    audio_path, file_index, speaker_id = file_entry
    return preprocessor.process_audio(
        audio_path,
        file_index,
        speaker_id,
        do_cut,
        do_effects,
        do_denoise,
        denoise_strength,
    )
188
+
189
+
190
def preprocess_training_set(
    input_root: str,
    sr: int,
    num_processes: int,
    exp_dir: str,
    per: float,
    cut_preprocess: bool,
    process_effects: bool,
    noise_reduction: bool,
    reduction_strength: float,
):
    """Preprocess every audio file under ``input_root`` in a process pool.

    Files directly inside ``input_root`` get speaker id 0; files in a
    sub-folder use the folder name parsed as an integer, and folders whose
    name is not an integer are reported and skipped. Only ``.wav``, ``.mp3``,
    ``.flac`` and ``.ogg`` files (case-insensitive) are picked up. After all
    files are processed, the summed duration is stored in
    ``<exp_dir>/model_info.json`` and a summary line is printed.
    """
    started_at = time.time()
    preprocessor = PreProcess(sr, exp_dir, per)
    print(f"Starting preprocess with {num_processes} processes...")

    audio_suffixes = (".wav", ".mp3", ".flac", ".ogg")
    jobs = []
    for folder, _, names in os.walk(input_root):
        if folder == input_root:
            speaker_id = 0
        else:
            folder_name = os.path.basename(folder)
            try:
                speaker_id = int(folder_name)
            except ValueError:
                print(
                    f'Speaker ID folder is expected to be integer, got "{folder_name}" instead.'
                )
                continue
        for name in names:
            if name.lower().endswith(audio_suffixes):
                # The running file index equals the number of jobs queued so far.
                jobs.append((os.path.join(folder, name), len(jobs), speaker_id))

    # print(f"Number of files: {len(files)}")
    options = (cut_preprocess, process_effects, noise_reduction, reduction_strength)
    durations = []
    with tqdm(total=len(jobs)) as progress, concurrent.futures.ProcessPoolExecutor(
        max_workers=num_processes
    ) as pool:
        pending = [
            pool.submit(process_audio_wrapper, (preprocessor, job, *options))
            for job in jobs
        ]
        for finished in concurrent.futures.as_completed(pending):
            durations.append(finished.result())
            progress.update(1)

    total_duration = sum(durations)
    save_dataset_duration(
        os.path.join(exp_dir, "model_info.json"), dataset_duration=total_duration
    )
    elapsed = time.time() - started_at
    print(
        f"Preprocess completed in {elapsed:.2f} seconds on {format_duration(total_duration)} seconds of audio."
    )
252
+
253
+
254
if __name__ == "__main__":
    # Positional arguments, in order: experiment directory, input root,
    # sample rate, chunk length, number of processes (or "none" to use the
    # CPU count), then the cut / effects / noise-reduction flags and the
    # noise-reduction strength.
    if len(sys.argv) < 10:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit(
            f"Usage: {os.path.basename(sys.argv[0])} <experiment_directory> <input_root> "
            "<sample_rate> <per> <num_processes|none> <cut_preprocess> "
            "<process_effects> <noise_reduction> <reduction_strength>"
        )

    experiment_directory = str(sys.argv[1])
    input_root = str(sys.argv[2])
    sample_rate = int(sys.argv[3])
    percentage = float(sys.argv[4])
    num_processes = sys.argv[5]
    if num_processes.lower() == "none":
        num_processes = multiprocessing.cpu_count()
    else:
        num_processes = int(num_processes)
    # strtobool returns 1/0; convert to real booleans to match the ``bool``
    # annotations of the functions these flags are passed to.
    cut_preprocess = bool(strtobool(sys.argv[6]))
    process_effects = bool(strtobool(sys.argv[7]))
    noise_reduction = bool(strtobool(sys.argv[8]))
    reduction_strength = float(sys.argv[9])

    preprocess_training_set(
        input_root,
        sample_rate,
        num_processes,
        experiment_directory,
        percentage,
        cut_preprocess,
        process_effects,
        noise_reduction,
        reduction_strength,
    )