noblebarkrr commited on
Commit
7108f39
·
verified ·
1 Parent(s): 9aa1213

Upload 2 files

Browse files
Files changed (2) hide show
  1. ensembless.py +761 -0
  2. medley_vox.py +152 -0
ensembless.py ADDED
@@ -0,0 +1,761 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import tempfile
4
+ import os
5
+ from separator.ensemble import ensemble_audio_files
6
+ from model_list import models_data
7
+ from pydub.utils import mediainfo
8
+ from pydub import AudioSegment
9
+ import numpy as np
10
+ import librosa
11
+ import librosa.display
12
+ import soundfile as sf
13
+ from separator.audio_writer import write_audio_file
14
+ from multi_inference import single_multi_inference
15
+ from pydub.exceptions import CouldntDecodeError
16
+
17
# UI string table: language code -> {translation key -> display string}.
# Some values contain str.format placeholders (e.g. "{rate}", "{error}",
# "{model_fullname}") that are filled in by t(key, **kwargs).
TRANSLATIONS = {
    "ru": {
        "app_title": "EnsembLess",
        "auto_ensemble": "Авто-ансамбль",
        "manual_ensemble": "Ручной ансамбль",
        "inverter": "Инвертер",
        "model_selection": "Выберите модель для добавления в ансамбль",
        "model_type": "Тип модели",
        "model_name": "Имя модели",
        "stem_selection": "Стем, который будет использован в ансамбле",
        "weight": "Весы",
        "add_button": "➕ Добавить",
        "current_ensemble": "Текущий ансамбль",
        "remove_index": "Индекс модели, который хотите удалить (начинается с 1)",
        "remove_button": "❌ Удалить",
        "clear_button": "Очистить",
        "input_audio": "Входное аудио",
        "settings": "Настройки",
        "method": "Метод",
        "output_format": "Формат вывода",
        "run_button": "Создать ансамбль",
        "results": "Результаты",
        "inverted_result": "Инвертированный результат",
        "invert_method": "Метод инвертирования",
        "invert_button": "Инвертировать",
        "audio_files": "Аудио файлы",
        "weights_input": "Весы",
        "main_audio": "Основное аудио",
        "audio_to_remove": "Аудио для удаления",
        "processing_method": "Метод обработки",
        "analyze_title": "РЕЗУЛЬТАТЫ АНАЛИЗА:",
        "all_same_rate": "✅ ВСЕ ФАЙЛЫ имеют одинаковую частоту дискретизации: {rate} Hz",
        "different_rates": "⚠️ Файлы имеют РАЗНУЮ частоту дискретизации",
        "resample_warning": "К загруженному аудио автоматически применён ресэмплинг для лучшего инвертирования",
        "error_no_files": "Ошибка: файлы не загружены",
        "error_unsupported_format": "не поддерживаемый формат",
        "error_general": "ошибка ({error})",
        "error_no_models": "Добавьте хотя бы одну модель для создания ансамбля",
        "error_no_audio": "Сначала загрузите аудио",
        "error_both_audio": "Пожалуйста, загрузите оба аудиофайла",
        "language": "Язык",
        "batch_processing": "Пакетная обработка",
        "batch_info": "Позволяет загрузить сразу несколько файлов",
        "separation_info": "Информация о разделении",
        "vocal_separation": "Разделение вокалы",
        "stereo_mode": "Стерео режим",
        "stem": "Стем",
        "vocal_multi_separation": "Мульти-вокал",
        "ensemble": "Ансамбль",
        "transform": "Преобразование",
        "algorithm": "Алгоритм: {model_fullname}",
        "output_format_info": "Формат выходных данных: {output_format}",
        "process1": "Начало обработки",
        "process2": "Модель",
        "process3": "Автоматическое выравнивание длин аудио",
        "process4": "Создание ансамбля",
        "result_source": "Промежуточные файлы"
    },
    "en": {
        "app_title": "EnsembLess",
        "auto_ensemble": "Auto-Ensemble",
        "manual_ensemble": "Manual Ensemble",
        "inverter": "Inverter",
        "model_selection": "Select a model to add to the ensemble",
        "model_type": "Model Type",
        "model_name": "Model Name",
        "stem_selection": "Stem to use in the ensemble",
        "weight": "Weights",
        "add_button": "➕ Add",
        "current_ensemble": "Current Ensemble",
        "remove_index": "Index of model to remove (starts from 1)",
        "remove_button": "❌ Remove",
        "clear_button": "Clear",
        "input_audio": "Input Audio",
        "settings": "Settings",
        "method": "Method",
        "output_format": "Output Format",
        "run_button": "Create Ensemble",
        "results": "Results",
        "inverted_result": "Inverted Result",
        "invert_method": "Inversion Method",
        "invert_button": "Invert",
        "audio_files": "Audio Files",
        "weights_input": "Weights",
        "main_audio": "Main Audio",
        "audio_to_remove": "Audio to Remove",
        "processing_method": "Processing Method",
        "analyze_title": "ANALYSIS RESULTS:",
        "all_same_rate": "✅ ALL FILES have the same sample rate: {rate} Hz",
        "different_rates": "⚠️ Files have DIFFERENT sample rates",
        "resample_warning": "Resampling applied automatically for better inversion",
        "error_no_files": "Error: no files uploaded",
        "error_unsupported_format": "unsupported format",
        "error_general": "error ({error})",
        "error_no_models": "Add at least one model to create an ensemble",
        "error_no_audio": "Please upload audio first",
        "error_both_audio": "Please upload both audio files",
        "language": "Language",
        "batch_processing": "Batch Processing",
        "batch_info": "Allows uploading multiple files at once",
        "separation_info": "Separation Info",
        "vocal_separation": "Vocal Separation",
        "stereo_mode": "Stereo Mode",
        "stem": "Stem",
        "vocal_multi_separation": "Multi-Vocal",
        "ensemble": "Ensemble",
        "transform": "Transform",
        "algorithm": "Algorithm: {model_fullname}",
        "output_format_info": "Output format: {output_format}",
        "process1": "Start process",
        "process2": "Model",
        "process3": "Auto post-padding audios",
        "process4": "Build ensemble",
        "result_source": "Intermediate files"
    }
}
133
+
134
+
135
# Module-level mutable state: currently selected UI language code ("ru" or "en").
CURRENT_LANG = "ru"
137
+
138
def set_language(lang):
    """Set the module-wide UI language code used by t() for lookups."""
    global CURRENT_LANG
    CURRENT_LANG = lang
141
+
142
def t(key, **kwargs):
    """Return the translation for *key* in the active language.

    Unknown keys fall back to the key itself; keyword arguments are
    substituted into str.format placeholders when present.
    """
    text = TRANSLATIONS[CURRENT_LANG].get(key, key)
    if not kwargs:
        return text
    return text.format(**kwargs)
146
+
147
def analyze_sample_rate(files):
    """
    Inspect the sample rate of each uploaded audio file.

    Returns a formatted, human-readable report string and flags whether
    all files share the same sample rate (which matters for ensembling).
    Undecodable or unreadable files are reported per-file instead of
    aborting the whole analysis.
    """
    if not files:
        return t("error_no_files")

    results = []
    common_rate = None
    all_same = True

    for file_info in files:
        # gr.Files items expose the temp path via .name; use os.path.basename
        # instead of split('/') so Windows-style paths also display correctly.
        display_name = os.path.basename(file_info.name)
        try:
            audio = AudioSegment.from_file(file_info.name)
            rate = audio.frame_rate

            # Track whether every file shares one sample rate.
            if common_rate is None:
                common_rate = rate
            elif common_rate != rate:
                all_same = False

            results.append(f"{display_name}: {rate} Hz")

        except CouldntDecodeError:
            results.append(f"{display_name}: {t('error_unsupported_format')}")
        except Exception as e:
            results.append(f"{display_name}: {t('error_general', error=str(e))}")

    # Assemble the final report.
    header = t("analyze_title") + "\n" + "-" * 50 + "\n"
    body = "\n".join(results)
    footer = "\n" + "-" * 50 + "\n"

    if all_same and common_rate is not None:
        footer += f"\n{t('all_same_rate', rate=common_rate)}"
    elif common_rate is not None:
        footer += f"\n{t('different_rates')}"

    return header + body + footer
189
+
190
+
191
def manual_ensem(input_audios, method, weights, out_format):
    """Ensemble user-supplied audio files with the given method and weights.

    Loads every input, forces it to 2-channel (channels, samples) layout,
    zero-pads all signals to the longest length, then delegates to
    ensemble_audio_files. Returns the (output, output_wav) pair it produces.
    """
    temp_dir = tempfile.mkdtemp()
    # Tolerate whitespace around commas and trailing commas ("1.0, 1.0,").
    weights = [float(x.strip()) for x in weights.split(",") if x.strip()]
    padded_files = []

    audio_data = []
    max_length = 0
    sr = None  # set by the loop; guards NameError on an empty file list
    for file in input_audios:
        data, sr = librosa.load(file, sr=None, mono=False)
        if data.ndim == 1:
            # Duplicate a mono signal into two identical channels.
            data = np.stack([data, data])
        elif data.shape[0] != 2:
            # Assume (samples, channels) layout and flip to (channels, samples).
            data = data.T
        audio_data.append([file, data])
        max_length = max(max_length, data.shape[1])

    for file, data in audio_data:
        if data.shape[1] < max_length:
            # Zero-pad the tail so every track has the same length.
            pad_width = ((0, 0), (0, max_length - data.shape[1]))
            padded_data = np.pad(data, pad_width, mode='constant')
        else:
            padded_data = data
        # NOTE(review): writes with the sample rate of the *last* loaded file;
        # assumes all inputs share one rate — confirm upstream (analyze_sample_rate
        # only warns, it does not resample).
        sf.write(f"{file}.wav", padded_data.T, sr)
        padded_files.append(f"{file}.wav")

    a1, a2 = ensemble_audio_files(padded_files, output=os.path.join(temp_dir, f"ensemble_{method}"), ensemble_type=method, weights=weights, out_format=out_format)
    return a1, a2
218
+
219
+
220
# Fixed STFT parameters used by the spectrogram-based inverter (process_channel).
N_FFT = 2048
WIN_LENGTH = 2048
HOP_LENGTH = WIN_LENGTH // 4  # 75% window overlap
224
+
225
+
226
def load_audio(filepath):
    """Load *filepath* with librosa at its native rate.

    Returns (samples, sample_rate), or (None, None) when the path is
    missing or loading fails for any reason.
    """
    if filepath is None:
        return None, None
    try:
        loaded = librosa.load(filepath, sr=None, mono=False)
    except Exception as e:
        print(f"Ошибка загрузки аудио: {e}")
        return None, None
    return loaded
235
+
236
def process_channel(y1_ch, y2_ch, sr, method):
    """Subtract y2_ch from y1_ch on a single audio channel.

    method == "waveform": plain sample-wise subtraction.
    method == "spectrogram": magnitude spectral subtraction that keeps the
    phase of y1_ch, using the module-level N_FFT/WIN_LENGTH/HOP_LENGTH.
    Implicitly returns None for any other method value.

    Note: *sr* is accepted but not used by either branch.
    """
    if method == "waveform":
        return y1_ch - y2_ch

    elif method == "spectrogram":
        # Compute the spectrograms of both signals
        S1 = librosa.stft(y1_ch, n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH)
        S2 = librosa.stft(y2_ch, n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH)

        # Magnitude spectrograms
        mag1 = np.abs(S1)
        mag2 = np.abs(S2)

        # Spectral subtraction, clamped at zero so magnitudes stay non-negative
        mag_result = np.maximum(mag1 - mag2, 0)

        # Keep the phase information of the primary signal
        phase = np.angle(S1)

        # Recombine the result magnitude with the original phase
        S_result = mag_result * np.exp(1j * phase)

        # Inverse transform, trimmed/padded back to the input length
        return librosa.istft(
            S_result,
            n_fft=N_FFT,
            hop_length=HOP_LENGTH,
            win_length=WIN_LENGTH,
            length=len(y1_ch)
        )
267
+
268
def process_audio(audio1_path, audio2_path, out_format, method):
    """Remove *audio2_path* from *audio1_path* ("inversion").

    Loads both files, harmonizes channel layout, sample rate and length,
    subtracts per channel via process_channel, normalizes the result and
    writes it next to audio2_path in both the requested format and WAV.

    Returns:
        (inverted_path, inverted_wav_path)

    Raises:
        gr.Error: if either file cannot be loaded.
    """
    y1, sr1 = load_audio(audio1_path)
    y2, sr2 = load_audio(audio2_path)

    if sr1 is None or sr2 is None:
        raise gr.Error(t("error_both_audio"))

    # Number of channels of each signal (librosa returns (channels, samples)
    # for multichannel, 1-D for mono)
    channels1 = 1 if y1.ndim == 1 else y1.shape[0]
    channels2 = 1 if y2.ndim == 1 else y2.shape[0]

    # Normalize both to (samples, channels) layout
    if channels1 > 1:
        y1 = y1.T
    else:
        y1 = y1.reshape(-1, 1)

    if channels2 > 1:
        y2 = y2.T
    else:
        y2 = y2.reshape(-1, 1)

    # Resample the signal to remove onto the primary signal's rate.
    # FIX: the original pre-allocated a buffer with the *pre-resample* length
    # (np.zeros((len(y2), channels2))) and assigned the resampled channel into
    # it, which fails whenever resampling changes the length. Resample each
    # channel and stack instead.
    if sr1 != sr2:
        resampled_channels = [
            librosa.resample(y2[:, c], orig_sr=sr2, target_sr=sr1)
            for c in range(channels2)
        ]
        y2 = np.stack(resampled_channels, axis=1)
        sr2 = sr1

    # Trim both signals to the common length
    min_len = min(len(y1), len(y2))
    y1 = y1[:min_len]
    y2 = y2[:min_len]

    result_channels = []

    # If the primary signal is mono but the removal signal is stereo,
    # downmix the removal signal to mono
    if channels1 == 1 and channels2 > 1:
        y2 = y2.mean(axis=1, keepdims=True)
        channels2 = 1

    for c in range(channels1):
        # Channel of the primary signal
        y1_ch = y1[:, c]

        # Matching channel of the removal signal (clamped to its channel count)
        if channels2 == 1:
            y2_ch = y2[:, 0]
        else:
            y2_ch = y2[:, min(c, channels2 - 1)]

        result_ch = process_channel(y1_ch, y2_ch, sr1, method)
        result_channels.append(result_ch)

    # Reassemble the channels into one array
    if len(result_channels) > 1:
        result = np.column_stack(result_channels)
    else:
        result = np.array(result_channels[0])

    # Peak-normalize to 0.9 to prevent clipping (per channel when multichannel)
    if result.ndim > 1:
        for c in range(result.shape[1]):
            channel = result[:, c]
            max_val = np.max(np.abs(channel))
            if max_val > 0:
                result[:, c] = channel * 0.9 / max_val
    else:
        max_val = np.max(np.abs(result))
        if max_val > 0:
            result = result * 0.9 / max_val

    folder_path = os.path.dirname(audio2_path)

    # Write a WAV copy (used for chained inversions) and the requested format
    inverted_wav = os.path.join(folder_path, "inverted.wav")
    sf.write(inverted_wav, result, sr1)
    inverted = os.path.join(folder_path, f"inverted_ensemble.{out_format}")
    write_audio_file(inverted, result.T, sr1, out_format, "320k")
    return inverted, inverted_wav
363
+
364
+
365
+
366
+
367
+
368
+
369
+
370
def ensembless(input_audio, input_settings, type, out_format):
    """Run the auto-ensemble pipeline.

    For each (model, weight, stem) entry in *input_settings*: separate
    *input_audio* with that model, keep the file matching the selected stem,
    then zero-pad all selected stems to the same length and combine them with
    ensemble_audio_files.

    Args:
        input_audio: path to the source audio file.
        input_settings: iterable of ("type / name", weight, stem) tuples
            (as produced by EnsembleManager.get_settings()).
        type: ensemble method name (note: shadows the builtin ``type``).
        out_format: output audio format extension.

    Returns:
        (ensemble_path, ensemble_wav_path, all_intermediate_stem_files)
    """
    progress = gr.Progress()
    progress(0, desc=f"{t('process1')}...")

    base_name = os.path.splitext(os.path.basename(input_audio))[0]
    temp_dir = tempfile.mkdtemp()
    source_files = []     # every stem produced by every model (for display)
    output_s_files = []   # only the stems selected for the ensemble
    output_s_weights = []
    block_count = len(input_settings)

    for i, (input_model, weight, s_stem) in enumerate(input_settings):

        progress(i / block_count, desc=f"{t('process2')} {i+1}/{block_count}")

        # Entries are formatted "model_type / model_name"
        model_type, model_name = input_model.split(" / ")

        output_s_dir = os.path.join(temp_dir, f"{model_type}_{model_name}_s_stems")

        # Run separation; returns (stem_name, file_path) pairs
        output = single_multi_inference(input_audio, output_s_dir, model_type, model_name, True, vr_aggr=10, output_format="wav", output_bitrate="320k", template="MODEL_STEM", call_method="cli", selected_stems=[])

        for stem, file in output:
            source_files.append(file)
            # Keep only the stem the user picked for this model
            if stem == s_stem:
                output_s_files.append(file)
                output_s_weights.append(weight)

    progress(0.9, desc=f"{t('process3')}...")

    padded_files = []

    # Load every selected stem as (channels, samples) and find the longest
    audio_data = []
    max_length = 0
    for file in output_s_files:

        data, sr = sf.read(file)
        if data.ndim == 1:
            # Duplicate mono into two identical channels
            data = np.stack([data, data])
        elif data.shape[0] != 2:
            # sf.read returns (samples, channels); flip to (channels, samples)
            data = data.T
        audio_data.append([file, data])
        max_length = max(max_length, data.shape[1])

    # Zero-pad shorter stems in place so all tracks align
    for i, [file, data] in enumerate(audio_data):
        if data.shape[1] < max_length:
            pad_width = ((0, 0), (0, max_length - data.shape[1]))
            padded_data = np.pad(data, pad_width, mode='constant')
        else:
            padded_data = data
        sf.write(file, padded_data.T, sr)
        padded_files.append(file)

    progress(0.95, desc=f"{t('process4')}...")

    output, output_wav = ensemble_audio_files(files=output_s_files, output=os.path.join(temp_dir, f"ensemble_{base_name}_{type}"), ensemble_type=type, weights=output_s_weights, out_format=out_format)

    return output, output_wav, source_files
428
+
429
+
430
+
431
+
432
+
433
+
434
def resample_audio(audio):
    """Re-encode the uploaded file as a 44.1 kHz WAV next to the original.

    Warns the user that resampling was applied and returns the new path.
    """
    folder_path = os.path.dirname(audio)
    original_name = os.path.splitext(os.path.basename(audio))[0]
    resampled_audio = os.path.join(folder_path, f"resampled_{original_name}.wav")
    segment = AudioSegment.from_file(audio)
    segment.set_frame_rate(44100).export(resampled_audio, format="wav")
    gr.Warning(message=t("resample_warning"))
    return resampled_audio
443
+
444
# Helpers for reading the model registry
def get_model_types():
    """All registered model-type keys."""
    return [*models_data]
447
+
448
def get_models_by_type(model_type):
    """Model names registered under *model_type*; [] for an unknown type."""
    return list(models_data.get(model_type, {}))
450
+
451
def get_stems_by_model(model_type, model_name):
    """Stems offered by the given model; [] when type or name is unknown."""
    if model_type not in models_data:
        return []
    if model_name not in models_data[model_type]:
        return []
    return models_data[model_type][model_name]['stems']
455
+
456
# Registry of the models the user has queued for the auto-ensemble.
class EnsembleManager:
    """Mutable list of {type, name, stem, weight} entries plus table rendering."""

    def __init__(self):
        self.models = []

    def add_model(self, model_type, model_name, stem, weight):
        """Append one entry and return the refreshed table."""
        self.models.append({
            'type': model_type,
            'name': model_name,
            'stem': stem,
            'weight': float(weight),
        })
        return self.get_df()

    def remove_model(self, index):
        """Delete the entry at 0-based *index* if valid; return the table."""
        if 0 <= index < len(self.models):
            del self.models[index]
        return self.get_df()

    def clear_models(self):
        """Drop every entry and return the (now empty) table."""
        self.models = []
        return self.get_df()

    def get_df(self):
        """Render the current entries as a pandas DataFrame for gr.Dataframe."""
        columns = ["#", t("model_type"), t("model_name"), t("stem"), t("weight")]
        if not self.models:
            return pd.DataFrame(columns=columns)
        rows = [
            [f"{i+1}", m['type'], m['name'], m['stem'], m['weight']]
            for i, m in enumerate(self.models)
        ]
        return pd.DataFrame(rows, columns=columns)

    def get_settings(self):
        """Entries as ("type / name", weight, stem) tuples for ensembless()."""
        return [(f"{m['type']} / {m['name']}", m['weight'], m['stem']) for m in self.models]

# Shared singleton used by the Gradio event handlers below.
manager = EnsembleManager()
502
+
503
# Gradio event handlers
def update_model_dropdown(model_type):
    """Refresh the model-name dropdown after the model type changed."""
    choices = get_models_by_type(model_type)
    default = choices[0] if choices else None
    return gr.Dropdown(choices=choices, value=default)
507
+
508
def update_stem_dropdown(model_type, model_name):
    """Refresh the stem dropdown after the model selection changed."""
    choices = get_stems_by_model(model_type, model_name)
    default = choices[0] if choices else None
    return gr.Dropdown(choices=choices, value=default)
511
+
512
def add_model(model_type, model_name, stem, weight):
    """gr handler: forward to the shared manager; returns the updated table."""
    return manager.add_model(model_type, model_name, stem, weight)
514
+
515
def remove_model(index):
    """gr handler: delete the entry the user typed (1-based numbering)."""
    if index < 0:
        return manager.get_df()
    # The UI numbers entries from 1, the manager from 0.
    return manager.remove_model(index - 1)
519
+
520
def clear_all_models():
    """gr handler: wipe the ensemble list; returns an empty table."""
    return manager.clear_models()
522
+
523
def run_ensemble(input_audio, ensemble_type, output_format):
    """Validate the UI state, then run the auto-ensemble pipeline.

    Raises gr.Error when no models are queued or no audio is loaded.
    Returns (ensemble_path, ensemble_wav_path, intermediate_files).
    """
    if not manager.models:
        raise gr.Error(t("error_no_models"))
    if not input_audio:
        raise gr.Error(t("error_no_audio"))

    return ensembless(
        input_audio=input_audio,
        input_settings=manager.get_settings(),
        type=ensemble_type,
        out_format=output_format,
    )
539
+
540
def ensembless_plugin_name():
    """Display name reported to the plugin host."""
    plugin_title = "EnsembLess"
    return plugin_title
542
+
543
# Build the plugin UI
def ensembless_plugin(lang):
    """Build the EnsembLess Gradio UI in the given language.

    Three tabs: Auto-Ensemble (model queue + run + results), Manual
    Ensemble (combine uploaded files) and Inverter (subtract one audio
    from another).  NOTE(review): the Blocks instance is bound as ``demo``
    but never returned/launched here — presumably mounted by the plugin
    host; confirm against the caller.
    """
    set_language(lang)
    with gr.Blocks(title=t("app_title")) as demo:
        # Language switcher placeholder (not implemented yet)

        with gr.Tabs():
            with gr.Tab(t("auto_ensemble")):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Model-selection section
                        gr.Markdown(f"### {t('model_selection')}")
                        model_type = gr.Dropdown(
                            choices=get_model_types(),
                            label=t("model_type"),
                            value=get_model_types()[0] if get_model_types() else None,
                            filterable=False
                        )
                        model_name = gr.Dropdown(
                            choices=get_models_by_type(get_model_types()[0]),
                            label=t("model_name"),
                            interactive=True,
                            value=get_models_by_type(get_model_types()[0])[0],
                            filterable=False
                        )
                        stem = gr.Dropdown(
                            choices=get_stems_by_model(get_model_types()[0], get_models_by_type(get_model_types()[0])[0]),
                            label=t("stem_selection"),
                            interactive=True,
                            filterable=False
                        )
                        weight = gr.Slider(
                            label=t("weight"),
                            value=1.0,
                            minimum=0.1,
                            maximum=10.0,
                            step=0.1
                        )
                        add_btn = gr.Button(t("add_button"), variant="primary")

                        # Cascade: refresh models and stems when the type changes
                        model_type.change(
                            update_model_dropdown,
                            inputs=model_type,
                            outputs=model_name
                        )
                        model_name.change(
                            update_stem_dropdown,
                            inputs=[model_type, model_name],
                            outputs=stem
                        )

                    with gr.Column(scale=2):
                        # Ensemble-management section
                        gr.Markdown(f"### {t('current_ensemble')}")
                        ensemble_df = gr.Dataframe(
                            value=manager.get_df(),
                            headers=["#", t("model_type"), t("model_name"), t("stem"), t("weight")],
                            datatype=["str", "str", "str", "str", "number"],
                            interactive=False
                        )

                        with gr.Row():
                            remove_idx = gr.Number(
                                label=t("remove_index"),
                                precision=0,
                                minimum=1,
                                interactive=True
                            )
                            remove_btn = gr.Button(t("remove_button"), variant="stop")
                            clear_btn = gr.Button(t("clear_button"), variant="stop")

                # Run section
                with gr.Row(equal_height=True):
                    with gr.Column():
                        gr.Markdown(f"### {t('input_audio')}")
                        input_audio = gr.Audio(type="filepath", show_label=False)
                        # Hidden path of the 44.1 kHz resampled copy
                        input_audio_resampled = gr.Text(visible=False)

                        gr.Markdown(f"### {t('settings')}")
                        ensemble_type = gr.Dropdown(
                            choices=['avg_wave', 'median_wave', 'min_wave', 'max_wave',
                                     'avg_fft', 'median_fft', 'min_fft', 'max_fft'],
                            value='avg_fft',
                            label=t("method"),
                            filterable=False
                        )
                        output_format = gr.Dropdown(
                            choices=["wav", "mp3", "flac", "m4a", "aac", "ogg", "opus", "aiff"],
                            value="mp3",
                            label=t("output_format"),
                            filterable=False
                        )
                        run_btn = gr.Button(t("run_button"), variant="primary")

                    with gr.Tab(t('results')):

                        with gr.Column():
                            output_audio = gr.Audio(label=t("results"), type="filepath", interactive=False, show_download_button=True)
                            # Hidden WAV path used as input for the inverter below
                            output_wav = gr.Text(label="Результат в WAV", interactive=False, visible=False)

                            gr.Markdown(f"###### {t('inverted_result')}")

                            invert_method = gr.Radio(
                                choices=["waveform", "spectrogram"],
                                label=t("invert_method"),
                                value="waveform"
                            )
                            invert_btn = gr.Button(t("invert_button"))
                            inverted_output_audio = gr.Audio(label=t("inverted_result"), type="filepath", interactive=False, show_download_button=True)
                            inverted_wav = gr.Text(label="Инвертированный результат в WAV", interactive=False, visible=False)

                    with gr.Tab(t('result_source')):
                        result_source = gr.Files(interactive=False, label=t('result_source'))

                # Event handlers

                invert_btn.click(
                    process_audio,
                    inputs=[input_audio_resampled, output_wav, output_format, invert_method],
                    outputs=[inverted_output_audio, inverted_wav]
                )

                input_audio.upload(
                    resample_audio,
                    inputs=input_audio,
                    outputs=input_audio_resampled
                )

                add_btn.click(
                    add_model,
                    inputs=[model_type, model_name, stem, weight],
                    outputs=ensemble_df
                )

                remove_btn.click(
                    remove_model,
                    inputs=remove_idx,
                    outputs=ensemble_df
                )

                clear_btn.click(
                    clear_all_models,
                    outputs=ensemble_df
                )

                run_btn.click(
                    run_ensemble,
                    inputs=[input_audio_resampled, ensemble_type, output_format],
                    outputs=[output_audio, output_wav, result_source]
                )

            with gr.Tab(t("manual_ensemble")):
                with gr.Row(equal_height=True):
                    input_files = gr.Files(show_label=False, type="filepath", file_types=[".wav", ".mp3", ".flac", ".m4a", ".aac", ".ogg", ".opus", ".aiff"])
                    with gr.Column():
                        # Sample-rate analysis report for the uploaded files
                        info_audios = gr.Textbox(label="", interactive=False)
                        man_method = gr.Dropdown(
                            choices=['avg_wave', 'median_wave', 'min_wave', 'max_wave',
                                     'avg_fft', 'median_fft', 'min_fft', 'max_fft'],
                            value='avg_fft',
                            label=t("method"),
                            filterable=False
                        )

                        weights_input = gr.Textbox(label=t("weights_input"), value="1.0,1.0")

                        output_man_format = gr.Dropdown(
                            choices=["wav", "mp3", "flac", "m4a", "aac", "ogg", "opus", "aiff"],
                            value="mp3",
                            label=t("output_format"),
                            filterable=False
                        )

                        run_man_btn = gr.Button(t("run_button"), variant="primary")

                output_man_audio = gr.Audio(label=t("results"), type="filepath", interactive=False, show_download_button=True)
                output_man_wav = gr.Text(label="Результат в WAV", interactive=False, visible=False)

                input_files.upload(
                    fn=analyze_sample_rate,
                    inputs=input_files,
                    outputs=info_audios
                )

                run_man_btn.click(
                    manual_ensem,
                    inputs=[input_files, man_method, weights_input, output_man_format],
                    outputs=[output_man_audio, output_man_wav]
                )

            with gr.Tab(t("inverter")):
                with gr.Row():
                    audio1 = gr.Audio(label=t("main_audio"), type="filepath")
                    audio2 = gr.Audio(label=t("audio_to_remove"), type="filepath")
                    invert_man_method = gr.Radio(
                        choices=["waveform", "spectrogram"],
                        label=t("processing_method"),
                        value="waveform"
                    )
                    output_man_i_format = gr.Dropdown(
                        choices=["wav", "mp3", "flac", "m4a", "aac", "ogg", "opus", "aiff"],
                        value="mp3",
                        label=t("output_format"),
                        filterable=False
                    )
                    invert_man_btn = gr.Button(t("invert_button"))

                with gr.Column():
                    invert_man_output = gr.Audio(label=t("results"), interactive=False, show_download_button=True)
                    invert_man_output_wav = gr.Text(interactive=False, visible=False)

                invert_man_btn.click(
                    process_audio,
                    inputs=[audio1, audio2, output_man_i_format, invert_man_method],
                    outputs=[invert_man_output, invert_man_output_wav]
                )
medley_vox.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from datetime import datetime
4
+ import shutil
5
+ import sys
6
+ import json
7
+ import gradio as gr
8
+ from model_list import medley_vox_models
9
+ from utils.download_models import download_model
10
+ from assets.translations import MVSEPLESS_TRANSLATIONS as TRANSLATIONS
11
+
12
# Wav2vec XLSR checkpoint required by MedleyVox inference.
PRETRAIN_FILE = os.sep.join([os.getcwd(), "separator", "medley_vox", "pretrained_models", "xlsr_53_56k.pt"])
# Download it on first run only. Quote the path and URL: the URL contains
# shell glob characters ("?") and the path may contain spaces.
if not os.path.exists(PRETRAIN_FILE):
    os.system(f'wget -O "{PRETRAIN_FILE}" "https://huggingface.co/Sucial/MedleyVox-Inference-WebUI/resolve/main/pretrained/xlsr_53_56k.pt?download=true"')
15
+
16
# Default UI language code.
CURRENT_LANG = "ru"
# Cache directory for downloaded separation models.
MODELS_CACHE_DIR = os.path.join(os.getcwd(), os.path.join("separator", "models_cache"))
# Supported export formats ("ogg" is filtered out in the UI below).
OUTPUT_FORMATS = ["mp3", "wav", "flac", "ogg", "opus", "m4a", "aac", "aiff"]
# NOTE(review): hard-coded Colab-style output root — confirm for other hosts.
OUTPUT_DIR = "/content/output"
20
+
21
def set_language(lang):
    """Set the module-wide UI language code used by t() for lookups."""
    global CURRENT_LANG
    CURRENT_LANG = lang
24
+
25
def t(key, **kwargs):
    """Return the translation for *key* in the active language.

    Unknown keys fall back to the key itself; keyword arguments are
    substituted into str.format placeholders when present.
    """
    text = TRANSLATIONS[CURRENT_LANG].get(key, key)
    if not kwargs:
        return text
    return text.format(**kwargs)
29
+
30
def medley_voxer(input, output, model_name, output_format, stereo_mode):
    """Run one MedleyVox separation pass on *input* into *output*.

    Downloads the model into the cache if needed, invokes the inference
    module in a subprocess, then returns the (stem, path) pairs recorded
    in results.json, or [] when no results file was produced.
    """
    import subprocess  # local import: only this function shells out

    config_url = medley_vox_models[model_name]["config_url"]
    checkpoint_url = medley_vox_models[model_name]["checkpoint_url"]
    medley_vox_model_dir = download_model(MODELS_CACHE_DIR, model_name, "medley_vox", checkpoint_url, config_url)

    # Argument list instead of an os.system() shell string: immune to
    # spaces/quote characters in paths and model names, and runs with the
    # same interpreter as this process.
    subprocess.run([
        sys.executable, "-m", "separator.medley_vox.svs.inference",
        "--inference_data_dir", input,
        "--results_save_dir", output,
        "--model_dir", medley_vox_model_dir,
        "--exp_name", model_name,
        "--use_overlapadd=ola",
        "--stereo", stereo_mode,
        "--output_format", output_format,
    ], check=False)

    results_path = os.path.join(output, "results.json")
    if os.path.exists(results_path):
        with open(results_path) as f:
            return json.load(f)
    return []
50
+
51
def medley_voxer_gradio(input, output, model_name, output_format, stereo_mode):
    """gr handler: separate and map up to two stems onto the output players."""
    separated = medley_voxer(input, output, model_name, output_format, stereo_mode)
    updates = []
    if separated is not None:
        for stem_name, stem_path in separated[:2]:
            updates.append(gr.update(
                visible=True,
                label=stem_name,
                value=stem_path
            ))
    return tuple(updates)
62
+
63
+
64
+ ##############
65
+
66
+
67
def multi_voxer(input, output, model_name, output_format, stereo_mode, stems):
    """Recursively split a mix into *stems* vocal tracks (2, 4, 8 or 16).

    Each separation pass splits every track in two, so 4/8/16 stems need
    1/2/3 extra passes over the primary split's outputs.  Returns the
    final-level (stem, path) pairs, or None for unsupported *stems* values.

    Replaces the original triplicated nested-loop blocks for 4/8/16 with a
    single level-by-level loop (identical outputs, including the final print).
    """
    current = medley_voxer(input, output, model_name, output_format, stereo_mode)  # primary split
    if stems == 2:
        return current

    extra_passes = {4: 1, 8: 2, 16: 3}.get(stems)
    if extra_passes is None:
        # Matches the original's implicit None for unexpected stem counts.
        return None

    for _ in range(extra_passes):
        next_level = []
        for _stem, file in current:
            next_level.extend(medley_voxer(file, output, model_name, output_format, stereo_mode))
        current = next_level

    print(current)
    return current
99
+
100
+
101
+ ##############
102
+
103
def multi_voxer_gradio(input, output, model_name, output_format, stereo_mode, stems):
    """gr handler: run the multi-split and fill the 20 audio output slots."""
    separated = multi_voxer(input, output, model_name, output_format, stereo_mode, stems)
    slots = []
    if separated is not None:
        for stem_name, stem_path in separated[:20]:
            slots.append(gr.update(
                visible=True,
                label=stem_name,
                value=stem_path
            ))
    # Hide every unused player slot.
    while len(slots) < 20:
        slots.append(gr.update(visible=False, label=None, value=None))
    return tuple(slots)
118
+
119
def medley_vox_plugin_name():
    """Display name reported to the plugin host."""
    plugin_title = "Medley-Vox"
    return plugin_title
121
+
122
def medley_vox_plugin(lang):
    """Build the Medley-Vox plugin UI in the given language.

    Two tabs: single vocal separation (2 stems) and multi-split
    separation (2/4/8/16 stems).  Each run button first generates a
    timestamped output directory, then chains into the separation handler.
    """
    set_language(lang)
    # Hidden holder for the per-run output directory path
    output_dir = gr.Text(value="/content/output/", visible=False)
    with gr.Tab(t("inference")):
        with gr.Row(equal_height=True):
            with gr.Column():
                input_voice = gr.Audio(show_label=False, type="filepath", interactive=True)
            with gr.Column():
                vox_model_name = gr.Dropdown(label=t("vox_model_name"), choices=list(medley_vox_models.keys()), value=list(medley_vox_models.keys())[0], interactive=True, filterable=False)
                stereo_mode = gr.Dropdown(label=t("vox_stereo_mode"), choices=["mono", "full"], value="mono", interactive=True, filterable=False)
                # "ogg" is excluded from the export choices
                output_vox_format = gr.Dropdown(label=t("vox_output_format"), choices=list(filter(lambda fmt: fmt != "ogg", OUTPUT_FORMATS)), value="mp3", interactive=True, filterable=False)
                separate_vox_btn = gr.Button(t("separate_vocals_btn"), variant="primary")
        # Two players; only the first is visible until results arrive
        output_voxes = [gr.Audio(visible=(i == 0), interactive=False, type="filepath", show_download_button=True) for i in range(2)]

    with gr.Tab(t("vocal_multi_separation")):
        with gr.Row(equal_height=True):
            with gr.Column():
                input_vox = gr.Audio(show_label=False, type="filepath", interactive=True)
            with gr.Column():
                vox_m_model_name = gr.Dropdown(label=t("vox_model_name"), choices=list(medley_vox_models.keys()), value=list(medley_vox_models.keys())[0], interactive=True, filterable=False)
                with gr.Row():
                    stereo_m_mode = gr.Dropdown(label=t("vox_stereo_mode"), choices=["mono", "full"], value="mono", interactive=True, filterable=False)
                    count_stems = gr.Dropdown(label=t("vox_count_stems"), choices=[2, 4, 8, 16], value=2, interactive=True, filterable=False)
                output_m_vox_format = gr.Dropdown(label=t("vox_output_format"), choices=list(filter(lambda fmt: fmt != "ogg", OUTPUT_FORMATS)), value="mp3", interactive=True, filterable=False)
                separate_m_vox_btn = gr.Button(t("vox_multi_separate_btn"), variant="primary")
        # Twenty player slots for up to 16 stems; only the first starts visible
        output_m_voxes = [gr.Audio(visible=(i == 0), interactive=False, type="filepath", show_download_button=True) for i in range(20)]

    # First set a fresh timestamped output dir, then run the separation
    separate_m_vox_btn.click(fn=(lambda : os.path.join(OUTPUT_DIR, datetime.now().strftime("%Y%m%d_%H%M%S"))), inputs=None, outputs=output_dir).then(fn=multi_voxer_gradio, inputs=[input_vox, output_dir, vox_m_model_name, output_m_vox_format, stereo_m_mode, count_stems], outputs=[*output_m_voxes])

    separate_vox_btn.click(fn=(lambda : os.path.join(OUTPUT_DIR, datetime.now().strftime("%Y%m%d_%H%M%S"))), inputs=None, outputs=output_dir).then(fn=medley_voxer_gradio, inputs=[input_voice, output_dir, vox_model_name, output_vox_format, stereo_mode], outputs=output_voxes)
152
+