AnhP committed on
Commit
464d3e5
·
verified ·
1 Parent(s): 90c5266

Delete main

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. main/app/app.py +0 -524
  2. main/app/core/csrt.py +0 -72
  3. main/app/core/downloads.py +0 -208
  4. main/app/core/editing.py +0 -92
  5. main/app/core/f0_extract.py +0 -54
  6. main/app/core/inference.py +0 -441
  7. main/app/core/model_utils.py +0 -164
  8. main/app/core/presets.py +0 -166
  9. main/app/core/process.py +0 -135
  10. main/app/core/realtime.py +0 -174
  11. main/app/core/realtime_client.py +0 -114
  12. main/app/core/restart.py +0 -48
  13. main/app/core/separate.py +0 -95
  14. main/app/core/training.py +0 -265
  15. main/app/core/tts.py +0 -100
  16. main/app/core/ui.py +0 -362
  17. main/app/core/utils.py +0 -61
  18. main/app/parser.py +0 -369
  19. main/app/run_tensorboard.py +0 -32
  20. main/app/tabs/downloads/downloads.py +0 -112
  21. main/app/tabs/editing/child/audio_effects.py +0 -393
  22. main/app/tabs/editing/child/quirk.py +0 -48
  23. main/app/tabs/editing/editing.py +0 -20
  24. main/app/tabs/extra/child/convert_model.py +0 -31
  25. main/app/tabs/extra/child/create_srt.py +0 -56
  26. main/app/tabs/extra/child/f0_extract.py +0 -51
  27. main/app/tabs/extra/child/fushion.py +0 -45
  28. main/app/tabs/extra/child/read_model.py +0 -29
  29. main/app/tabs/extra/child/settings.py +0 -61
  30. main/app/tabs/extra/extra.py +0 -40
  31. main/app/tabs/inference/child/convert.py +0 -328
  32. main/app/tabs/inference/child/convert_tts.py +0 -280
  33. main/app/tabs/inference/child/convert_with_whisper.py +0 -164
  34. main/app/tabs/inference/child/separate.py +0 -263
  35. main/app/tabs/inference/inference.py +0 -30
  36. main/app/tabs/realtime/realtime.py +0 -226
  37. main/app/tabs/realtime/realtime_client.py +0 -210
  38. main/app/tabs/training/child/create_dataset.py +0 -282
  39. main/app/tabs/training/child/create_reference.py +0 -97
  40. main/app/tabs/training/child/training.py +0 -259
  41. main/app/tabs/training/training.py +0 -25
  42. main/app/variables.py +0 -117
  43. main/configs/config.json +0 -622
  44. main/configs/config.py +0 -131
  45. main/configs/rpc.py +0 -78
  46. main/configs/v1/32000.json +0 -46
  47. main/configs/v1/40000.json +0 -46
  48. main/configs/v1/48000.json +0 -46
  49. main/configs/v2/32000.json +0 -42
  50. main/configs/v2/40000.json +0 -42
main/app/app.py DELETED
@@ -1,524 +0,0 @@
1
- import os
2
- import io
3
- import ssl
4
- import sys
5
- import time
6
- import codecs
7
- import logging
8
- import warnings
9
-
10
- import gradio as gr
11
-
12
- sys.path.append(os.getcwd())
13
- start_time = time.time()
14
-
15
- from main.app.tabs.extra.extra import extra_tab
16
- from main.app.tabs.editing.editing import editing_tab
17
- from main.app.tabs.training.training import training_tab
18
- from main.app.tabs.downloads.downloads import download_tab
19
- from main.app.tabs.inference.inference import inference_tab
20
- from main.configs.rpc import connect_discord_ipc, send_discord_rpc
21
- from main.app.variables import logger, config, translations, theme, font, configs, language, allow_disk
22
-
23
- ssl._create_default_https_context = ssl._create_unverified_context
24
-
25
- warnings.filterwarnings("ignore")
26
- for l in ["httpx", "gradio", "uvicorn", "httpcore", "urllib3"]:
27
- logging.getLogger(l).setLevel(logging.ERROR)
28
-
29
- js_code = """
30
- () => {
31
- window._activeStream = null;
32
- window._audioCtx = null;
33
- window._workletNode = null;
34
- window._playbackNode = null;
35
- window._ws = null;
36
-
37
- function setStatus(msg, use_alert = true) {
38
- const realtimeStatus = document.querySelector("#realtime-status-info h2.output-class");
39
- if (use_alert) alert(msg);
40
-
41
- if (realtimeStatus) {
42
- realtimeStatus.innerText = msg;
43
- realtimeStatus.style.whiteSpace = "nowrap";
44
- realtimeStatus.style.textAlign = "center";
45
- }
46
- }
47
-
48
- async function addModuleFromString(ctx, codeStr) {
49
- const blob = new Blob([codeStr], {type: 'application/javascript'});
50
- const url = URL.createObjectURL(blob);
51
-
52
- await ctx.audioWorklet.addModule(url);
53
- URL.revokeObjectURL(url);
54
- };
55
-
56
- function createOutputRoute(audioCtx, playbackNode, sinkId, gainValue = 1.0) {
57
- const dest = audioCtx.createMediaStreamDestination();
58
- const gainNode = audioCtx.createGain();
59
- gainNode.gain.value = gainValue;
60
-
61
- playbackNode.connect(gainNode);
62
- gainNode.connect(dest);
63
-
64
- const el = document.createElement('audio');
65
- el.autoplay = true;
66
- el.srcObject = dest.stream;
67
- el.style.display = 'none';
68
- document.body.appendChild(el);
69
-
70
- if (el.setSinkId) el.setSinkId(sinkId).catch(err => console.error(err));
71
- return { dest, gainNode, el };
72
- }
73
-
74
- const inputWorkletSource = `
75
- class InputProcessor extends AudioWorkletProcessor {
76
- constructor() {
77
- super();
78
- this.buffer = new Float32Array(0);
79
- this.block_frame = 128;
80
- this.port.onmessage = (e) => {
81
- if (e.data && e.data.block_frame) this.block_frame = e.data.block_frame;
82
- };
83
- }
84
-
85
- process(inputs) {
86
- const input = inputs[0];
87
- if (!input || !input[0]) return true;
88
- const frame = input[0];
89
-
90
- const newBuf = new Float32Array(this.buffer.length + frame.length);
91
- newBuf.set(this.buffer, 0);
92
- newBuf.set(frame, this.buffer.length);
93
- this.buffer = newBuf;
94
-
95
- while (this.buffer.length >= this.block_frame) {
96
- const chunk = this.buffer.slice(0, this.block_frame);
97
-
98
- this.port.postMessage({chunk}, [chunk.buffer]);
99
- this.buffer = this.buffer.slice(this.block_frame);
100
- }
101
-
102
- return true;
103
- }
104
- }
105
- registerProcessor('input-processor', InputProcessor);
106
- `;
107
-
108
- const playbackWorkletSource = `
109
- class PlaybackProcessor extends AudioWorkletProcessor {
110
- constructor(options) {
111
- super(options);
112
- const bufferSize = options.processorOptions && options.processorOptions.bufferSize ? options.processorOptions.bufferSize: 98304;
113
- this.buffer = new Float32Array(bufferSize);
114
- this.bufferCapacity = bufferSize;
115
- this.writePointer = 0;
116
- this.readPointer = 0;
117
- this.availableSamples = 0;
118
- this.port.onmessage = (e) => {
119
- if (e.data && e.data.chunk) {
120
- const chunk = new Float32Array(e.data.chunk);
121
- const chunkSize = chunk.length;
122
-
123
- if (this.availableSamples + chunkSize > this.bufferCapacity) return;
124
-
125
- for (let i = 0; i < chunkSize; i++) {
126
- this.buffer[this.writePointer] = chunk[i];
127
- this.writePointer = (this.writePointer + 1) % this.bufferCapacity;
128
- }
129
-
130
- this.availableSamples += chunkSize;
131
- }
132
- };
133
- }
134
-
135
- process(inputs, outputs) {
136
- const output = outputs[0];
137
- if (!output || !output[0]) return true;
138
-
139
- const frame = output[0];
140
- const frameSize = frame.length;
141
-
142
- if (this.availableSamples >= frameSize) {
143
- for (let i = 0; i < frameSize; i++) {
144
- frame[i] = this.buffer[this.readPointer];
145
- this.readPointer = (this.readPointer + 1) % this.bufferCapacity;
146
- }
147
- this.availableSamples -= frameSize;
148
- } else {
149
- frame.fill(0);
150
- }
151
-
152
- if (output.length > 1) output[1].set(output[0]);
153
- return true;
154
- }
155
- }
156
- registerProcessor('playback-processor', PlaybackProcessor);
157
- `;
158
-
159
- window.getAudioDevices = async function() {
160
- if (!navigator.mediaDevices) {
161
- setStatus("__MEDIA_DEVICES__");
162
- return {"inputs": {}, "outputs": {}};
163
- }
164
-
165
- try {
166
- await navigator.mediaDevices.getUserMedia({ audio: true });
167
- } catch (err) {
168
- console.error(err);
169
- setStatus("__MIC_INACCESSIBLE__")
170
-
171
- return {"inputs": {}, "outputs": {}};
172
- }
173
-
174
- const devices = await navigator.mediaDevices.enumerateDevices();
175
- const inputs = {};
176
- const outputs = {};
177
-
178
- for (const device of devices) {
179
- if (device.kind === "audioinput") {
180
- inputs[device.label] = device.deviceId
181
- } else if (device.kind === "audiooutput") {
182
- outputs[device.label] = device.deviceId
183
- }
184
- }
185
-
186
- if (!Object.keys(inputs).length && !Object.keys(outputs).length) return {"inputs": {}, "outputs": {}};
187
- return {"inputs": inputs, "outputs": outputs};
188
- };
189
-
190
- window.StreamAudioRealtime = async function(
191
- monitor,
192
- vad_enabled,
193
- input_audio_device,
194
- output_audio_device,
195
- monitor_output_device,
196
- input_audio_gain,
197
- output_audio_gain,
198
- monitor_audio_gain,
199
- chunk_size,
200
- pitch,
201
- model_pth,
202
- model_index,
203
- index_strength,
204
- onnx_f0_mode,
205
- f0_method,
206
- hop_length,
207
- embed_mode,
208
- embedders,
209
- custom_embedders,
210
- f0_autotune,
211
- proposal_pitch,
212
- f0_autotune_strength,
213
- proposal_pitch_threshold,
214
- rms_mix_rate,
215
- protect,
216
- filter_radius,
217
- silent_threshold,
218
- extra_convert_size,
219
- cross_fade_overlap_size,
220
- vad_sensitivity,
221
- vad_frame_ms,
222
- clean_audio,
223
- clean_strength
224
- ) {
225
- const SampleRate = 48000;
226
- const ReadChunkSize = Math.round(chunk_size * SampleRate / 1000 / 128);
227
- const block_frame = parseInt(ReadChunkSize) * 128;
228
- const ButtonState = { start_button: true, stop_button: false };
229
- const devices = await window.getAudioDevices();
230
-
231
- input_audio_device = devices["inputs"][input_audio_device];
232
- output_audio_device = devices["outputs"][output_audio_device];
233
- if (monitor && devices["outputs"][monitor_output_device]) monitor_output_device = devices["outputs"][monitor_output_device];
234
-
235
- try {
236
- if (!input_audio_device || !output_audio_device) {
237
- setStatus("__PROVIDE_AUDIO_DEVICE__");
238
- return ButtonState;
239
- }
240
-
241
- if (monitor && !monitor_output_device) {
242
- setStatus("__PROVIDE_MONITOR_DEVICE__");
243
- return ButtonState;
244
- }
245
-
246
- if (!model_pth) {
247
- setStatus("__PROVIDE_MODEL__")
248
- return ButtonState;
249
- }
250
-
251
- setStatus("__START_REALTIME__", use_alert=false)
252
-
253
- const stream = await navigator.mediaDevices.getUserMedia({
254
- audio: {
255
- deviceId: { exact: input_audio_device },
256
- channelCount: 1,
257
- sampleRate: SampleRate,
258
- echoCancellation: false,
259
- noiseSuppression: false,
260
- autoGainControl: false
261
- }
262
- });
263
-
264
- window._activeStream = stream;
265
- window._audioCtx = new AudioContext({ sampleRate: SampleRate, latencyHint: "interactive" });
266
-
267
- await addModuleFromString(window._audioCtx, inputWorkletSource);
268
- await addModuleFromString(window._audioCtx, playbackWorkletSource);
269
-
270
- const src = window._audioCtx.createMediaStreamSource(stream);
271
- const inputNode = new AudioWorkletNode(window._audioCtx, 'input-processor');
272
- const playbackNode = new AudioWorkletNode(window._audioCtx, 'playback-processor', {
273
- processorOptions: {
274
- bufferSize: block_frame * 2
275
- }
276
- });
277
-
278
- inputNode.port.postMessage({ block_frame: block_frame });
279
- src.connect(inputNode);
280
-
281
- createOutputRoute(window._audioCtx, playbackNode, output_audio_device, output_audio_gain / 100);
282
- if (monitor && monitor_output_device) createOutputRoute(window._audioCtx, playbackNode, monitor_output_device, monitor_audio_gain / 100);
283
-
284
- const protocol = (location.protocol === "https:") ? "wss:" : "ws:";
285
- const wsUrl = protocol + '//' + location.hostname + `:${location.port}` + '/api/ws-audio';
286
- const ws = new WebSocket(wsUrl);
287
-
288
- ButtonState.start_button = false;
289
- ButtonState.stop_button = true;
290
-
291
- ws.binaryType = "arraybuffer";
292
- window._ws = ws;
293
-
294
- ws.onopen = () => {
295
- console.log("__WS_CONNECTED__")
296
-
297
- ws.send(
298
- JSON.stringify({
299
- type: 'init',
300
- chunk_size: ReadChunkSize,
301
- embedders: embedders,
302
- model_pth: model_pth,
303
- custom_embedders: custom_embedders,
304
- cross_fade_overlap_size: cross_fade_overlap_size,
305
- extra_convert_size: extra_convert_size,
306
- model_index: model_index,
307
- f0_method: f0_method,
308
- f0_onnx: onnx_f0_mode,
309
- embedders_mode: embed_mode,
310
- hop_length: hop_length,
311
- silent_threshold: silent_threshold,
312
- vad_enabled: vad_enabled,
313
- vad_sensitivity: vad_sensitivity,
314
- vad_frame_ms: vad_frame_ms,
315
- clean_audio: clean_audio,
316
- clean_strength: clean_strength,
317
- f0_up_key: pitch,
318
- index_rate: index_strength,
319
- protect: protect,
320
- filter_radius: filter_radius,
321
- rms_mix_rate: rms_mix_rate,
322
- f0_autotune: f0_autotune,
323
- f0_autotune_strength: f0_autotune_strength,
324
- proposal_pitch: proposal_pitch,
325
- proposal_pitch_threshold: proposal_pitch_threshold,
326
- input_audio_gain: input_audio_gain
327
- })
328
- );
329
- };
330
-
331
- inputNode.port.onmessage = (e) => {
332
- const chunk = e.data && e.data.chunk;
333
-
334
- if (!chunk) return;
335
- if (ws.readyState === WebSocket.OPEN) ws.send(chunk);
336
- };
337
-
338
- ws.onmessage = (ev) => {
339
- if (typeof ev.data === 'string') {
340
- const msg = JSON.parse(ev.data);
341
-
342
- if (msg.type === 'latency') setStatus(`__LATENCY__: ${msg.value.toFixed(1)} ms`, use_alert=false)
343
- if (msg.type === 'warnings') {
344
- setStatus(msg.value);
345
- StopAudioStream();
346
- }
347
-
348
- return;
349
- }
350
-
351
- const ab = ev.data;
352
- playbackNode.port.postMessage({ chunk: ab }, [ab]);
353
- };
354
-
355
- ws.onclose = () => console.log("__WS_CLOSED__");
356
- window._workletNode = inputNode;
357
- window._playbackNode = playbackNode;
358
-
359
- if (window._audioCtx.state === 'suspended') await window._audioCtx.resume();
360
-
361
- console.log("__REALTIME_STARTED__");
362
- return ButtonState;
363
- } catch (err) {
364
- console.error("__ERROR__", err);
365
- alert("__ERROR__" + err.message);
366
-
367
- return StopAudioStream();
368
- }
369
- };
370
-
371
- window.StopAudioStream = async function() {
372
- try {
373
- if (window._ws) {
374
- window._ws.close();
375
- window._ws = null;
376
- }
377
-
378
- if (window._activeStream) {
379
- window._activeStream.getTracks().forEach(t => t.stop());
380
- window._activeStream = null;
381
- }
382
-
383
- if (window._workletNode) {
384
- window._workletNode.disconnect();
385
- window._workletNode = null;
386
- }
387
-
388
- if (window._playbackNode) {
389
- window._playbackNode.disconnect();
390
- window._playbackNode = null;
391
- }
392
-
393
- if (window._audioCtx) {
394
- await window._audioCtx.close();
395
- window._audioCtx = null;
396
- }
397
-
398
- document.querySelectorAll('audio').forEach(a => a.remove());
399
- setStatus("__REALTIME_HAS_STOP__", use_alert=false);
400
-
401
- return {"start_button": true, "stop_button": false};
402
- } catch (e) {
403
- setStatus(`__ERROR__ ${e}`);
404
-
405
- return {"start_button": false, "stop_button": true}
406
- }
407
- };
408
- }
409
- """.replace(
410
- "__MEDIA_DEVICES__", translations["media_devices"]
411
- ).replace(
412
- "__MIC_INACCESSIBLE__", translations["mic_inaccessible"]
413
- ).replace(
414
- "__PROVIDE_AUDIO_DEVICE__", translations["provide_audio_device"]
415
- ).replace(
416
- "__PROVIDE_MONITOR_DEVICE__", translations["provide_monitor_device"]
417
- ).replace(
418
- "__START_REALTIME__", translations["start_realtime"]
419
- ).replace(
420
- "__LATENCY__", translations['latency']
421
- ).replace(
422
- "__WS_CONNECTED__", translations["ws_connected"]
423
- ).replace(
424
- "__WS_CLOSED__", translations["ws_closed"]
425
- ).replace(
426
- "__REALTIME_STARTED__", translations["realtime_is_ready"]
427
- ).replace(
428
- "__ERROR__", translations["error_occurred"].format(e="")
429
- ).replace(
430
- "__REALTIME_HAS_STOP__", translations["realtime_has_stop"]
431
- ).replace(
432
- "__PROVIDE_MODEL__", translations["provide_file"].format(filename=translations["model"])
433
- )
434
-
435
- client_mode = True # "--client" in sys.argv
436
-
437
- with gr.Blocks(
438
- title="📱 Vietnamese-RVC GUI BY ANH",
439
- js=js_code if client_mode else None,
440
- theme=theme,
441
- css="<style> @import url('{fonts}'); * {{font-family: 'Courgette', cursive !important;}} body, html {{font-family: 'Courgette', cursive !important;}} h1, h2, h3, h4, h5, h6, p, button, input, textarea, label, span, div, select {{font-family: 'Courgette', cursive !important;}} </style>".format(fonts=font or "https://fonts.googleapis.com/css2?family=Courgette&display=swap")
442
- ) as app:
443
- gr.HTML("<h1 style='text-align: center;'>🎵VIETNAMESE RVC BY ANH🎵</h1>")
444
- gr.HTML(f"<h3 style='text-align: center;'>{translations['title']}</h3>")
445
-
446
- with gr.Tabs():
447
- inference_tab()
448
- editing_tab()
449
-
450
- if client_mode:
451
- from main.app.tabs.realtime.realtime_client import realtime_client_tab
452
- realtime_client_tab()
453
- else:
454
- from main.app.tabs.realtime.realtime import realtime_tab
455
- realtime_tab()
456
-
457
- training_tab()
458
- download_tab()
459
- extra_tab(app)
460
-
461
- with gr.Row():
462
- gr.Markdown(translations["rick_roll"].format(rickroll=codecs.decode('uggcf://jjj.lbhghor.pbz/jngpu?i=qDj4j9JtKpD', 'rot13')))
463
-
464
- with gr.Row():
465
- gr.Markdown(translations["terms_of_use"])
466
-
467
- with gr.Row():
468
- gr.Markdown(translations["exemption"])
469
-
470
- if __name__ == "__main__":
471
- logger.info(config.device.replace("privateuseone", "dml"))
472
- logger.info(translations["start_app"])
473
- logger.info(translations["set_lang"].format(lang=language))
474
-
475
- port = configs.get("app_port", 7860)
476
- server_name = configs.get("server_name", "0.0.0.0")
477
- share = "--share" in sys.argv
478
-
479
- original_stdout = sys.stdout
480
- sys.stdout = io.StringIO()
481
-
482
- for i in range(configs.get("num_of_restart", 5)):
483
- try:
484
- gradio_app, _, share_url = app.queue().launch(
485
- favicon_path=configs["ico_path"],
486
- server_name=server_name,
487
- server_port=port,
488
- show_error=configs.get("app_show_error", False),
489
- inbrowser="--open" in sys.argv,
490
- share=share,
491
- allowed_paths=allow_disk,
492
- prevent_thread_lock=True,
493
- quiet=True
494
- )
495
- break
496
- except OSError:
497
- logger.debug(translations["port"].format(port=port))
498
- port -= 1
499
- except Exception as e:
500
- logger.error(translations["error_occurred"].format(e=e))
501
- sys.exit(1)
502
-
503
- if client_mode:
504
- from main.app.core.realtime_client import app as fastapi_app
505
- gradio_app.mount("/api", fastapi_app)
506
-
507
- sys.stdout = original_stdout
508
-
509
- if configs.get("discord_presence", True):
510
- pipe = connect_discord_ipc()
511
- if pipe:
512
- try:
513
- logger.info(translations["start_rpc"])
514
- send_discord_rpc(pipe)
515
- except KeyboardInterrupt:
516
- logger.info(translations["stop_rpc"])
517
- pipe.close()
518
-
519
- logger.info(f"{translations['running_local_url']}: {server_name}:{port}")
520
- if share: logger.info(f"{translations['running_share_url']}: {share_url}")
521
- logger.info(f"{translations['gradio_start']}: {(time.time() - start_time):.2f}s")
522
-
523
- while 1:
524
- time.sleep(5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/csrt.py DELETED
@@ -1,72 +0,0 @@
1
- import os
2
- import sys
3
-
4
- sys.path.append(os.getcwd())
5
-
6
- from main.app.core.inference import whisper_process
7
- from main.library.utils import check_spk_diarization
8
- from main.app.core.ui import gr_info, gr_warning, process_output
9
- from main.app.variables import config, translations, configs, logger
10
-
11
- def create_srt(model_size, input_audio, output_file, word_timestamps):
12
- import multiprocessing as mp
13
-
14
- if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio):
15
- gr_warning(translations["input_not_valid"])
16
- return [None]*2
17
-
18
- if not output_file.endswith(".srt"): output_file += ".srt"
19
-
20
- if not output_file:
21
- gr_warning(translations["output_not_valid"])
22
- return [None]*2
23
-
24
- output_dir = os.path.dirname(output_file)
25
- if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
26
-
27
- info = ""
28
- output_file = process_output(output_file)
29
-
30
- check_spk_diarization(model_size, speechbrain=False)
31
- gr_info(translations["csrt"])
32
-
33
- try:
34
- mp.set_start_method("spawn")
35
- except:
36
- pass
37
-
38
- whisper_queue = mp.Queue()
39
- whisperprocess = mp.Process(target=whisper_process, args=(model_size, input_audio, configs, config.device, whisper_queue, word_timestamps))
40
- whisperprocess.start()
41
-
42
- segments = whisper_queue.get()
43
-
44
- with open(output_file, "w", encoding="utf-8") as f:
45
- for i, segment in enumerate(segments):
46
- start = segment["start"]
47
- end = segment["end"]
48
- text = segment["text"].strip()
49
-
50
- index = f"{i+1}\n"
51
- timestamp = f"{format_timestamp(start)} --> {format_timestamp(end)}\n"
52
- text1 = f"{text}\n\n"
53
-
54
- f.write(index)
55
- f.write(timestamp)
56
- f.write(text1)
57
-
58
- info = info + index + timestamp + text1
59
- logger.info(info)
60
-
61
- gr_info(translations["success"])
62
-
63
- return [{"value": output_file, "visible": True, "__type__": "update"}, info]
64
-
65
- def format_timestamp(seconds):
66
- hours = int(seconds // 3600)
67
- minutes = int((seconds % 3600) // 60)
68
-
69
- seconds = int(seconds % 60)
70
- miliseconds = int((seconds - int(seconds)) * 1000)
71
-
72
- return f"{hours:02}:{minutes:02}:{seconds:02},{miliseconds:03}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/downloads.py DELETED
@@ -1,208 +0,0 @@
1
- import os
2
- import re
3
- import sys
4
- import json
5
- import codecs
6
- import shutil
7
- import yt_dlp
8
- import warnings
9
- import requests
10
-
11
- from bs4 import BeautifulSoup
12
-
13
- sys.path.append(os.getcwd())
14
-
15
- from main.tools import huggingface, gdown, meganz, mediafire, pixeldrain
16
- from main.app.variables import logger, translations, model_options, configs
17
- from main.app.core.process import move_files_from_directory, fetch_pretrained_data, extract_name_model
18
- from main.app.core.ui import gr_info, gr_warning, gr_error, process_output, replace_url, replace_modelname
19
-
20
- def download_url(url):
21
- if not url:
22
- gr_warning(translations["provide_url"])
23
- return [None]*3
24
-
25
- if not os.path.exists(configs["audios_path"]): os.makedirs(configs["audios_path"], exist_ok=True)
26
-
27
- with warnings.catch_warnings():
28
- warnings.filterwarnings("ignore")
29
- ydl_opts = {
30
- "format": "bestaudio/best",
31
- "postprocessors": [{
32
- "key": "FFmpegExtractAudio",
33
- "preferredcodec": "wav",
34
- "preferredquality": "192"
35
- }],
36
- "quiet": True,
37
- "no_warnings": True,
38
- "noplaylist": True,
39
- "verbose": False
40
- }
41
-
42
- gr_info(translations["start"].format(start=translations["download_music"]))
43
-
44
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
45
- audio_output = os.path.join(configs["audios_path"], re.sub(r'\s+', '-', re.sub(r'[^\w\s\u4e00-\u9fff\uac00-\ud7af\u0400-\u04FF\u1100-\u11FF]', '', ydl.extract_info(url, download=False).get('title', 'video')).strip()))
46
- if os.path.exists(audio_output): shutil.rmtree(audio_output, ignore_errors=True)
47
-
48
- ydl_opts['outtmpl'] = audio_output
49
-
50
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
51
- audio_output = process_output(audio_output + ".wav")
52
-
53
- ydl.download([url])
54
-
55
- gr_info(translations["success"])
56
- return [audio_output, audio_output, translations["success"]]
57
-
58
- def move_file(file, download_dir, model):
59
- weights_dir = configs["weights_path"]
60
- logs_dir = configs["logs_path"]
61
-
62
- if not os.path.exists(weights_dir): os.makedirs(weights_dir, exist_ok=True)
63
- if not os.path.exists(logs_dir): os.makedirs(logs_dir, exist_ok=True)
64
-
65
- if file.endswith(".zip"): shutil.unpack_archive(file, download_dir)
66
- move_files_from_directory(download_dir, weights_dir, logs_dir, model)
67
-
68
- def download_model(url=None, model=None):
69
- if not url: return gr_warning(translations["provide_url"])
70
-
71
- url = replace_url(url)
72
- download_dir = "download_model"
73
-
74
- os.makedirs(download_dir, exist_ok=True)
75
-
76
- try:
77
- gr_info(translations["start"].format(start=translations["download"]))
78
-
79
- if "huggingface.co" in url: file = huggingface.HF_download_file(url, download_dir)
80
- elif "google.com" in url: file = gdown.gdown_download(url, download_dir)
81
- elif "mediafire.com" in url: file = mediafire.Mediafire_Download(url, download_dir)
82
- elif "pixeldrain.com" in url: file = pixeldrain.pixeldrain(url, download_dir)
83
- elif "mega.nz" in url: file = meganz.mega_download_url(url, download_dir)
84
- else:
85
- gr_warning(translations["not_support_url"])
86
- return translations["not_support_url"]
87
-
88
- if not model:
89
- modelname = os.path.basename(file)
90
- model = extract_name_model(modelname) if modelname.endswith(".index") else os.path.splitext(modelname)[0]
91
- if model is None: model = os.path.splitext(modelname)[0]
92
-
93
- model = replace_modelname(model)
94
-
95
- move_file(file, download_dir, model)
96
- gr_info(translations["success"])
97
-
98
- return translations["success"]
99
- except Exception as e:
100
- gr_error(message=translations["error_occurred"].format(e=e))
101
- return translations["error_occurred"].format(e=e)
102
- finally:
103
- shutil.rmtree(download_dir, ignore_errors=True)
104
-
105
- def download_pretrained_model(choices, model, sample_rate):
106
- pretraineds_custom_path = configs["pretrained_custom_path"]
107
-
108
- if choices == translations["list_model"]:
109
- paths = fetch_pretrained_data()[model][sample_rate]
110
-
111
- if not os.path.exists(pretraineds_custom_path): os.makedirs(pretraineds_custom_path, exist_ok=True)
112
- url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_phfgbz/", "rot13") + paths
113
-
114
- gr_info(translations["download_pretrain"])
115
- file = huggingface.HF_download_file(replace_url(url), os.path.join(pretraineds_custom_path, paths))
116
-
117
- if file.endswith(".zip"):
118
- shutil.unpack_archive(file, pretraineds_custom_path)
119
- os.remove(file)
120
-
121
- gr_info(translations["success"])
122
- return translations["success"]
123
- elif choices == translations["download_url"]:
124
- pretrain_is_zip = model.endswith(".zip") or model.endswith(".zip?download=true") or sample_rate.endswith(".zip") or sample_rate.endswith(".zip?download=true")
125
- urls = []
126
-
127
- if not model and not pretrain_is_zip:
128
- gr_warning(translations["provide_pretrain"].format(dg="D"))
129
- return [None]*2
130
-
131
- if not sample_rate and not pretrain_is_zip:
132
- gr_warning(translations["provide_pretrain"].format(dg="G"))
133
- return [None]*2
134
-
135
- gr_info(translations["download_pretrain"])
136
-
137
- if model: urls.append(model)
138
- if sample_rate: urls.append(sample_rate)
139
-
140
- for url in urls:
141
- url = replace_url(url)
142
-
143
- if "huggingface.co" in url: file = huggingface.HF_download_file(url, pretraineds_custom_path)
144
- elif "google.com" in url: file = gdown.gdown_download(url, pretraineds_custom_path)
145
- elif "mediafire.com" in url: file = mediafire.Mediafire_Download(url, pretraineds_custom_path)
146
- elif "pixeldrain.com" in url: file = pixeldrain.pixeldrain(url, pretraineds_custom_path)
147
- elif "mega.nz" in url: file = meganz.mega_download_url(url, pretraineds_custom_path)
148
- else:
149
- gr_warning(translations["not_support_url"])
150
- return translations["not_support_url"], translations["not_support_url"]
151
-
152
- if file.endswith(".zip"):
153
- shutil.unpack_archive(file, pretraineds_custom_path)
154
- if os.path.exists(file): os.remove(file)
155
-
156
- gr_info(translations["success"])
157
- return translations["success"], translations["success"]
158
-
159
- def fetch_models_data(search):
160
- all_table_data = []
161
- page = 1
162
-
163
- while 1:
164
- try:
165
- response = requests.post(url=codecs.decode("uggcf://ibvpr-zbqryf.pbz/srgpu_qngn.cuc", "rot13"), data={"page": page, "search": search})
166
-
167
- if response.status_code == 200:
168
- table_data = response.json().get("table", "")
169
- if not table_data.strip(): break
170
-
171
- all_table_data.append(table_data)
172
- page += 1
173
- else:
174
- logger.debug(f"{translations['code_error']} {response.status_code}")
175
- break
176
- except json.JSONDecodeError:
177
- logger.debug(translations["json_error"])
178
- break
179
- except requests.RequestException as e:
180
- logger.debug(translations["requests_error"].format(e=e))
181
- break
182
-
183
- return all_table_data
184
-
185
- def search_models(name):
186
- if not name:
187
- gr_warning(translations["provide_name"])
188
- return [None]*2
189
-
190
- gr_info(translations["start"].format(start=translations["search"]))
191
-
192
- tables = fetch_models_data(name)
193
-
194
- if len(tables) == 0:
195
- gr_info(translations["not_found"].format(name=name))
196
- return [None]*2
197
- else:
198
- model_options.clear()
199
-
200
- for table in tables:
201
- for row in BeautifulSoup(table, "html.parser").select("tr"):
202
- name_tag, url_tag = row.find("a", {"class": "fs-5"}), row.find("a", {"class": "btn btn-sm fw-bold btn-light ms-0 p-1 ps-2 pe-2"})
203
- url = url_tag["href"].replace("https://easyaivoice.com/run?url=", "")
204
- if "huggingface" in url:
205
- if name_tag and url_tag: model_options[replace_modelname(name_tag.text)] = url
206
-
207
- gr_info(translations["found"].format(results=len(model_options)))
208
- return [{"value": "", "choices": model_options, "interactive": True, "visible": True, "__type__": "update"}, {"value": translations["downloads"], "visible": True, "__type__": "update"}]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/editing.py DELETED
@@ -1,92 +0,0 @@
1
- import os
2
- import sys
3
- import random
4
- import subprocess
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.variables import python, translations, configs
9
- from main.app.core.ui import gr_info, gr_warning, process_output, replace_export_format
10
-
11
def audio_effects(input_path, output_path, resample, resample_sr, chorus_depth, chorus_rate, chorus_mix, chorus_delay, chorus_feedback, distortion_drive, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift, delay_seconds, delay_feedback, delay_mix, compressor_threshold, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold, limiter_release, gain_db, bitcrush_bit_depth, clipping_threshold, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost_db, bass_boost_frequency, treble_boost_db, treble_boost_frequency, fade_in_duration, fade_out_duration, export_format, chorus, distortion, reverb, delay, compressor, limiter, gain, bitcrush, clipping, phaser, treble_bass_boost, fade_in_out, audio_combination, audio_combination_input, main_vol, combine_vol):
    """
    Validate the paths and run the audio-effects script in a subprocess.

    Every effect parameter is forwarded unchanged as a command-line option of the
    script at configs["audio_effects_path"]; "--pitchshift" is derived from
    `pitch_shift != 0`.

    Args:
        input_path: Existing audio file to process.
        output_path: Destination file, or an existing directory (the file is then
            named "audio_effects.<export_format>").
        export_format: Output format, also used for the returned file name.
        (remaining parameters): effect settings and on/off switches, passed through.

    Returns:
        The output path after `replace_export_format`, or None when the input or
        output path is invalid.
    """
    if not input_path or not os.path.exists(input_path) or os.path.isdir(input_path):
        gr_warning(translations["input_not_valid"])
        return None

    if not output_path:
        gr_warning(translations["output_not_valid"])
        return None

    if os.path.isdir(output_path): output_path = os.path.join(output_path, f"audio_effects.{export_format}")

    # Only create the parent directory. Falling back to output_path itself when
    # there is no directory part would create a *directory* named like the file.
    output_dir = os.path.dirname(output_path)
    if output_dir: os.makedirs(output_dir, exist_ok=True)

    output_path = process_output(output_path)

    gr_info(translations["start"].format(start=translations["apply_effect"]))

    command = [
        python, configs["audio_effects_path"],
        "--input_path", input_path,
        "--output_path", output_path,
        "--resample", str(resample),
        "--resample_sr", str(resample_sr),
        "--chorus_depth", str(chorus_depth),
        "--chorus_rate", str(chorus_rate),
        "--chorus_mix", str(chorus_mix),
        "--chorus_delay", str(chorus_delay),
        "--chorus_feedback", str(chorus_feedback),
        "--drive_db", str(distortion_drive),
        "--reverb_room_size", str(reverb_room_size),
        "--reverb_damping", str(reverb_damping),
        "--reverb_wet_level", str(reverb_wet_level),
        "--reverb_dry_level", str(reverb_dry_level),
        "--reverb_width", str(reverb_width),
        "--reverb_freeze_mode", str(reverb_freeze_mode),
        "--pitch_shift", str(pitch_shift),
        "--delay_seconds", str(delay_seconds),
        "--delay_feedback", str(delay_feedback),
        "--delay_mix", str(delay_mix),
        "--compressor_threshold", str(compressor_threshold),
        "--compressor_ratio", str(compressor_ratio),
        "--compressor_attack_ms", str(compressor_attack_ms),
        "--compressor_release_ms", str(compressor_release_ms),
        "--limiter_threshold", str(limiter_threshold),
        "--limiter_release", str(limiter_release),
        "--gain_db", str(gain_db),
        "--bitcrush_bit_depth", str(bitcrush_bit_depth),
        "--clipping_threshold", str(clipping_threshold),
        "--phaser_rate_hz", str(phaser_rate_hz),
        "--phaser_depth", str(phaser_depth),
        "--phaser_centre_frequency_hz", str(phaser_centre_frequency_hz),
        "--phaser_feedback", str(phaser_feedback),
        "--phaser_mix", str(phaser_mix),
        "--bass_boost_db", str(bass_boost_db),
        "--bass_boost_frequency", str(bass_boost_frequency),
        "--treble_boost_db", str(treble_boost_db),
        "--treble_boost_frequency", str(treble_boost_frequency),
        "--fade_in_duration", str(fade_in_duration),
        "--fade_out_duration", str(fade_out_duration),
        "--export_format", export_format,
        "--chorus", str(chorus),
        "--distortion", str(distortion),
        "--reverb", str(reverb),
        "--pitchshift", str(pitch_shift != 0),
        "--delay", str(delay),
        "--compressor", str(compressor),
        "--limiter", str(limiter),
        "--gain", str(gain),
        "--bitcrush", str(bitcrush),
        "--clipping", str(clipping),
        "--phaser", str(phaser),
        "--treble_bass_boost", str(treble_bass_boost),
        "--fade_in_out", str(fade_in_out),
        "--audio_combination", str(audio_combination),
        # argv items must be strings: a missing combination input (None) would
        # make subprocess.run raise TypeError, so it is sent as an empty string.
        "--audio_combination_input", "" if audio_combination_input is None else str(audio_combination_input),
        "--main_volume", str(main_vol),
        "--combination_volume", str(combine_vol),
    ]
    subprocess.run(command)

    gr_info(translations["success"])
    return replace_export_format(output_path, export_format)
32
-
33
def apply_voice_quirk(audio_path, mode, output_path, export_format):
    """
    Apply one of the numbered "quirk" transformations to an audio file.

    Args:
        audio_path: Existing audio file to transform.
        mode: Key into translations["quirk_choice"], which yields the effect
            number; 0 selects a random effect between 1 and 16.
        output_path: Destination file, or an existing directory (the file is then
            named "audio_quirk.<export_format>").
        export_format: Passed to `replace_export_format` and to `soundfile.write`.

    Returns:
        The path of the written file, or None when the input or output path is
        invalid.
    """
    if not audio_path or not os.path.exists(audio_path) or os.path.isdir(audio_path):
        gr_warning(translations["input_not_valid"])
        return None

    if not output_path:
        gr_warning(translations["output_not_valid"])
        return None

    if os.path.isdir(output_path): output_path = os.path.join(output_path, f"audio_quirk.{export_format}")

    # Only create the parent directory. Falling back to output_path itself when
    # there is no directory part would create a *directory* named like the file.
    output_dir = os.path.dirname(output_path)
    if output_dir: os.makedirs(output_dir, exist_ok=True)

    output_path = process_output(output_path)

    gr_info(translations["start"].format(start=translations["apply_effect"]))

    import librosa
    import numpy as np
    import soundfile as sf

    def vibrato(y, sr, freq=5, depth=0.003):
        # Re-read the signal at sinusoidally displaced sample positions, clamped to the valid range.
        return y[np.clip((np.arange(len(y)) + (depth * np.sin(2 * np.pi * freq * (np.arange(len(y)) / sr))) * sr).astype(int), 0, len(y) - 1)]

    y, sr = librosa.load(audio_path, sr=None)
    output_path = replace_export_format(output_path, export_format)

    mode = translations["quirk_choice"][mode]
    if mode == 0: mode = random.randint(1, 16)

    if mode == 1: y *= np.random.uniform(0.5, 0.8, size=len(y))
    elif mode == 2: y = librosa.effects.pitch_shift(y=y + np.random.normal(0, 0.01, y.shape), sr=sr, n_steps=np.random.uniform(-1.5, -3.5))
    elif mode == 3: y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=3), rate=1.2)
    elif mode == 4: y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=8), rate=1.3)
    elif mode == 5: y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=-3), rate=0.75)
    elif mode == 6: y *= np.sin(np.linspace(0, np.pi * 20, len(y))) * 0.5 + 0.5
    elif mode == 7: y = librosa.effects.time_stretch(vibrato(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=-4), sr, freq=3, depth=0.004), rate=0.85)
    # NOTE(review): this scales y by (0.6 + 0.4 * delayed copy); if a plain echo mix
    # (0.6 * y + 0.4 * delayed) was intended the operator is wrong - confirm.
    elif mode == 8: y *= 0.6 + np.pad(y, (sr // 2, 0), mode='constant')[:len(y)] * 0.4
    elif mode == 9: y = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=2) + np.sin(np.linspace(0, np.pi * 20, len(y))) * 0.02
    elif mode == 10: y = vibrato(y, sr, freq=8, depth=0.005)
    elif mode == 11: y = librosa.effects.time_stretch(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=4), rate=1.25)
    elif mode == 12: y = np.hstack([np.pad(f, (0, int(len(f)*0.3)), mode='edge') for f in librosa.util.frame(y, frame_length=2048, hop_length=512).T])
    elif mode == 13: y = np.concatenate([y, np.sin(2 * np.pi * np.linspace(0, 1, int(0.05 * sr))) * 0.02])
    elif mode == 14: y += np.random.normal(0, 0.005, len(y))
    elif mode == 15:
        # Cut into 0.2 s chunks and play them back in random order.
        frame = int(sr * 0.2)
        chunks = [y[i:i + frame] for i in range(0, len(y), frame)]

        np.random.shuffle(chunks)
        y = np.concatenate(chunks)
    elif mode == 16:
        # Reverse every other 0.3 s window in place.
        frame = int(sr * 0.3)

        for i in range(0, len(y), frame * 2):
            y[i:i+frame] = y[i:i+frame][::-1]

    sf.write(output_path, y, sr, format=export_format)
    gr_info(translations["success"])

    return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/f0_extract.py DELETED
@@ -1,54 +0,0 @@
1
- import os
2
- import sys
3
-
4
- sys.path.append(os.getcwd())
5
-
6
- from main.app.core.ui import gr_info, gr_warning
7
- from main.app.variables import config, translations, configs
8
-
9
def f0_extract(audio, f0_method, f0_onnx):
    """
    Extract the pitch contour of an audio file and save it as text and as a plot.

    Args:
        audio: Path of an existing audio file.
        f0_method: Pitch extraction method name, also used as the plot title.
        f0_onnx: Forwarded to `check_assets` and to the Generator's
            `f0_onnx_mode` / `del_onnx_model` options.

    Returns:
        `[txt_path, image_path]` on success, `[None, None]` when the input is invalid.
    """
    input_is_file = bool(audio) and os.path.exists(audio) and not os.path.isdir(audio)
    if not input_is_file:
        gr_warning(translations["input_not_valid"])
        return [None]*2

    import librosa
    import numpy as np
    import matplotlib.pyplot as plt

    from main.library.utils import check_assets, load_audio
    from main.library.predictors.Generator import Generator

    check_assets(f0_method, "", f0_onnx, "")

    # Results go into a folder named after the input file (without extension).
    stem = os.path.splitext(os.path.basename(audio))[0]
    f0_path = os.path.join(configs["f0_path"], stem)
    image_path = os.path.join(f0_path, "f0.png")
    txt_path = os.path.join(f0_path, "f0.txt")

    gr_info(translations["start_extract"])

    if not os.path.exists(f0_path):
        os.makedirs(f0_path, exist_ok=True)

    waveform = load_audio(audio, sample_rate=16000)
    f0_generator = Generator(16000, 160, 50, 1100, 0.5, is_half=config.is_half, device=config.device, f0_onnx_mode=f0_onnx, del_onnx_model=f0_onnx)
    _, pitchf = f0_generator.calculator(config.x_pad, f0_method, waveform, 0, None, 3, False, 0, None, False)

    # Zero entries become NaN so they are not plotted and log2 is never taken of 0.
    hz = np.array(pitchf, dtype=np.float32)
    hz[hz == 0] = np.nan

    # 1200 * log2(f / f_ref): cents relative to MIDI note 0.
    f0 = 1200 * np.log2(hz / librosa.midi_to_hz(0))

    plt.figure(figsize=(10, 4))
    plt.plot(f0)
    plt.title(f0_method)
    plt.xlabel(translations["time_frames"])
    plt.ylabel(translations["Frequency"])
    plt.savefig(image_path)
    plt.close()

    with open(txt_path, "w") as f:
        f.writelines(f"{i * 100.0},{f0_value}\n" for i, f0_value in enumerate(f0))

    gr_info(translations["extract_done"])

    return [txt_path, image_path]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/inference.py DELETED
@@ -1,441 +0,0 @@
1
- import os
2
- import re
3
- import gc
4
- import sys
5
- import shutil
6
- import datetime
7
- import subprocess
8
-
9
- import numpy as np
10
-
11
- sys.path.append(os.getcwd())
12
-
13
- from main.app.variables import logger, config, configs, translations, python
14
- from main.app.core.ui import gr_info, gr_warning, gr_error, process_output, replace_export_format
15
-
16
def convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0_method, input_path, output_path, pth_path, index_path, f0_autotune, clean_audio, clean_strength, export_format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, f0_onnx, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
    """
    Run the conversion script (configs["convert_path"]) in a subprocess.

    Each parameter is forwarded as the command-line option of the same name.
    Numeric and boolean settings are stringified with `str`; path/name settings
    are passed as text, with None sent as an empty string.

    Returns:
        None. The exit status of the subprocess is not inspected.
    """
    def text(value):
        # argv items must be strings: passing None (e.g. no index or f0 file
        # selected) makes subprocess.run raise TypeError. "" is the value other
        # callers in this file already use for "no f0 file".
        return "" if value is None else str(value)

    subprocess.run([
        python,
        configs["convert_path"],
        "--pitch", str(pitch),
        "--filter_radius", str(filter_radius),
        "--index_rate", str(index_rate),
        "--rms_mix_rate", str(rms_mix_rate),
        "--protect", str(protect),
        "--hop_length", str(hop_length),
        "--f0_method", text(f0_method),
        "--input_path", text(input_path),
        "--output_path", text(output_path),
        "--pth_path", text(pth_path),
        "--index_path", text(index_path),
        "--f0_autotune", str(f0_autotune),
        "--clean_audio", str(clean_audio),
        "--clean_strength", str(clean_strength),
        "--export_format", text(export_format),
        "--embedder_model", text(embedder_model),
        "--resample_sr", str(resample_sr),
        "--split_audio", str(split_audio),
        "--f0_autotune_strength", str(f0_autotune_strength),
        "--checkpointing", str(checkpointing),
        "--f0_onnx", str(f0_onnx),
        "--embedders_mode", text(embedders_mode),
        "--formant_shifting", str(formant_shifting),
        "--formant_qfrency", str(formant_qfrency),
        "--formant_timbre", str(formant_timbre),
        "--f0_file", text(f0_file),
        "--proposal_pitch", str(proposal_pitch),
        "--proposal_pitch_threshold", str(proposal_pitch_threshold),
        "--audio_processing", str(audio_processing),
        "--alpha", str(alpha)
    ])
51
-
52
def convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, input_audio_name, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
    """
    Convert audio with a voice model, either from a separated-audio folder or
    from an explicit input path.

    With `use_audio` the stems are looked up in
    configs["audios_path"]/<input_audio_name>; vocals are converted and, depending
    on the switches, the backing vocals are converted, merged with the vocals,
    and merged with the instruments. Without `use_audio`, `input` is converted to
    `output` (a folder input triggers a batch conversion).

    Returns:
        A six-item list: [converted vocals, converted backing, vocals+backing,
        converted original vocals, vocals+instruments, visibility update].
        Entries that were not produced are None. On validation failure or a merge
        error the list contains only the visibility update.
    """
    import traceback

    return_none = [None]*6
    return_none[5] = {"visible": True, "__type__": "update"}

    # Resolve the model only when a name was given: os.path.exists(None) and
    # os.path.join(..., None) raise TypeError before the "provide a model" check runs.
    model_path = model if (model and os.path.exists(model)) else os.path.join(configs["weights_path"], model or "")

    if not use_audio:
        if merge_instrument or not_merge_backing or convert_backing or use_original:
            gr_warning(translations["turn_on_use_audio"])
            return return_none

    if use_original:
        if convert_backing:
            gr_warning(translations["turn_off_convert_backup"])
            return return_none
        elif not_merge_backing:
            gr_warning(translations["turn_off_merge_backup"])
            return return_none

    if not model or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
        gr_warning(translations["provide_file"].format(filename=translations["model"]))
        return return_none

    f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders)

    def run_convert(source, target):
        # Single place for the long positional call shared by every branch.
        convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, source, target, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)

    if use_audio:
        output_audio = os.path.join(configs["audios_path"], input_audio_name)

        from main.library.utils import pydub_load

        # Files this function writes into the same folder; they must never be
        # picked up as source stems ("Vocals+Instruments.x" contains "Instruments.").
        generated_prefixes = ("Convert_Vocals", "Convert_Backing", "Vocals+Backing", "Vocals+Instruments")

        def get_audio_file(label):
            matching_files = [f for f in os.listdir(output_audio) if label in f and not f.startswith(generated_prefixes)]

            if not matching_files: return translations["notfound"]
            return os.path.join(output_audio, matching_files[0])

        output_path = os.path.join(output_audio, f"Convert_Vocals.{format}")
        output_backing = os.path.join(output_audio, f"Convert_Backing.{format}")
        output_merge_backup = os.path.join(output_audio, f"Vocals+Backing.{format}")
        output_merge_instrument = os.path.join(output_audio, f"Vocals+Instruments.{format}")

        # The original condition was inverted (created the folder only when it
        # already existed), so a missing folder crashed in os.listdir below.
        os.makedirs(output_audio, exist_ok=True)
        output_path = process_output(output_path)

        if use_original:
            original_vocal = get_audio_file('Original_Vocals_No_Reverb.')

            if original_vocal == translations["notfound"]: original_vocal = get_audio_file('Original_Vocals.')

            if original_vocal == translations["notfound"]:
                gr_warning(translations["not_found_original_vocal"])
                return return_none

            input_path = original_vocal
        else:
            main_vocal = get_audio_file('Main_Vocals_No_Reverb.')
            backing_vocal = get_audio_file('Backing_Vocals.')

            if main_vocal == translations["notfound"]: main_vocal = get_audio_file('Main_Vocals.')
            if main_vocal == translations["notfound"]:
                gr_warning(translations["not_found_main_vocal"])
                return return_none

            # The backing stem is needed both for merging and for converting it.
            if (convert_backing or not not_merge_backing) and backing_vocal == translations["notfound"]:
                gr_warning(translations["not_found_backing_vocal"])
                return return_none

            input_path = main_vocal
            backing_path = backing_vocal

        gr_info(translations["convert_vocal"])

        run_convert(input_path, output_path)

        gr_info(translations["convert_success"])

        if convert_backing:
            output_backing = process_output(output_backing)

            gr_info(translations["convert_backup"])

            run_convert(backing_path, output_backing)

            gr_info(translations["convert_backup_success"])

        # Vocals and backing are merged only in this case; reused for the return value.
        merge_backing = not not_merge_backing and not use_original

        try:
            if merge_backing:
                backing_source = output_backing if convert_backing else backing_vocal

                output_merge_backup = process_output(output_merge_backup)

                gr_info(translations["merge_backup"])

                pydub_load(output_path, volume=-4).overlay(pydub_load(backing_source, volume=-6)).export(output_merge_backup, format=format)

                gr_info(translations["merge_success"])

            if merge_instrument:
                vocals = output_merge_backup if merge_backing else output_path

                output_merge_instrument = process_output(output_merge_instrument)

                gr_info(translations["merge_instruments_process"])

                instruments = get_audio_file('Instruments.')

                if instruments == translations["notfound"]:
                    gr_warning(translations["not_found_instruments"])
                    output_merge_instrument = None
                else:
                    pydub_load(instruments, volume=-7).overlay(pydub_load(vocals, volume=-4 if use_original else None)).export(output_merge_instrument, format=format)

                    gr_info(translations["merge_success"])
        except Exception as e:
            # Report the failure instead of swallowing it silently.
            gr_error(translations["error_occurred"].format(e=e))
            logger.debug(traceback.format_exc())
            return return_none

        # Only hand back paths of files that were actually produced in this run.
        return [
            (None if use_original else output_path),
            (output_backing if convert_backing else None),
            (output_merge_backup if merge_backing else None),
            (output_path if use_original else None),
            (output_merge_instrument if merge_instrument else None),
            {"visible": True, "__type__": "update"}
        ]
    else:
        if not input or not os.path.exists(input):
            gr_warning(translations["input_not_valid"])
            return return_none

        if not output:
            gr_warning(translations["output_not_valid"])
            return return_none

        output = replace_export_format(output, format)

        if os.path.isdir(input):
            gr_info(translations["is_folder"])

            if not [f for f in os.listdir(input) if f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]:
                gr_warning(translations["not_found_in_folder"])
                return return_none

            gr_info(translations["batch_convert"])

            # Batch mode writes into a folder, so a bare name is used as the folder itself.
            output_dir = os.path.dirname(output) or output
            run_convert(input, output_dir)

            gr_info(translations["batch_convert_success"])

            return return_none
        else:
            # Only create the parent directory; a bare file name has none, and
            # creating a directory named like the output file would block the write.
            output_dir = os.path.dirname(output)
            if output_dir: os.makedirs(output_dir, exist_ok=True)

            output = process_output(output)

            gr_info(translations["convert_vocal"])

            run_convert(input, output)

            gr_info(translations["convert_success"])

            return_none[0] = output
            return return_none
208
-
209
def convert_selection(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
    """
    Decide which separated-audio folder to convert, and convert right away when
    the choice is unambiguous.

    Returns:
        An eight-item list: [folder dropdown update, five audio outputs,
        two visibility updates]. With several candidate folders only the dropdown
        is filled and nothing is converted yet.
    """
    def run(source, target, audio_name):
        # Forward everything to convert_audio; only input, output and folder name vary.
        return convert_audio(clean, autotune, use_audio, use_original, convert_backing, not_merge_backing, merge_instrument, pitch, clean_strength, model, index, index_rate, source, target, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, audio_name, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)

    def hidden_dropdown():
        return {"choices": [], "value": "", "interactive": False, "visible": False, "__type__": "update"}

    if not use_audio:
        main_convert = run(input, output, None)

        return [hidden_dropdown(), main_convert[0], None, None, None, None, {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}]

    gr_info(translations["search_separate"])

    audio_exts = (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")

    def holds_audio(folder):
        return any(file.lower().endswith(audio_exts) for file in os.listdir(os.path.join(configs["audios_path"], folder)))

    # In debug mode every sub-folder is offered; otherwise only folders containing audio files.
    choice = []
    for f in os.listdir(configs["audios_path"]):
        if not os.path.isdir(os.path.join(configs["audios_path"], f)):
            continue
        if config.debug_mode or holds_audio(f):
            choice.append(f)

    gr_info(translations["found_choice"].format(choice=len(choice)))

    if len(choice) == 0:
        gr_warning(translations["separator==0"])

        return [hidden_dropdown(), None, None, None, None, None, {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}]

    if len(choice) == 1:
        convert_output = run(None, None, choice[0])

        return [hidden_dropdown(), convert_output[0], convert_output[1], convert_output[2], convert_output[3], convert_output[4], {"visible": True, "__type__": "update"}, {"visible": False, "__type__": "update"}]

    return [{"choices": choice, "value": choice[0], "interactive": True, "visible": True, "__type__": "update"}, None, None, None, None, None, {"visible": False, "__type__": "update"}, {"visible": True, "__type__": "update"}]
229
-
230
def whisper_process(model_size, input_audio, configs, device, out_queue, word_timestamps=True):
    """
    Transcribe `input_audio` with Whisper and hand the result back through a queue.

    Args:
        model_size: Whisper model identifier passed to `load_model`.
        input_audio: Path of the audio file to transcribe.
        configs: Mapping read for the "fp16" setting (defaults to False).
        device: Device passed to `load_model`.
        out_queue: Queue receiving either the list of segments or, on failure,
            the exception instance that was raised.
        word_timestamps: Forwarded to `transcribe`.
    """
    from main.library.speaker_diarization.whisper import load_model

    # Bound up front so the cleanup in `finally` cannot raise UnboundLocalError
    # when loading or transcription fails before the assignment.
    segments = None

    try:
        segments = load_model(model_size, device=device).transcribe(input_audio, fp16=configs.get("fp16", False), word_timestamps=word_timestamps)
        out_queue.put(segments["segments"])
    except Exception as e:
        out_queue.put(e)
    finally:
        del segments
        gc.collect()
241
-
242
def convert_with_whisper(num_spk, model_size, cleaner, clean_strength, autotune, f0_autotune_strength, checkpointing, model_1, model_2, model_index_1, model_index_2, pitch_1, pitch_2, index_strength_1, index_strength_2, export_format, input_audio, output_audio, onnx_f0_mode, method, hybrid_method, hop_length, embed_mode, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, formant_shifting, formant_qfrency_1, formant_timbre_1, formant_qfrency_2, formant_timbre_2, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
    """
    Transcribe and diarize an audio file, convert the speaker turns with two
    voice models, and stitch the converted turns back together.

    Steps: Whisper transcription in a child process, speaker embeddings per
    segment, agglomerative clustering into `num_spk` speakers, merging of
    consecutive same-speaker segments, export of each merged turn into
    "audios_temp/1" or "audios_temp/2" (alternating by turn index), batch
    conversion of both folders, and reassembly with silence in the gaps.

    Returns:
        The path of the merged output file, or None on invalid input or error.
        The temporary "audios_temp" folder is always removed.
    """
    import librosa
    import multiprocessing as mp

    from pydub import AudioSegment
    from sklearn.cluster import AgglomerativeClustering

    from main.library.utils import clear_gpu_cache
    from main.library.speaker_diarization.audio import Audio
    from main.library.speaker_diarization.segment import Segment
    from main.library.utils import check_spk_diarization, pydub_load
    from main.library.speaker_diarization.embedding import SpeechBrainPretrainedSpeakerEmbedding

    def resolve_model(model):
        # An unset model (None/"") must not reach os.path.exists/os.path.join,
        # which raise TypeError on None; it is handled by the checks below.
        if not model: return ""
        return model if os.path.exists(model) else os.path.join(configs["weights_path"], model)

    check_spk_diarization(model_size)
    model_pth_1, model_pth_2 = resolve_model(model_1), resolve_model(model_2)

    if (not model_1 or not os.path.exists(model_pth_1) or os.path.isdir(model_pth_1) or not model_pth_1.endswith((".pth", ".onnx"))) and (not model_2 or not os.path.exists(model_pth_2) or os.path.isdir(model_pth_2) or not model_pth_2.endswith((".pth", ".onnx"))):
        gr_warning(translations["provide_file"].format(filename=translations["model"]))
        return None

    # A single model may be given; it is then used for both groups.
    if not model_1: model_pth_1 = model_pth_2
    if not model_2: model_pth_2 = model_pth_1

    if not input_audio or not os.path.exists(input_audio) or os.path.isdir(input_audio):
        gr_warning(translations["input_not_valid"])
        return None

    if not output_audio:
        gr_warning(translations["output_not_valid"])
        return None

    output_audio = process_output(output_audio)
    gr_info(translations["start_whisper"])

    try:
        try:
            mp.set_start_method("spawn")
        except RuntimeError:
            # The start method can only be set once per interpreter.
            pass

        whisper_queue = mp.Queue()
        whisperprocess = mp.Process(target=whisper_process, args=(model_size, input_audio, configs, config.device, whisper_queue, True))
        whisperprocess.start()

        segments = whisper_queue.get()
        # The result has been consumed, so the worker can be reaped safely.
        whisperprocess.join()

        # The worker reports failures by sending the exception object itself.
        if isinstance(segments, Exception): raise segments

        audio = Audio()

        embedding_model = SpeechBrainPretrainedSpeakerEmbedding(embedding=os.path.join(configs["speaker_diarization_path"], "models", "speechbrain"), device=config.device)
        y, sr = librosa.load(input_audio, sr=None)
        duration = len(y) / sr

        def segment_embedding(segment):
            # Clamp the segment end to the file duration; two-channel input is averaged to mono.
            waveform, _ = audio.crop(input_audio, Segment(segment["start"], min(duration, segment["end"])))
            return embedding_model(waveform.mean(dim=0, keepdim=True)[None] if waveform.shape[0] == 2 else waveform[None])

        def time(secs):
            return datetime.timedelta(seconds=round(secs))

        def merge_audio(files_list, time_stamps, original_file_path, output_path, format):
            def extract_number(filename):
                match = re.search(r'_(\d+)', filename)
                return int(match.group(1)) if match else 0

            total_duration = len(pydub_load(original_file_path))
            combined = AudioSegment.empty()
            current_position = 0

            for file, (start_i, end_i) in zip(sorted(files_list, key=extract_number), time_stamps):
                # Fill the gap before this turn with silence to keep the original timing.
                if start_i > current_position: combined += AudioSegment.silent(duration=start_i - current_position)

                combined += pydub_load(file)
                current_position = end_i

            if current_position < total_duration: combined += AudioSegment.silent(duration=total_duration - current_position)
            combined.export(output_path, format=format)

            return output_path

        embeddings = np.zeros(shape=(len(segments), 192))
        for i, segment in enumerate(segments):
            embeddings[i] = segment_embedding(segment)

        labels = AgglomerativeClustering(num_spk).fit(np.nan_to_num(embeddings)).labels_
        for i in range(len(segments)):
            segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

        # Collapse consecutive segments of the same speaker into one turn.
        merged_segments, current_text = [], []
        current_speaker, current_start = None, None

        for i, segment in enumerate(segments):
            speaker = segment["speaker"]
            start_time = segment["start"]
            text = segment["text"][1:]

            if speaker == current_speaker:
                current_text.append(text)
                end_time = segment["end"]
            else:
                if current_speaker is not None: merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)})

                current_speaker = speaker
                current_start = start_time
                current_text = [text]
                end_time = segment["end"]

        if current_speaker is not None: merged_segments.append({"speaker": current_speaker, "start": current_start, "end": end_time, "text": " ".join(current_text)})

        gr_info(translations["whisper_done"])

        x = ""
        for segment in merged_segments:
            x += f"\n{segment['speaker']} {str(time(segment['start']))} - {str(time(segment['end']))}\n"
            x += segment["text"] + "\n"

        logger.info(x)

        del audio, embedding_model, segments, labels
        clear_gpu_cache()
        gc.collect()

        gr_info(translations["process_audio"])

        audio = pydub_load(input_audio)
        output_folder = "audios_temp"

        if os.path.exists(output_folder): shutil.rmtree(output_folder, ignore_errors=True)
        for f in [output_folder, os.path.join(output_folder, "1"), os.path.join(output_folder, "2")]:
            os.makedirs(f, exist_ok=True)

        time_stamps, processed_segments = [], []
        for i, segment in enumerate(merged_segments):
            start_ms = int(segment["start"] * 1000)
            end_ms = int(segment["end"] * 1000)

            index = i + 1

            # NOTE(review): turns are assigned to folder "1"/"2" by turn parity, not by
            # the clustered speaker label - confirm this is intended for num_spk != 2.
            segment_filename = os.path.join(output_folder, "1" if i % 2 == 1 else "2", f"segment_{index}.wav")
            audio[start_ms:end_ms].export(segment_filename, format="wav")

            processed_segments.append(os.path.join(output_folder, "1" if i % 2 == 1 else "2", f"segment_{index}_output.wav"))
            time_stamps.append((start_ms, end_ms))

        f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders)

        gr_info(translations["process_done_start_convert"])

        convert(pitch_1, filter_radius, index_strength_1, rms_mix_rate, protect, hop_length, f0method, os.path.join(output_folder, "1"), output_folder, model_pth_1, model_index_1, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_1, formant_timbre_1, "", proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)
        convert(pitch_2, filter_radius, index_strength_2, rms_mix_rate, protect, hop_length, f0method, os.path.join(output_folder, "2"), output_folder, model_pth_2, model_index_2, autotune, cleaner, clean_strength, "wav", embedder_model, resample_sr, False, f0_autotune_strength, checkpointing, onnx_f0_mode, embed_mode, formant_shifting, formant_qfrency_2, formant_timbre_2, "", proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)

        gr_info(translations["convert_success"])
        return merge_audio(processed_segments, time_stamps, input_audio, replace_export_format(output_audio, export_format), export_format)
    except Exception as e:
        gr_error(translations["error_occurred"].format(e=e))
        import traceback
        logger.debug(traceback.format_exc())
        return None
    finally:
        if os.path.exists("audios_temp"): shutil.rmtree("audios_temp", ignore_errors=True)
400
-
401
def convert_tts(clean, autotune, pitch, clean_strength, model, index, index_rate, input, output, format, method, hybrid_method, hop_length, embedders, custom_embedders, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, embedders_mode, proposal_pitch, proposal_pitch_threshold, audio_processing=False, alpha=0.5):
    """
    Convert a text-to-speech audio file with a voice model.

    Args:
        model: Model file name (looked up in configs["weights_path"]) or an
            existing path; must end in ".pth" or ".onnx".
        input: Audio file, or a folder from which the first audio file whose name
            contains "tts" is used.
        output: Destination file, or an existing directory (the file is then
            named "tts.<format>").
        (remaining parameters): conversion settings forwarded to `convert`.

    Returns:
        The output path, or None when the model, input or output is invalid.
    """
    # Validate before touching the filesystem helpers: os.path.exists(None) and
    # os.path.join(..., None) raise TypeError when no model is selected.
    model_path = model if (model and os.path.exists(model)) else os.path.join(configs["weights_path"], model or "")

    if not model or not os.path.exists(model_path) or os.path.isdir(model_path) or not model.endswith((".pth", ".onnx")):
        gr_warning(translations["provide_file"].format(filename=translations["model"]))
        return None

    if not input or not os.path.exists(input):
        gr_warning(translations["input_not_valid"])
        return None

    if os.path.isdir(input):
        input_audio = [f for f in os.listdir(input) if "tts" in f and f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"))]

        if not input_audio:
            gr_warning(translations["not_found_in_folder"])
            return None

        input = os.path.join(input, input_audio[0])

    if not output:
        gr_warning(translations["output_not_valid"])
        return None

    output = replace_export_format(output, format)
    if os.path.isdir(output): output = os.path.join(output, f"tts.{format}")

    # A bare file name has no directory part, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output)
    if output_dir: os.makedirs(output_dir, exist_ok=True)

    output = process_output(output)

    f0method = method if method != "hybrid" else hybrid_method
    embedder_model = embedders if embedders != "custom" else custom_embedders

    gr_info(translations["convert_vocal"])

    convert(pitch, filter_radius, index_rate, rms_mix_rate, protect, hop_length, f0method, input, output, model_path, index, autotune, clean, clean_strength, format, embedder_model, resample_sr, split_audio, f0_autotune_strength, checkpointing, onnx_f0_mode, embedders_mode, formant_shifting, formant_qfrency, formant_timbre, f0_file, proposal_pitch, proposal_pitch_threshold, audio_processing, alpha)

    gr_info(translations["convert_success"])
    return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/model_utils.py DELETED
@@ -1,164 +0,0 @@
1
- import os
2
- import sys
3
- import json
4
- import torch
5
- import datetime
6
-
7
- sys.path.append(os.getcwd())
8
-
9
- from main.app.core.ui import gr_info, gr_warning, gr_error
10
- from main.app.variables import config, logger, translations, configs
11
-
12
def fushion_model_pth(name, pth_1, pth_2, ratio):
    """Blend two ``.pth`` models into a new model file.

    Each tensor is mixed as ``ratio * model_1 + (1 - ratio) * model_2`` and
    stored in half precision. ``config``, ``sr``, ``f0``, ``version``,
    ``vocoder`` and ``energy`` are copied from the first model. The result is
    saved under ``configs["weights_path"]``.

    Args:
        name: Output file name; ``.pth`` is appended when missing.
        pth_1: Path of the first model.
        pth_2: Path of the second model.
        ratio: Weight given to the first model.

    Returns:
        ``[message, path]``; ``path`` is the saved file on success, else None.
    """
    if not name.endswith(".pth"): name = name + ".pth"

    if not pth_1 or not os.path.exists(pth_1) or not pth_1.endswith(".pth"):
        gr_warning(translations["provide_file"].format(filename=translations["model"] + " 1"))
        return [translations["provide_file"].format(filename=translations["model"] + " 1"), None]

    if not pth_2 or not os.path.exists(pth_2) or not pth_2.endswith(".pth"):
        gr_warning(translations["provide_file"].format(filename=translations["model"] + " 2"))
        return [translations["provide_file"].format(filename=translations["model"] + " 2"), None]

    from collections import OrderedDict

    def extract(ckpt):
        # Return the flat weight mapping of a checkpoint that stores its
        # tensors under "model", skipping keys that contain "enc_q". It must
        # have the same shape as ckpt["weight"], otherwise the merge loop
        # below would iterate over a wrapper dict instead of tensors.
        weights = OrderedDict()

        for key, value in ckpt["model"].items():
            if "enc_q" in key: continue

            weights[key] = value

        return weights

    try:
        ckpt1 = torch.load(pth_1, map_location="cpu", weights_only=True)
        ckpt2 = torch.load(pth_2, map_location="cpu", weights_only=True)

        if ckpt1["sr"] != ckpt2["sr"]:
            gr_warning(translations["sr_not_same"])
            return [translations["sr_not_same"], None]

        # Metadata is read before ckpt1 is replaced by its weight mapping.
        cfg = ckpt1["config"]
        cfg_f0 = ckpt1["f0"]
        cfg_version = ckpt1["version"]
        cfg_sr = ckpt1["sr"]

        vocoder = ckpt1.get("vocoder", "Default")
        rms_extract = ckpt1.get("energy", False)

        ckpt1 = extract(ckpt1) if "model" in ckpt1 else ckpt1["weight"]
        ckpt2 = extract(ckpt2) if "model" in ckpt2 else ckpt2["weight"]

        if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
            gr_warning(translations["architectures_not_same"])
            return [translations["architectures_not_same"], None]

        gr_info(translations["start"].format(start=translations["fushion_model"]))

        opt = OrderedDict()
        opt["weight"] = {}

        for key in ckpt1.keys():
            if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
                # Differently sized "emb_g.weight" tensors are cut to the
                # shorter first dimension before mixing.
                min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
                opt["weight"][key] = (ratio * (ckpt1[key][:min_shape0].float()) + (1 - ratio) * (ckpt2[key][:min_shape0].float())).half()
            else: opt["weight"][key] = (ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float())).half()

        opt["config"] = cfg
        opt["sr"] = cfg_sr
        opt["f0"] = cfg_f0
        opt["version"] = cfg_version
        opt["infos"] = translations["model_fushion_info"].format(name=name, pth_1=pth_1, pth_2=pth_2, ratio=ratio)
        opt["vocoder"] = vocoder
        opt["energy"] = rms_extract

        output_model = configs["weights_path"]
        if not os.path.exists(output_model): os.makedirs(output_model, exist_ok=True)

        torch.save(opt, os.path.join(output_model, name))

        gr_info(translations["success"])
        return [translations["success"], os.path.join(output_model, name)]
    except Exception as e:
        gr_error(message=translations["error_occurred"].format(e=e))
        # Return text, like every other branch, rather than the exception object.
        return [str(e), None]
89
-
90
def fushion_model(name, path_1, path_2, ratio):
    """Validate the inputs and merge two ``.pth`` models.

    Returns:
        The ``[message, path]`` list from ``fushion_model_pth``;
        ``[message, None]`` when no name is given; ``[None, None]`` when
        either path is missing or is not a ``.pth`` file.
    """
    if not name:
        gr_warning(translations["provide_name_is_save"])
        return [translations["provide_name_is_save"], None]

    # Guard against empty/None paths before calling str methods on them.
    if path_1 and path_2 and path_1.endswith(".pth") and path_2.endswith(".pth"):
        return fushion_model_pth(name, path_1, path_2, ratio)

    gr_warning(translations["format_not_valid"])
    return [None, None]
99
-
100
def onnx_export(model_path):
    """Export a ``.pth`` model to an ``.onnx`` file next to it.

    Args:
        model_path: Path of the model; ``.pth`` is appended when missing.

    Returns:
        Whatever ``onnx_exporter`` returns on success; otherwise the result
        of the warning/error notification call.
    """
    # Reject empty/None before using string methods on the path.
    if not model_path: return gr_warning(translations["provide_file"].format(filename=translations["model"]))

    if not model_path.endswith(".pth"): model_path += ".pth"
    if not os.path.exists(model_path): return gr_warning(translations["provide_file"].format(filename=translations["model"]))

    try:
        gr_info(translations["start_onnx_export"])

        from main.library.onnx.onnx_export import onnx_exporter
        # Swap only the final extension; str.replace would also rewrite
        # ".pth" occurring earlier in the path.
        output = onnx_exporter(model_path, os.path.splitext(model_path)[0] + ".onnx", is_half=config.is_half, device=config.device)

        gr_info(translations["success"])
        return output
    except Exception as e:
        return gr_error(translations["error_occurred"].format(e=e))
114
-
115
def model_info(path):
    """Read the metadata stored in a ``.pth`` or ``.onnx`` model.

    For ``.pth`` files the checkpoint dict itself is read; for ``.onnx``
    files the JSON stored in the ``model_info`` metadata property is used.

    Returns:
        The formatted ``translations["model_info"]`` text, or the result of
        the warning notification when ``path`` is not a valid model file.
    """
    if not path or not os.path.exists(path) or os.path.isdir(path) or not path.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"]))

    def prettify_date(date_str):
        # The "not found" placeholder is shown as-is instead of being parsed.
        if date_str == translations["not_found_create_time"]: return date_str

        try:
            return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f").strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError) as e:
            logger.debug(e)
            return translations["format_not_valid"]

    if path.endswith(".pth"):
        # weights_only=True matches fushion_model_pth and avoids unpickling
        # arbitrary objects from a file supplied by the user.
        model_data = torch.load(path, map_location="cpu", weights_only=True)
    else:
        import onnx

        model = onnx.load(path)
        # Without a "model_info" property every field falls back to its
        # "not found" text instead of failing on None.get.
        model_data = {}

        for prop in model.metadata_props:
            if prop.key == "model_info":
                model_data = json.loads(prop.value)
                break

    gr_info(translations["read_info"])

    # Prefer "epoch"; fall back to the "info" entry, then to "not found".
    epochs = model_data.get("epoch", None)
    if epochs is None: epochs = model_data.get("info", None)
    if epochs is None: epochs = translations["not_found"].format(name=translations["epoch"])

    steps = model_data.get("step", translations["not_found"].format(name=translations["step"]))
    sr = model_data.get("sr", translations["not_found"].format(name=translations["sr"]))
    f0 = model_data.get("f0", translations["not_found"].format(name=translations["f0"]))
    version = model_data.get("version", translations["not_found"].format(name=translations["version"]))
    creation_date = model_data.get("creation_date", translations["not_found_create_time"])
    model_hash = model_data.get("model_hash", translations["not_found"].format(name="model_hash"))
    pitch_guidance = translations["trained_f0"] if f0 else translations["not_f0"]
    creation_date_str = prettify_date(creation_date) if creation_date else translations["not_found_create_time"]
    model_name = model_data.get("model_name", translations["unregistered"])
    model_author = model_data.get("author", translations["not_author"])
    vocoder = model_data.get("vocoder", "Default")
    rms_extract = model_data.get("energy", False)

    gr_info(translations["success"])
    return translations["model_info"].format(model_name=model_name, model_author=model_author, epochs=epochs, steps=steps, version=version, sr=sr, pitch_guidance=pitch_guidance, model_hash=model_hash, creation_date_str=creation_date_str, vocoder=vocoder, rms_extract=rms_extract)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/presets.py DELETED
@@ -1,166 +0,0 @@
1
- import os
2
- import sys
3
- import json
4
-
5
- sys.path.append(os.getcwd())
6
-
7
- from main.app.variables import translations, configs
8
- from main.app.core.ui import gr_info, gr_warning, change_preset_choices, change_effect_preset_choices
9
-
10
def load_presets(presets, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, formant_shifting, formant_qfrency, formant_timbre, proposal_pitch, proposal_pitch_threshold):
    """Load conversion settings from a preset JSON file.

    Args:
        presets: File name inside ``configs["presets_path"]``. When empty a
            warning is shown and the current values are returned unchanged.
        Remaining arguments: current values, used for keys the preset lacks.

    Returns:
        A list of 16 values in the same order as the parameters after ``presets``.
    """
    file = {}

    if presets:
        with open(os.path.join(configs["presets_path"], presets), encoding="utf-8") as f:
            file = json.load(f)

        # Only announce a load when a preset was actually read.
        gr_info(translations["load_presets"].format(presets=presets))
    else:
        gr_warning(translations["provide_file_settings"])

    current = [
        ("cleaner", cleaner), ("autotune", autotune), ("pitch", pitch), ("clean_strength", clean_strength),
        ("index_strength", index_strength), ("resample_sr", resample_sr), ("filter_radius", filter_radius),
        ("rms_mix_rate", rms_mix_rate), ("protect", protect), ("split_audio", split_audio),
        ("f0_autotune_strength", f0_autotune_strength), ("formant_shifting", formant_shifting),
        ("formant_qfrency", formant_qfrency), ("formant_timbre", formant_timbre),
        ("proposal_pitch", proposal_pitch), ("proposal_pitch_threshold", proposal_pitch_threshold)
    ]

    return [file.get(key, value) for key, value in current]
21
-
22
def save_presets(name, cleaner, autotune, pitch, clean_strength, index_strength, resample_sr, filter_radius, rms_mix_rate, protect, split_audio, f0_autotune_strength, cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, rms_mix_rate_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox, formant_shifting, formant_qfrency, formant_timbre, proposal_pitch, proposal_pitch_threshold):
    """Write the selected conversion settings to ``<name>.conversion.json``.

    Only the groups whose ``*_chbox`` flag is set are stored; the proposal
    pitch group is stored when ``proposal_pitch`` itself is truthy.

    Returns:
        The result of ``change_preset_choices()``, or of the warning call
        when no name is given or no ``*_chbox`` flag is set.
    """
    if not name: return gr_warning(translations["provide_filename_settings"])

    selected = (cleaner_chbox, autotune_chbox, pitch_chbox, index_strength_chbox, resample_sr_chbox, filter_radius_chbox, rms_mix_rate_chbox, protect_chbox, split_audio_chbox, formant_shifting_chbox)
    if not any(selected): return gr_warning(translations["choose1"])

    # (enabled, values) pairs, in the order the keys are written to the file.
    groups = (
        (cleaner_chbox, dict(cleaner=cleaner, clean_strength=clean_strength)),
        (autotune_chbox, dict(autotune=autotune, f0_autotune_strength=f0_autotune_strength)),
        (pitch_chbox, dict(pitch=pitch)),
        (index_strength_chbox, dict(index_strength=index_strength)),
        (resample_sr_chbox, dict(resample_sr=resample_sr)),
        (filter_radius_chbox, dict(filter_radius=filter_radius)),
        (rms_mix_rate_chbox, dict(rms_mix_rate=rms_mix_rate)),
        (protect_chbox, dict(protect=protect)),
        (split_audio_chbox, dict(split_audio=split_audio)),
        (formant_shifting_chbox, dict(formant_shifting=formant_shifting, formant_qfrency=formant_qfrency, formant_timbre=formant_timbre)),
        (proposal_pitch, dict(proposal_pitch=proposal_pitch, proposal_pitch_threshold=proposal_pitch_threshold)),
    )

    settings = {key: value for enabled, values in groups if enabled for key, value in values.items()}

    target = os.path.join(configs["presets_path"], name + ".conversion.json")
    with open(target, "w") as handle:
        json.dump(settings, handle, indent=4)

    gr_info(translations["export_settings"].format(name=name))
    return change_preset_choices()
36
-
37
def audio_effect_load_presets(presets, resample_checkbox, audio_effect_resample_sr, chorus_depth, chorus_rate_hz, chorus_mix, chorus_centre_delay_ms, chorus_feedback, distortion_drive_db, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift_semitones, delay_second, delay_feedback, delay_mix, compressor_threshold_db, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold_db, limiter_release_ms, gain_db, bitcrush_bit_depth, clipping_threshold_db, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost, bass_frequency, treble_boost, treble_frequency, fade_in, fade_out, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade):
    """Load audio-effect settings from a preset JSON file.

    Args:
        presets: File name inside ``configs["presets_path"]``. When empty a
            warning is shown and the current values are returned unchanged.
        Remaining arguments: current values, used for keys the preset lacks.

    Returns:
        A list of 50 values in the same order as the parameters after ``presets``.
    """
    file = {}

    if presets:
        with open(os.path.join(configs["presets_path"], presets), encoding="utf-8") as f:
            file = json.load(f)

        # Only announce a load when a preset was actually read.
        gr_info(translations["load_presets"].format(presets=presets))
    else:
        gr_warning(translations["provide_file_settings"])

    return [
        file.get("resample_checkbox", resample_checkbox), file.get("audio_effect_resample_sr", audio_effect_resample_sr),
        file.get("chorus_depth", chorus_depth), file.get("chorus_rate_hz", chorus_rate_hz),
        file.get("chorus_mix", chorus_mix), file.get("chorus_centre_delay_ms", chorus_centre_delay_ms),
        file.get("chorus_feedback", chorus_feedback), file.get("distortion_drive_db", distortion_drive_db),
        file.get("reverb_room_size", reverb_room_size), file.get("reverb_damping", reverb_damping),
        file.get("reverb_wet_level", reverb_wet_level), file.get("reverb_dry_level", reverb_dry_level),
        file.get("reverb_width", reverb_width), file.get("reverb_freeze_mode", reverb_freeze_mode),
        file.get("pitch_shift_semitones", pitch_shift_semitones), file.get("delay_second", delay_second),
        file.get("delay_feedback", delay_feedback), file.get("delay_mix", delay_mix),
        file.get("compressor_threshold_db", compressor_threshold_db), file.get("compressor_ratio", compressor_ratio),
        file.get("compressor_attack_ms", compressor_attack_ms), file.get("compressor_release_ms", compressor_release_ms),
        file.get("limiter_threshold_db", limiter_threshold_db), file.get("limiter_release_ms", limiter_release_ms),
        file.get("gain_db", gain_db), file.get("bitcrush_bit_depth", bitcrush_bit_depth),
        file.get("clipping_threshold_db", clipping_threshold_db), file.get("phaser_rate_hz", phaser_rate_hz),
        file.get("phaser_depth", phaser_depth), file.get("phaser_centre_frequency_hz", phaser_centre_frequency_hz),
        file.get("phaser_feedback", phaser_feedback), file.get("phaser_mix", phaser_mix),
        file.get("bass_boost", bass_boost), file.get("bass_frequency", bass_frequency),
        file.get("treble_boost", treble_boost), file.get("treble_frequency", treble_frequency),
        file.get("fade_in", fade_in), file.get("fade_out", fade_out),
        file.get("chorus_check_box", chorus_check_box), file.get("distortion_checkbox", distortion_checkbox),
        file.get("reverb_check_box", reverb_check_box), file.get("delay_check_box", delay_check_box),
        file.get("compressor_check_box", compressor_check_box), file.get("limiter", limiter),
        file.get("gain_checkbox", gain_checkbox), file.get("bitcrush_checkbox", bitcrush_checkbox),
        file.get("clipping_checkbox", clipping_checkbox), file.get("phaser_check_box", phaser_check_box),
        file.get("bass_or_treble", bass_or_treble), file.get("fade", fade)
    ]
73
-
74
def audio_effect_save_presets(name, resample_checkbox, audio_effect_resample_sr, chorus_depth, chorus_rate_hz, chorus_mix, chorus_centre_delay_ms, chorus_feedback, distortion_drive_db, reverb_room_size, reverb_damping, reverb_wet_level, reverb_dry_level, reverb_width, reverb_freeze_mode, pitch_shift_semitones, delay_second, delay_feedback, delay_mix, compressor_threshold_db, compressor_ratio, compressor_attack_ms, compressor_release_ms, limiter_threshold_db, limiter_release_ms, gain_db, bitcrush_bit_depth, clipping_threshold_db, phaser_rate_hz, phaser_depth, phaser_centre_frequency_hz, phaser_feedback, phaser_mix, bass_boost, bass_frequency, treble_boost, treble_frequency, fade_in, fade_out, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade):
    """Write the enabled audio-effect settings to ``<name>.effect.json``.

    Each effect group is stored only when its switch is truthy; the pitch
    shift is stored when it is non-zero.

    Returns:
        The result of ``change_effect_preset_choices()``, or of the warning
        call when no name is given or nothing is enabled.
    """
    if not name: return gr_warning(translations["provide_filename_settings"])

    pitch_shifted = pitch_shift_semitones != 0
    switches = (resample_checkbox, chorus_check_box, distortion_checkbox, reverb_check_box, delay_check_box, compressor_check_box, limiter, gain_checkbox, bitcrush_checkbox, clipping_checkbox, phaser_check_box, bass_or_treble, fade, pitch_shifted)
    if not any(switches): return gr_warning(translations["choose1"])

    # (enabled, values) pairs, in the order the keys are written to the file.
    groups = (
        (resample_checkbox, dict(
            resample_checkbox=resample_checkbox,
            audio_effect_resample_sr=audio_effect_resample_sr,
        )),
        (chorus_check_box, dict(
            chorus_check_box=chorus_check_box,
            chorus_depth=chorus_depth,
            chorus_rate_hz=chorus_rate_hz,
            chorus_mix=chorus_mix,
            chorus_centre_delay_ms=chorus_centre_delay_ms,
            chorus_feedback=chorus_feedback,
        )),
        (distortion_checkbox, dict(
            distortion_checkbox=distortion_checkbox,
            distortion_drive_db=distortion_drive_db,
        )),
        (reverb_check_box, dict(
            reverb_check_box=reverb_check_box,
            reverb_room_size=reverb_room_size,
            reverb_damping=reverb_damping,
            reverb_wet_level=reverb_wet_level,
            reverb_dry_level=reverb_dry_level,
            reverb_width=reverb_width,
            reverb_freeze_mode=reverb_freeze_mode,
        )),
        (pitch_shifted, dict(
            pitch_shift_semitones=pitch_shift_semitones,
        )),
        (delay_check_box, dict(
            delay_check_box=delay_check_box,
            delay_second=delay_second,
            delay_feedback=delay_feedback,
            delay_mix=delay_mix,
        )),
        (compressor_check_box, dict(
            compressor_check_box=compressor_check_box,
            compressor_threshold_db=compressor_threshold_db,
            compressor_ratio=compressor_ratio,
            compressor_attack_ms=compressor_attack_ms,
            compressor_release_ms=compressor_release_ms,
        )),
        (limiter, dict(
            limiter=limiter,
            limiter_threshold_db=limiter_threshold_db,
            limiter_release_ms=limiter_release_ms,
        )),
        (gain_checkbox, dict(
            gain_checkbox=gain_checkbox,
            gain_db=gain_db,
        )),
        (bitcrush_checkbox, dict(
            bitcrush_checkbox=bitcrush_checkbox,
            bitcrush_bit_depth=bitcrush_bit_depth,
        )),
        (clipping_checkbox, dict(
            clipping_checkbox=clipping_checkbox,
            clipping_threshold_db=clipping_threshold_db,
        )),
        (phaser_check_box, dict(
            phaser_check_box=phaser_check_box,
            phaser_rate_hz=phaser_rate_hz,
            phaser_depth=phaser_depth,
            phaser_centre_frequency_hz=phaser_centre_frequency_hz,
            phaser_feedback=phaser_feedback,
            phaser_mix=phaser_mix,
        )),
        (bass_or_treble, dict(
            bass_or_treble=bass_or_treble,
            bass_boost=bass_boost,
            bass_frequency=bass_frequency,
            treble_boost=treble_boost,
            treble_frequency=treble_frequency,
        )),
        (fade, dict(
            fade=fade,
            fade_in=fade_in,
            fade_out=fade_out,
        )),
    )

    settings = {key: value for enabled, values in groups if enabled for key, value in values.items()}

    target = os.path.join(configs["presets_path"], name + ".effect.json")
    with open(target, "w") as handle:
        json.dump(settings, handle, indent=4)

    gr_info(translations["export_settings"].format(name=name))
    return change_effect_preset_choices()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/process.py DELETED
@@ -1,135 +0,0 @@
1
- import os
2
- import re
3
- import sys
4
- import shutil
5
- import codecs
6
- import zipfile
7
- import requests
8
-
9
- sys.path.append(os.getcwd())
10
-
11
- from main.app.variables import logger, translations, configs
12
- from main.app.core.ui import gr_info, gr_warning, gr_error, process_output, replace_punctuation
13
-
14
def read_docx_text(path):
    """Return the text of a .docx file, one line per non-empty paragraph.

    Reads ``word/document.xml`` from the archive, joins the ``t`` elements
    of every ``p`` element and skips paragraphs that contain no text.
    """
    from xml.etree import ElementTree

    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

    with zipfile.ZipFile(path) as archive:
        root = ElementTree.XML(archive.read("word/document.xml"))

    lines = []
    for block in root.iter(namespace + 'p'):
        joined = ''.join(run.text for run in block.iter(namespace + 't') if run.text)
        if joined: lines.append(joined)

    return '\n'.join(lines)
29
-
30
def process_input(file_path):
    """Return the text content of an uploaded file.

    ``.srt`` files yield an empty string, ``.docx`` files are read with
    ``read_docx_text`` and anything else is read as UTF-8 text. A read
    failure shows a warning and yields an empty string.
    """
    text = ""

    if file_path.endswith(".docx"):
        text = read_docx_text(file_path)
    elif not file_path.endswith(".srt"):
        try:
            with open(file_path, "r", encoding="utf-8") as handle:
                text = handle.read()
        except Exception as error:
            gr_warning(translations["read_error"])
            logger.debug(error)

    gr_info(translations["upload_success"].format(name=translations["text"]))
    return text
44
-
45
def move_files_from_directory(src_dir, dest_weights, dest_logs, model_name):
    """Move model files found under ``src_dir`` to their destination folders.

    ``.index`` files go to ``dest_logs/<model_name>/``. ``.pth`` and ``.onnx``
    files go to ``dest_weights`` as ``<model_name><ext>``, except files whose
    name starts with ``D_`` or ``G_``. Every target path is passed through
    ``process_output`` first.
    """
    for folder, _, names in os.walk(src_dir):
        for entry in names:
            source = os.path.join(folder, entry)

            if entry.endswith(".index"):
                log_dir = os.path.join(dest_logs, model_name)
                os.makedirs(log_dir, exist_ok=True)

                shutil.move(source, process_output(os.path.join(log_dir, replace_punctuation(entry))))
                continue

            if entry.startswith(("D_", "G_")): continue

            for extension in (".pth", ".onnx"):
                if entry.endswith(extension):
                    shutil.move(source, process_output(os.path.join(dest_weights, model_name + extension)))
                    break
64
-
65
def extract_name_model(filename):
    """Return the alphanumeric token that precedes ``_v`` in a file name.

    The name is passed through ``replace_punctuation`` before matching.
    Returns None when the pattern is not found.
    """
    found = re.search(r"_([A-Za-z0-9]+)(?=_v\d*)", replace_punctuation(filename))
    if found is None: return None

    return found.group(1)
68
-
69
def save_drop_model(dropboxs):
    """Install uploaded model files into the weights and logs folders.

    Each upload is first moved into a temporary folder. Zip archives are
    unpacked and sorted by ``move_files_from_directory``, ``.pth``/``.onnx``
    files go to the weights folder and ``.index`` files to a per-model logs
    folder. Any other file type stops processing with a warning. The
    temporary folder is always removed. Always returns None.
    """
    weight_folder = configs["weights_path"]
    logs_folder = configs["logs_path"]
    save_model_temp = "save_model_temp"

    for folder in (weight_folder, logs_folder, save_model_temp):
        if not os.path.exists(folder): os.makedirs(folder, exist_ok=True)

    try:
        for dropbox in dropboxs:
            shutil.move(dropbox, save_model_temp)

            file_name = os.path.basename(dropbox)
            staged = os.path.join(save_model_temp, file_name)

            if file_name.endswith(".zip"):
                shutil.unpack_archive(staged, save_model_temp)
                move_files_from_directory(save_model_temp, weight_folder, logs_folder, file_name.replace(".zip", ""))
                continue

            if file_name.endswith((".pth", ".onnx")):
                shutil.move(staged, process_output(os.path.join(weight_folder, file_name)))
                continue

            if file_name.endswith(".index"):
                # Use the model name embedded in the index name, else the file stem.
                modelname = extract_name_model(file_name)
                if modelname is None: modelname = os.path.splitext(os.path.basename(file_name))[0]

                model_logs = os.path.join(logs_folder, modelname)
                if not os.path.exists(model_logs): os.makedirs(model_logs, exist_ok=True)

                shutil.move(staged, model_logs)
                continue

            gr_warning(translations["unable_analyze_model"])
            return None

        gr_info(translations["upload_success"].format(name=translations["model"]))
        return None
    except Exception as e:
        gr_error(message=translations["error_occurred"].format(e=e))
        return None
    finally:
        shutil.rmtree(save_model_temp, ignore_errors=True)
109
-
110
def zip_file(name, pth, index):
    """Bundle a model file (and optionally its index file) into a zip.

    Args:
        name: Base name of the archive and of its folder under
            ``configs["logs_path"]``; falls back to the model's file stem
            when empty.
        pth: Model file name inside ``configs["weights_path"]``; must end
            with ``.pth`` or ``.onnx``.
        index: Optional path of an index file to include.

    Returns:
        An update dict that makes the archive path visible, or the result of
        the warning call when the model file is invalid.
    """
    # Validate before joining: os.path.join(..., None) raises TypeError.
    if not pth or not pth.endswith((".pth", ".onnx")): return gr_warning(translations["provide_file"].format(filename=translations["model"]))

    pth_path = os.path.join(configs["weights_path"], pth)
    if not os.path.exists(pth_path): return gr_warning(translations["provide_file"].format(filename=translations["model"]))

    if not name: name = os.path.splitext(os.path.basename(pth_path))[0]

    # ZipFile does not create missing parent folders.
    zip_dir = os.path.join(configs["logs_path"], name)
    os.makedirs(zip_dir, exist_ok=True)

    zip_file_path = os.path.join(zip_dir, name + ".zip")
    gr_info(translations["start"].format(start=translations["zip"]))

    with zipfile.ZipFile(zip_file_path, 'w') as zipf:
        zipf.write(pth_path, os.path.basename(pth_path))

        if index:
            # A stale index path must not abort the archive half-written.
            if os.path.exists(index): zipf.write(index, os.path.basename(index))
            else: logger.debug(f"index file not found, skipped: {index}")

    gr_info(translations["success"])
    return {"visible": True, "value": zip_file_path, "__type__": "update"}
123
-
124
def fetch_pretrained_data():
    """Download the JSON document at the ROT13-encoded URL below.

    Returns:
        The decoded JSON, or an empty dict when the request fails or the
        response is not valid JSON.
    """
    try:
        # A timeout keeps an unreachable host from blocking the caller forever.
        response = requests.get(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/wfba/phfgbz_cergenvarq.wfba", "rot13"), timeout=10)
        response.raise_for_status()

        return response.json()
    except (requests.RequestException, ValueError) as e:
        # Best effort: callers treat {} as "nothing available".
        logger.debug(e)
        return {}
132
-
133
def update_sample_rate_dropdown(model):
    """Build a dropdown update listing the keys stored under ``model``.

    Returns:
        None when ``model`` equals ``translations["success"]``; otherwise an
        update dict whose choices are the keys of the fetched entry for
        ``model`` (empty, with value None, when the entry is unavailable).
    """
    if model == translations["success"]: return None

    # .get avoids a KeyError when the download failed or the model is unknown.
    rates = list(fetch_pretrained_data().get(model, {}).keys())
    return {"choices": rates, "value": rates[0] if rates else None, "__type__": "update"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/realtime.py DELETED
@@ -1,174 +0,0 @@
1
- import os
2
- import sys
3
- import time
4
-
5
- sys.path.append(os.getcwd())
6
-
7
- from main.app.variables import translations, configs
8
- from main.app.core.ui import gr_info, gr_warning, audio_device
9
-
10
# Module-level state shared by realtime_start and realtime_stop.
running = False
callbacks = None
audio_manager = None

# Sample rates handed to the conversion pipeline and to the audio devices.
PIPELINE_SAMPLE_RATE = 16000
DEVICE_SAMPLE_RATE = 48000

# Update payloads that switch a UI component's "interactive" flag.
interactive_true = dict(interactive=True, __type__="update")
interactive_false = dict(interactive=False, __type__="update")
17
-
18
def realtime_start(
    monitor,
    exclusive_mode,
    vad_enabled,
    input_audio_device,
    output_audio_device,
    monitor_output_device,
    input_audio_gain,
    output_audio_gain,
    monitor_audio_gain,
    input_asio_channels,
    output_asio_channels,
    monitor_asio_channels,
    chunk_size,
    pitch,
    model_pth,
    model_index,
    index_strength,
    onnx_f0_mode,
    f0_method,
    hop_length,
    embed_mode,
    embedders,
    custom_embedders,
    f0_autotune,
    proposal_pitch,
    f0_autotune_strength,
    proposal_pitch_threshold,
    rms_mix_rate,
    protect,
    filter_radius,
    silent_threshold,
    extra_convert_size,
    cross_fade_overlap_size,
    vad_sensitivity,
    vad_frame_ms,
    clean_audio,
    clean_strength
):
    """Start realtime voice conversion and stream status updates.

    This is a generator. It validates the devices and the model, creates the
    ``AudioCallbacks`` pipeline, starts its audio manager and then reports
    the measured latency every 0.1 s until ``realtime_stop`` clears the
    shared state.

    Yields:
        ``(status_text, update_1, update_2)`` tuples, where the two updates
        toggle the ``interactive`` flag of two UI components
        (presumably the start and stop buttons; confirm against the caller).
    """
    global running, callbacks, audio_manager
    running = True

    gr_info(translations["start_realtime"])
    yield translations["start_realtime"], interactive_false, interactive_true

    if not input_audio_device or not output_audio_device:
        running = False  # nothing was started, do not leave the flag set
        gr_warning(translations["provide_audio_device"])
        yield translations["provide_audio_device"], interactive_true, interactive_false
        return

    if monitor and not monitor_output_device:
        running = False
        gr_warning(translations["provide_monitor_device"])
        yield translations["provide_monitor_device"], interactive_true, interactive_false
        return

    # Only resolve a real value: os.path.exists(None) raises TypeError.
    if model_pth and not os.path.exists(model_pth): model_pth = os.path.join(configs["weights_path"], model_pth)
    embedder_model = (embedders if embedders != "custom" else custom_embedders)

    if not model_pth or not os.path.exists(model_pth) or os.path.isdir(model_pth) or not model_pth.endswith((".pth", ".onnx")):
        running = False
        gr_warning(translations["provide_file"].format(filename=translations["model"]))
        yield translations["provide_file"].format(filename=translations["model"]), interactive_true, interactive_false
        return

    input_devices, output_devices = audio_device()
    input_device_id = input_devices[input_audio_device][0]
    output_device_id = output_devices[output_audio_device][0]
    output_monitor_id = output_devices[monitor_output_device][0] if monitor else None

    # Gains are divided by 100 before being passed on.
    input_audio_gain /= 100.0
    output_audio_gain /= 100.0
    monitor_audio_gain /= 100.0

    chunk_size = int(chunk_size * DEVICE_SAMPLE_RATE / 1000 / 128)

    from main.inference.realtime.callbacks import AudioCallbacks

    callbacks = AudioCallbacks(
        pass_through=False,
        read_chunk_size=chunk_size,
        cross_fade_overlap_size=cross_fade_overlap_size,
        input_sample_rate=DEVICE_SAMPLE_RATE,
        output_sample_rate=DEVICE_SAMPLE_RATE,
        extra_convert_size=extra_convert_size,
        model_path=model_pth,
        index_path=model_index,
        f0_method=f0_method,
        f0_onnx=onnx_f0_mode,
        embedder_model=embedder_model,
        embedders_mode=embed_mode,
        sample_rate=PIPELINE_SAMPLE_RATE,
        hop_length=hop_length,
        silent_threshold=silent_threshold,
        f0_up_key=pitch,
        index_rate=index_strength,
        protect=protect,
        filter_radius=filter_radius,
        rms_mix_rate=rms_mix_rate,
        f0_autotune=f0_autotune,
        f0_autotune_strength=f0_autotune_strength,
        proposal_pitch=proposal_pitch,
        proposal_pitch_threshold=proposal_pitch_threshold,
        input_audio_gain=input_audio_gain,
        output_audio_gain=output_audio_gain,
        monitor_audio_gain=monitor_audio_gain,
        monitor=monitor,
        vad_enabled=vad_enabled,
        vad_sensitivity=vad_sensitivity,
        vad_frame_ms=vad_frame_ms,
        clean_audio=clean_audio,
        clean_strength=clean_strength
    )

    audio_manager = callbacks.audio
    audio_manager.start(
        input_device_id=input_device_id,
        output_device_id=output_device_id,
        output_monitor_id=output_monitor_id,
        exclusive_mode=exclusive_mode,
        asio_input_channel=input_asio_channels,
        asio_output_channel=output_asio_channels,
        asio_output_monitor_channel=monitor_asio_channels,
        read_chunk_size=chunk_size,
        input_audio_sample_rate=DEVICE_SAMPLE_RATE,
        output_monitor_sample_rate=DEVICE_SAMPLE_RATE
    )

    gr_info(translations["realtime_is_ready"])

    while running and callbacks is not None and audio_manager is not None:
        time.sleep(0.1)
        if hasattr(callbacks, "latency"): yield f"{translations['latency']}: {callbacks.latency:.2f} ms", interactive_false, interactive_true

    # A generator's `return value` never reaches the consumer, so the final
    # state has to be yielded.
    yield translations["realtime_has_stop"], interactive_true, interactive_false
151
-
152
def realtime_stop():
    """Stop the running realtime conversion, if there is one.

    Returns:
        ``(status_text, interactive_true, interactive_false)``; the text says
        whether a session was stopped or none was found.
    """
    global running, callbacks, audio_manager

    if not (running and audio_manager is not None and callbacks is not None):
        gr_warning(translations["realtime_not_found"])
        return translations["realtime_not_found"], interactive_true, interactive_false

    gr_info(translations["stop_realtime"])

    audio_manager.stop()
    running = False

    if hasattr(callbacks, "latency"): del callbacks.latency

    # Rebind to None rather than `del`-ing the module-level names: deleting
    # them leaves a moment in which realtime_start's polling loop would hit
    # a NameError when it reads them.
    audio_manager = callbacks = None
    gr_info(translations["realtime_has_stop"])

    from main.library.utils import clear_gpu_cache
    clear_gpu_cache()

    return translations["realtime_has_stop"], interactive_true, interactive_false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/realtime_client.py DELETED
@@ -1,114 +0,0 @@
1
- import os
2
- import sys
3
- import json
4
-
5
- import numpy as np
6
-
7
- from fastapi import FastAPI, WebSocketDisconnect, WebSocket
8
-
9
- sys.path.append(os.getcwd())
10
-
11
- from main.library.utils import clear_gpu_cache
12
- from main.app.variables import configs, translations, logger
13
- from main.inference.realtime.realtime import VoiceChanger, RVC_Realtime
14
-
15
- app = FastAPI()
16
- vc_instance = None
17
-
18
- PIPELINE_SAMPLE_RATE = 16000
19
- DEVICE_SAMPLE_RATE = 48000
20
-
21
@app.websocket("/ws-audio")
async def websocket_audio(ws: WebSocket):
    """Serve one realtime voice-conversion session over a WebSocket.

    Protocol, as implemented below:
      1. The first message is a JSON text frame holding every parameter.
      2. Every following message is a binary frame of float32 samples; each one
         is answered with a JSON ``latency`` text frame and then the converted
         audio as raw bytes.

    The shared ``vc_instance`` is created on demand and always released in the
    ``finally`` block, whatever ended the session.
    """
    global vc_instance
    await ws.accept()

    logger.info(translations["ws_connected"])

    try:
        params = json.loads(await ws.receive_text())

        read_chunk_size = int(params["chunk_size"])
        block_frame = read_chunk_size * 128
        embedders = params["embedders"]

        # Validate the value before touching the filesystem: os.path.exists(None)
        # raises TypeError, which would skip the warning sent to the client below.
        model_pth = params["model_pth"]
        if not isinstance(model_pth, str): model_pth = ""
        if model_pth and not os.path.exists(model_pth): model_pth = os.path.join(configs["weights_path"], model_pth)

        if not model_pth or not os.path.exists(model_pth) or os.path.isdir(model_pth) or not model_pth.endswith((".pth", ".onnx")):
            logger.warning(translations["provide_file"].format(filename=translations["model"]))
            await ws.send_text(json.dumps({"type": "warnings", "value": translations["provide_file"].format(filename=translations["model"])}))
            return

        logger.info(translations["start_realtime"])

        if vc_instance is None:
            vc_instance = VoiceChanger(
                read_chunk_size=read_chunk_size,
                cross_fade_overlap_size=params["cross_fade_overlap_size"],
                input_sample_rate=DEVICE_SAMPLE_RATE,
                extra_convert_size=params["extra_convert_size"]
            )
            vc_instance.initialize(vc_model=RVC_Realtime(
                model_path=model_pth,
                index_path=params["model_index"],
                f0_method=params["f0_method"],
                f0_onnx=params["f0_onnx"],
                embedder_model=(embedders if embedders != "custom" else params["custom_embedders"]),
                embedders_mode=params["embedders_mode"],
                sample_rate=PIPELINE_SAMPLE_RATE,
                hop_length=params["hop_length"],
                silent_threshold=params["silent_threshold"],
                input_sample_rate=DEVICE_SAMPLE_RATE,
                output_sample_rate=DEVICE_SAMPLE_RATE,
                vad_enabled=params["vad_enabled"],
                vad_sensitivity=params["vad_sensitivity"],
                vad_frame_ms=params["vad_frame_ms"],
                clean_audio=params["clean_audio"],
                clean_strength=params["clean_strength"]
            ))

        logger.info(translations["realtime_is_ready"])

        # Loop-invariant values, computed once instead of once per frame.
        gain = params["input_audio_gain"] / 100.0
        sample_bytes = np.dtype(np.float32).itemsize

        while 1:
            audio = await ws.receive_bytes()

            # np.frombuffer rejects buffers that are not a whole number of samples;
            # drop a trailing partial sample instead of ending the session.
            usable = len(audio) - (len(audio) % sample_bytes)
            arr = np.frombuffer(audio[:usable], dtype=np.float32)

            # Force every frame to exactly block_frame samples (zero-pad or truncate).
            if arr.size != block_frame:
                arr = np.pad(arr, (0, block_frame - arr.size)).astype(np.float32) if arr.size < block_frame else arr[:block_frame].astype(np.float32)

            audio_output, _, perf = vc_instance.on_request(
                arr * gain,
                f0_up_key=params["f0_up_key"],
                index_rate=params["index_rate"],
                protect=params["protect"],
                filter_radius=params["filter_radius"],
                rms_mix_rate=params["rms_mix_rate"],
                f0_autotune=params["f0_autotune"],
                f0_autotune_strength=params["f0_autotune_strength"],
                proposal_pitch=params["proposal_pitch"],
                proposal_pitch_threshold=params["proposal_pitch_threshold"]
            )

            await ws.send_text(json.dumps({"type": "latency", "value": perf[1]}))
            await ws.send_bytes(audio_output.tobytes())
    except WebSocketDisconnect:
        logger.info(translations["ws_disconnected"])
    except Exception as e:
        import traceback
        logger.debug(traceback.format_exc())
        logger.info(translations["error_occurred"].format(e=e))
    finally:
        if vc_instance is not None:
            del vc_instance
            vc_instance = None

        clear_gpu_cache()

        # Best-effort close: the socket may already be closed. Only ordinary errors
        # are ignored, so task cancellation is no longer swallowed here.
        try:
            await ws.close()
        except Exception:
            pass

        logger.info(translations["ws_closed"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/restart.py DELETED
@@ -1,48 +0,0 @@
1
- import os
2
- import sys
3
- import json
4
- import platform
5
- import subprocess
6
-
7
- sys.path.append(os.getcwd())
8
-
9
- from main.app.core.ui import gr_info
10
- from main.app.variables import python, translations, configs_json
11
-
12
def restart_app(app):
    """Close the running app and start ``main/app/app.py`` again in a subprocess.

    Args:
        app: object exposing ``close()`` (presumably the Gradio app -- TODO confirm).
    """
    gr_info(translations["30s"])

    # Clear the terminal with the platform's own command.
    clear_command = "cls" if platform.system() == "Windows" else "clear"
    os.system(clear_command)

    app.close()

    # Forward the original command-line arguments, minus "--open".
    forwarded_args = [arg for arg in sys.argv[1:] if arg != "--open"]
    entry_point = os.path.join("main", "app", "app.py")

    subprocess.run([python, entry_point] + forwarded_args)
18
-
19
def change_language(lang, app):
    """Save a new interface language and restart the app if it changed.

    Args:
        lang: value to store under the ``"language"`` key of the config file.
        app: passed through to ``restart_app``.
    """
    # Read through a context manager so the file handle is closed right away
    # (the previous ``json.load(open(...))`` left it to the garbage collector).
    with open(configs_json, "r") as f:
        configs = json.load(f)

    if lang != configs["language"]:
        configs["language"] = lang

        with open(configs_json, "w") as f:
            json.dump(configs, f, indent=4)

        restart_app(app)
29
-
30
def change_theme(theme, app):
    """Save a new interface theme and restart the app if it changed.

    Args:
        theme: value to store under the ``"theme"`` key of the config file.
        app: passed through to ``restart_app``.
    """
    # Read through a context manager so the file handle is closed right away
    # (the previous ``json.load(open(...))`` left it to the garbage collector).
    with open(configs_json, "r") as f:
        configs = json.load(f)

    if theme != configs["theme"]:
        configs["theme"] = theme

        with open(configs_json, "w") as f:
            json.dump(configs, f, indent=4)

        restart_app(app)
39
-
40
def change_font(font, app):
    """Save a new interface font and restart the app if it changed.

    Args:
        font: value to store under the ``"font"`` key of the config file.
        app: passed through to ``restart_app``.
    """
    # Read through a context manager so the file handle is closed right away
    # (the previous ``json.load(open(...))`` left it to the garbage collector).
    with open(configs_json, "r") as f:
        configs = json.load(f)

    if font != configs["font"]:
        configs["font"] = font

        with open(configs_json, "w") as f:
            json.dump(configs, f, indent=4)

        restart_app(app)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/separate.py DELETED
@@ -1,95 +0,0 @@
1
- import os
2
- import sys
3
- import subprocess
4
-
5
- sys.path.append(os.getcwd())
6
-
7
- from main.app.core.ui import gr_info, gr_warning
8
- from main.app.variables import python, translations, configs
9
-
10
def separate_music(
    input_path,
    output_dirs,
    export_format,
    model_name,
    karaoke_model,
    reverb_model,
    denoise_model,
    sample_rate,
    shifts,
    batch_size,
    overlap,
    aggression,
    hop_length,
    window_size,
    segments_size,
    post_process_threshold,
    enable_tta,
    enable_denoise,
    high_end_process,
    enable_post_process,
    separate_backing,
    separate_reverb
):
    """Run the separation script on one audio file and return the expected output paths.

    Every option is forwarded as a command-line argument to the script at
    ``configs["separate_path"]``.

    Returns:
        list: four entries -- original vocals, instruments, main vocals and
        backing vocals. The last two are ``None`` unless ``separate_backing``
        is truthy. ``[None] * 4`` is returned when input or output is invalid.
        The paths are built from naming conventions and are not checked for
        existence after the script runs.
    """
    if not input_path or not os.path.exists(input_path) or os.path.isdir(input_path):
        gr_warning(translations["input_not_valid"])
        return [None]*4

    # Resolve to the parent directory when there is one. A missing value is
    # rejected here instead of crashing inside os.path.dirname(None).
    output_dirs = (os.path.dirname(output_dirs) or output_dirs) if output_dirs else ""

    # The former follow-up "create the directory if missing" statement could never
    # run, because a missing directory already returns here; it has been removed.
    if not output_dirs or not os.path.exists(output_dirs):
        gr_warning(translations["output_not_valid"])
        return [None]*4

    gr_info(translations["start"].format(start=translations["separator_music"]))

    subprocess.run([
        python, configs["separate_path"],
        "--input_path", input_path,
        "--output_dirs", output_dirs,
        "--export_format", export_format,
        "--model_name", model_name,
        "--karaoke_model", karaoke_model,
        "--reverb_model", reverb_model,
        "--denoise_model", denoise_model,
        "--sample_rate", str(sample_rate),
        "--shifts", str(shifts),
        "--batch_size", str(batch_size),
        "--overlap", str(overlap),
        "--aggression", str(aggression),
        "--hop_length", str(hop_length),
        "--window_size", str(window_size),
        "--segments_size", str(segments_size),
        "--post_process_threshold", str(post_process_threshold),
        "--enable_tta", str(enable_tta),
        "--enable_denoise", str(enable_denoise),
        "--high_end_process", str(high_end_process),
        "--enable_post_process", str(enable_post_process),
        "--separate_backing", str(separate_backing),
        "--separate_reverb", str(separate_reverb),
    ])

    # NOTE(review): success is reported without checking the subprocess exit code.
    gr_info(translations["success"])

    # Results are expected in a sub-folder named after the input file (without extension).
    filename, _ = os.path.splitext(os.path.basename(input_path))
    output_dirs = os.path.join(output_dirs, filename)

    return [
        os.path.join(
            output_dirs,
            f"Original_Vocals_No_Reverb.{export_format}" if separate_reverb else f"Original_Vocals.{export_format}"
        ),
        os.path.join(
            output_dirs,
            f"Instruments.{export_format}"
        ),
        os.path.join(
            output_dirs,
            f"Main_Vocals_No_Reverb.{export_format}" if separate_reverb else f"Main_Vocals.{export_format}"
        ) if separate_backing else None,
        os.path.join(
            output_dirs,
            f"Backing_Vocals.{export_format}"
        ) if separate_backing else None
    ] if os.path.isfile(input_path) else [None]*4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/training.py DELETED
@@ -1,265 +0,0 @@
1
- import os
2
- import sys
3
- import time
4
- import shutil
5
- import codecs
6
- import threading
7
- import subprocess
8
-
9
- sys.path.append(os.getcwd())
10
-
11
- from main.tools import huggingface
12
- from main.app.core.ui import gr_info, gr_warning
13
- from main.app.variables import python, translations, configs
14
-
15
def if_done(done, p):
    """Block until process ``p`` has exited, then set ``done[0]`` to True.

    Args:
        done: single-element list used as a mutable flag shared with the caller.
        p: object with a ``poll()`` method returning ``None`` while still running.
    """
    # Check every half second until poll() reports an exit status.
    while p.poll() is None:
        time.sleep(0.5)

    done[0] = True
21
-
22
def log_read(done, name):
    """Stream snapshots of ``app.log`` while a background job is running.

    The log file is truncated first, then re-read once per second.

    Args:
        done: single-element list; the loop ends once ``done[0]`` is truthy.
        name: substring a line must contain to appear in the interim snapshots.

    Yields:
        str: interim snapshots holding the non-blank lines that contain ``name``
        and do not contain ``"DEBUG"``; then one final snapshot of all
        non-blank, non-DEBUG lines.
    """
    log_file = os.path.join(configs["logs_path"], "app.log")

    # Start from an empty log so earlier runs are not shown.
    with open(log_file, "w", encoding="utf-8"):
        pass

    def snapshot(required=None):
        # Read and close the file before returning, so no handle stays open while
        # the generator is suspended (or abandoned) at a yield.
        with open(log_file, "r", encoding="utf-8") as f:
            return "".join(
                line for line in f
                if "DEBUG" not in line and line.strip() != "" and (required is None or required in line)
            )

    while 1:
        yield snapshot(name)

        time.sleep(1)
        if done[0]: break

    # NOTE(review): the final snapshot is not filtered by ``name``, unlike the
    # interim ones -- kept as in the original; confirm this is intended.
    yield snapshot()
39
-
40
def create_dataset(
    input_data,
    output_dirs,
    skip_seconds,
    skip_start_audios,
    skip_end_audios,
    separate,
    model_name,
    reverb_model,
    denoise_model,
    sample_rate,
    shifts,
    batch_size,
    overlap,
    aggression,
    hop_length,
    window_size,
    segments_size,
    post_process_threshold,
    enable_tta,
    enable_denoise,
    high_end_process,
    enable_post_process,
    separate_reverb,
    clean_dataset,
    clean_strength
):
    """Launch the dataset-creation script and stream its log output.

    Every parameter is forwarded as a command-line option to the script at
    ``configs["create_dataset_path"]``.

    Yields:
        str: log snapshots produced by ``log_read`` until the process exits.
    """
    gr_info(translations["start"].format(start=translations["create"]))

    options = {
        "--input_data": input_data,
        "--output_dirs": output_dirs,
        "--skip_seconds": skip_seconds,
        "--skip_start_audios": skip_start_audios,
        "--skip_end_audios": skip_end_audios,
        "--separate": separate,
        "--model_name": model_name,
        "--reverb_model": reverb_model,
        "--denoise_model": denoise_model,
        "--sample_rate": sample_rate,
        "--shifts": shifts,
        "--batch_size": batch_size,
        "--overlap": overlap,
        "--aggression": aggression,
        "--hop_length": hop_length,
        "--window_size": window_size,
        "--segments_size": segments_size,
        "--post_process_threshold": post_process_threshold,
        "--enable_tta": enable_tta,
        "--enable_denoise": enable_denoise,
        "--high_end_process": high_end_process,
        "--enable_post_process": enable_post_process,
        "--separate_reverb": separate_reverb,
        "--clean_dataset": clean_dataset,
        "--clean_strength": clean_strength,
    }

    # Pass an argument list instead of a shell string: values containing quotes,
    # spaces or shell metacharacters reach the script unchanged and cannot be
    # interpreted as shell syntax.
    command = [python, configs["create_dataset_path"]]
    for flag, value in options.items():
        command += [flag, str(value)]

    p = subprocess.Popen(command)
    done = [False]

    threading.Thread(target=if_done, args=(done, p)).start()

    yield from log_read(done, "create_dataset")
76
-
77
def create_reference(audio_path, reference_name, pitch_guidance, use_energy, version, embedder_model, embedders_mode, f0_method, f0_onnx, f0_up_key, filter_radius, f0_autotune, f0_autotune_strength, proposal_pitch, proposal_pitch_threshold, alpha=0.5):
    """Launch the reference-creation script and stream its log output.

    Every parameter is forwarded as a command-line option to the script at
    ``configs["create_reference_path"]``.

    Yields:
        str: log snapshots produced by ``log_read`` until the process exits.
    """
    gr_info(translations["start"].format(start=translations["create_reference"]))

    options = {
        "--audio_path": audio_path,
        "--reference_name": reference_name,
        "--pitch_guidance": pitch_guidance,
        "--use_energy": use_energy,
        "--version": version,
        "--embedder_model": embedder_model,
        "--embedders_mode": embedders_mode,
        "--f0_method": f0_method,
        "--f0_onnx": f0_onnx,
        "--f0_up_key": f0_up_key,
        "--filter_radius": filter_radius,
        "--f0_autotune": f0_autotune,
        "--f0_autotune_strength": f0_autotune_strength,
        "--proposal_pitch": proposal_pitch,
        "--proposal_pitch_threshold": proposal_pitch_threshold,
        "--alpha": alpha,
    }

    # Argument list instead of a shell string: no quoting problems, no shell injection.
    command = [python, configs["create_reference_path"]]
    for flag, value in options.items():
        command += [flag, str(value)]

    p = subprocess.Popen(command)
    done = [False]

    threading.Thread(target=if_done, args=(done, p)).start()

    yield from log_read(done, "create_reference")
87
-
88
def preprocess(model_name, sample_rate, cpu_core, cut_preprocess, process_effects, dataset, clean_dataset, clean_strength, chunk_len=3.0, overlap_len=0.3, normalization_mode="none"):
    """Launch the preprocessing script for a model and stream its log output.

    ``sample_rate`` is a string with an optional trailing ``k`` and is converted
    to Hz. Any existing log directory of the model is removed before the script
    starts.

    Yields:
        str: log snapshots produced by ``log_read`` until the process exits.
        Nothing is yielded when validation fails (a warning is shown instead).
    """
    sr = int(float(sample_rate.rstrip("k")) * 1000)

    if not model_name: return gr_warning(translations["provide_name"])
    if not os.path.exists(dataset) or not any(f.lower().endswith(("wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3")) for f in os.listdir(dataset) if os.path.isfile(os.path.join(dataset, f))): return gr_warning(translations["not_found_data"])

    model_dir = os.path.join(configs["logs_path"], model_name)
    if os.path.exists(model_dir): shutil.rmtree(model_dir, ignore_errors=True)

    options = {
        "--model_name": model_name,
        "--dataset_path": dataset,
        "--sample_rate": sr,
        "--cpu_cores": cpu_core,
        "--cut_preprocess": cut_preprocess,
        "--process_effects": process_effects,
        "--clean_dataset": clean_dataset,
        "--clean_strength": clean_strength,
        "--chunk_len": chunk_len,
        "--overlap_len": overlap_len,
        "--normalization_mode": normalization_mode,
    }

    # Argument list instead of a shell string: no quoting problems, no shell injection.
    command = [python, configs["preprocess_path"]]
    for flag, value in options.items():
        command += [flag, str(value)]

    p = subprocess.Popen(command)
    done = [False]

    threading.Thread(target=if_done, args=(done, p)).start()
    os.makedirs(model_dir, exist_ok=True)

    yield from log_read(done, "preprocess")
105
-
106
def extract(model_name, version, method, pitch_guidance, hop_length, cpu_cores, gpu, sample_rate, embedders, custom_embedders, onnx_f0_mode, embedders_mode, f0_autotune, f0_autotune_strength, hybrid_method, rms_extract, alpha=0.5):
    """Launch the feature-extraction script for a model and stream its log output.

    ``hybrid_method`` replaces ``method`` when ``method == "hybrid"``, and
    ``custom_embedders`` replaces ``embedders`` when ``embedders == "custom"``.
    Both ``sliced_audios`` and ``sliced_audios_16k`` under the model's log
    directory must contain at least one file.

    Yields:
        str: log snapshots produced by ``log_read`` until the process exits.
        Nothing is yielded when validation fails (a warning is shown instead).
    """
    f0method, embedder_model = (method if method != "hybrid" else hybrid_method), (embedders if embedders != "custom" else custom_embedders)
    sr = int(float(sample_rate.rstrip("k")) * 1000)

    if not model_name: return gr_warning(translations["provide_name"])
    model_dir = os.path.join(configs["logs_path"], model_name)

    def has_files(folder):
        return any(os.path.isfile(os.path.join(folder, f)) for f in os.listdir(folder))

    try:
        if not has_files(os.path.join(model_dir, "sliced_audios")) or not has_files(os.path.join(model_dir, "sliced_audios_16k")): return gr_warning(translations["not_found_data_preprocess"])
    except Exception:
        # A missing or unreadable folder counts as "no preprocessed data".
        return gr_warning(translations["not_found_data_preprocess"])

    options = {
        "--model_name": model_name,
        "--rvc_version": version,
        "--f0_method": f0method,
        "--pitch_guidance": pitch_guidance,
        "--hop_length": hop_length,
        "--cpu_cores": cpu_cores,
        "--gpu": gpu,
        "--sample_rate": sr,
        "--embedder_model": embedder_model,
        "--f0_onnx": onnx_f0_mode,
        "--embedders_mode": embedders_mode,
        "--f0_autotune": f0_autotune,
        "--f0_autotune_strength": f0_autotune_strength,
        "--rms_extract": rms_extract,
        "--alpha": alpha,
    }

    # Argument list instead of a shell string: no quoting problems, no shell injection.
    command = [python, configs["extract_path"]]
    for flag, value in options.items():
        command += [flag, str(value)]

    p = subprocess.Popen(command)
    done = [False]

    threading.Thread(target=if_done, args=(done, p)).start()
    os.makedirs(model_dir, exist_ok=True)

    yield from log_read(done, "extract")
126
-
127
def create_index(model_name, rvc_version, index_algorithm):
    """Launch the index-creation script for a model and stream its log output.

    The ``{rvc_version}_extracted`` folder under the model's log directory must
    contain at least one file.

    Yields:
        str: log snapshots produced by ``log_read`` until the process exits.
        Nothing is yielded when validation fails (a warning is shown instead).
    """
    if not model_name: return gr_warning(translations["provide_name"])
    model_dir = os.path.join(configs["logs_path"], model_name)
    extracted_dir = os.path.join(model_dir, f"{rvc_version}_extracted")

    try:
        if not any(os.path.isfile(os.path.join(extracted_dir, f)) for f in os.listdir(extracted_dir)): return gr_warning(translations["not_found_data_extract"])
    except Exception:
        # A missing or unreadable folder counts as "no extracted data".
        return gr_warning(translations["not_found_data_extract"])

    # Argument list instead of a shell string: no quoting problems, no shell injection.
    command = [
        python, configs["create_index_path"],
        "--model_name", str(model_name),
        "--rvc_version", str(rvc_version),
        "--index_algorithm", str(index_algorithm),
    ]

    p = subprocess.Popen(command)
    done = [False]

    threading.Thread(target=if_done, args=(done, p)).start()
    os.makedirs(model_dir, exist_ok=True)

    yield from log_read(done, "create_index")
144
-
145
def training(model_name, rvc_version, save_every_epoch, save_only_latest, save_every_weights, total_epoch, sample_rate, batch_size, gpu, pitch_guidance, not_pretrain, custom_pretrained, pretrain_g, pretrain_d, detector, threshold, clean_up, cache, model_author, vocoder, checkpointing, deterministic, benchmark, optimizer, energy_use, custom_reference=False, reference_name="", multiscale_mel_loss=False):
    """Resolve pretrained weights, launch the training script and stream its log.

    Steps, as implemented below:
      1. Validate the model name and the ``{rvc_version}_extracted`` folder.
      2. Choose the pretrained generator/discriminator: none, the default files
         (downloaded when missing), or user-supplied custom files.
      3. Resolve the optional custom reference.
      4. Start the script at ``configs["train_path"]`` and write its PID to
         ``train_pid.txt`` in the model's log directory.

    Yields:
        str: log snapshots from ``log_read``, trimmed to the last 50 lines.
        Nothing is yielded when validation fails (a warning is shown instead).
    """
    sr = int(float(sample_rate.rstrip("k")) * 1000)
    if not model_name: return gr_warning(translations["provide_name"])

    model_dir = os.path.join(configs["logs_path"], model_name)
    pid_path = os.path.join(model_dir, "train_pid.txt")
    if os.path.exists(pid_path): os.remove(pid_path)

    extracted_dir = os.path.join(model_dir, f"{rvc_version}_extracted")

    try:
        if not any(os.path.isfile(os.path.join(extracted_dir, f)) for f in os.listdir(extracted_dir)): return gr_warning(translations["not_found_data_extract"])
    except Exception:
        # A missing or unreadable folder counts as "no extracted data".
        return gr_warning(translations["not_found_data_extract"])

    if not not_pretrain:
        if not custom_pretrained:
            pretrain_dir = configs["pretrained_v2_path"] if rvc_version == 'v2' else configs["pretrained_v1_path"]
            download_version = codecs.decode(f"uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cergenvarq_", "rot13") + f"{rvc_version}/"

            # File names keyed by pitch guidance (True/False), then by sample rate in Hz.
            pretrained_selector = {
                True: {
                    32000: ("f0G32k.pth", "f0D32k.pth"),
                    40000: ("f0G40k.pth", "f0D40k.pth"),
                    48000: ("f0G48k.pth", "f0D48k.pth")
                },
                False: {
                    32000: ("G32k.pth", "D32k.pth"),
                    40000: ("G40k.pth", "D40k.pth"),
                    48000: ("G48k.pth", "D48k.pth")
                }
            }

            pg, pd = pretrained_selector[pitch_guidance][sr]

            # Optional name prefixes: "ENERGY_" first, then "<vocoder>_".
            prefix = ""
            if energy_use: prefix += "ENERGY_"
            if vocoder != 'Default': prefix += vocoder + "_"

            pg2, pd2 = prefix + pg, prefix + pd
            pretrained_G, pretrained_D = os.path.join(pretrain_dir, pg2), os.path.join(pretrain_dir, pd2)

            try:
                if not os.path.exists(pretrained_G):
                    gr_info(translations["download_pretrained"].format(dg="G", rvc_version=rvc_version))
                    huggingface.HF_download_file(download_version + pg2, pretrained_G)

                if not os.path.exists(pretrained_D):
                    gr_info(translations["download_pretrained"].format(dg="D", rvc_version=rvc_version))
                    huggingface.HF_download_file(download_version + pd2, pretrained_D)
            except Exception:
                # Download failed: warn and train without pretrained weights.
                gr_warning(translations["not_use_pretrain_error_download"])
                pretrained_G = pretrained_D = None
        else:
            if not pretrain_g: return gr_warning(translations["provide_pretrained"].format(dg="G"))
            if not pretrain_d: return gr_warning(translations["provide_pretrained"].format(dg="D"))

            # Accept an existing path as-is, otherwise look in the custom pretrained folder.
            pg2, pd2 = pretrain_g, pretrain_d
            pretrained_G, pretrained_D = (
                (os.path.join(configs["pretrained_custom_path"], pg2) if not os.path.exists(pg2) else pg2),
                (os.path.join(configs["pretrained_custom_path"], pd2) if not os.path.exists(pd2) else pd2)
            )

            if not os.path.exists(pretrained_G): return gr_warning(translations["not_found_pretrain"].format(dg="G"))
            if not os.path.exists(pretrained_D): return gr_warning(translations["not_found_pretrain"].format(dg="D"))
    else:
        pretrained_G = pretrained_D = None
        gr_warning(translations["not_use_pretrain"])

    if custom_reference:
        reference_path = os.path.join(configs["reference_path"], reference_name)

        if not os.path.exists(reference_path):
            gr_warning(translations["not_found_reference"])

            custom_reference = False
            reference_path = None
    else: reference_path = None

    gr_info(translations["start"].format(start=translations["training"]))

    options = {
        "--model_name": model_name,
        "--rvc_version": rvc_version,
        "--save_every_epoch": save_every_epoch,
        "--save_only_latest": save_only_latest,
        "--save_every_weights": save_every_weights,
        "--total_epoch": total_epoch,
        "--batch_size": batch_size,
        "--gpu": gpu,
        "--pitch_guidance": pitch_guidance,
        "--overtraining_detector": detector,
        "--overtraining_threshold": threshold,
        "--cleanup": clean_up,
        "--cache_data_in_gpu": cache,
        "--g_pretrained_path": pretrained_G,
        "--d_pretrained_path": pretrained_D,
        "--model_author": model_author,
        "--vocoder": vocoder,
        "--checkpointing": checkpointing,
        "--deterministic": deterministic,
        "--benchmark": benchmark,
        "--optimizer": optimizer,
        "--energy_use": energy_use,
        "--use_custom_reference": custom_reference,
        "--reference_path": reference_path,
        "--multiscale_mel_loss": multiscale_mel_loss,
    }

    # Argument list instead of a shell string: paths with spaces (for example the
    # previously unquoted reference path) arrive intact, and no shell is involved.
    # The PID written below therefore belongs to the training process itself.
    command = [python, configs["train_path"]]
    for flag, value in options.items():
        command += [flag, str(value)]

    p = subprocess.Popen(command)
    done = [False]

    with open(pid_path, "w") as pid_file:
        pid_file.write(str(p.pid))

    threading.Thread(target=if_done, args=(done, p)).start()

    for log in log_read(done, "train"):
        # Show only the most recent 50 lines.
        lines = log.splitlines()
        if len(lines) > 50: log = "\n".join(lines[-50:])
        yield log
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/tts.py DELETED
@@ -1,100 +0,0 @@
1
- import os
2
- import sys
3
- import pysrt
4
- import codecs
5
- import librosa
6
- import asyncio
7
- import requests
8
- import tempfile
9
-
10
- sys.path.append(os.getcwd())
11
-
12
- from main.app.variables import translations
13
- from main.app.core.ui import gr_info, gr_warning, gr_error
14
-
15
def synthesize_tts(prompt, voice, speed, output, pitch, google):
    """Synthesize ``prompt`` to the audio file ``output``.

    With ``google`` falsy, edge-tts is used; ``speed`` and ``pitch`` are sent as
    signed ``%`` and ``Hz`` strings. Otherwise the audio is downloaded over
    HTTP and, when ``pitch`` or ``speed`` is non-zero, post-processed with
    librosa and rewritten in place.

    A non-200 HTTP response is reported through ``gr_error`` and no file is written.
    """
    if not google:
        from edge_tts import Communicate

        asyncio.run(Communicate(text=prompt, voice=voice, rate=f"+{speed}%" if speed >= 0 else f"{speed}%", pitch=f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz").save(output))
    else:
        # An explicit timeout keeps a stalled connection from blocking the caller forever.
        response = requests.get(
            codecs.decode("uggcf://genafyngr.tbbtyr.pbz/genafyngr_ggf", "rot13"),
            params={"ie": "UTF-8", "q": prompt, "tl": voice, "ttsspeed": speed, "client": "tw-ob"},
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"},
            timeout=30
        )

        if response.status_code == 200:
            with open(output, "wb") as f:
                f.write(response.content)

            if pitch != 0 or speed != 0:
                y, sr = librosa.load(output, sr=None)

                if pitch != 0: y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
                # NOTE(review): ``speed`` is passed directly as the stretch rate;
                # confirm callers supply a positive multiplier on this path.
                if speed != 0: y = librosa.effects.time_stretch(y, rate=speed)

                import soundfile as sf
                # The output format is taken from the file extension, without the dot.
                sf.write(file=output, data=y, samplerate=sr, format=os.path.splitext(os.path.basename(output))[-1].lower().replace('.', ''))
        else: gr_error(f"{response.status_code}, {response.text}")
35
-
36
def srt_tts(srt_file, out_file, voice, rate = 0, sr = 24000, google = False):
    """Synthesize every cue of an SRT file and mix the clips into one audio file.

    Each cue is synthesized separately, resampled to ``sr`` by linear
    interpolation when needed, stretched or trimmed to the cue's duration, and
    added into the output buffer at the cue's start time.

    Args:
        srt_file: path of the subtitle file.
        out_file: path of the audio file to write.
        voice: voice identifier passed to ``synthesize_tts``.
        rate: forwarded to ``synthesize_tts`` in its ``pitch`` position
            (NOTE(review): the name suggests speed -- confirm intent).
        sr: sample rate of the output buffer.
        google: forwarded to ``synthesize_tts``.

    Raises:
        ValueError: when the subtitle file contains no cues.
    """
    import numpy as np
    import soundfile as sf

    def time_stretch(y, sr, target_duration):
        n_target = int(round(target_duration * sr))
        # An empty clip or an empty slot cannot be stretched; return silence of
        # the slot length instead of dividing by zero.
        if n_target <= 0 or len(y) == 0: return np.zeros(max(n_target, 0), dtype=np.float32)

        rate = (len(y) / sr) / target_duration
        if rate != 1.0: y = librosa.effects.time_stretch(y=y.astype(np.float32), rate=rate)

        return np.pad(y, (0, n_target - len(y))) if len(y) < n_target else y[:n_target]

    def pysrttime_to_seconds(t):
        return (t.hours * 60 + t.minutes) * 60 + t.seconds + t.milliseconds / 1000

    subs = pysrt.open(srt_file)
    if not subs: raise ValueError(translations["srt"])

    # Size the buffer from the latest end time of any cue, not only the last one,
    # so unsorted or overlapping cues are not cut off.
    total_seconds = max(pysrttime_to_seconds(seg.end) for seg in subs)
    final_audio = np.zeros(max(int(round(total_seconds * sr)), 0), dtype=np.float32)

    with tempfile.TemporaryDirectory() as tempdir:
        for idx, seg in enumerate(subs):
            duration = pysrttime_to_seconds(seg.duration)
            start_sample = int(round(pysrttime_to_seconds(seg.start) * sr))
            available = final_audio.shape[0] - start_sample

            # Skip cues with no usable slot (zero or negative duration, or starting
            # at or beyond the end of the buffer) before spending a synthesis call.
            if duration <= 0 or start_sample < 0 or available <= 0: continue

            wav_path = os.path.join(tempdir, f"seg_{idx}.wav")
            synthesize_tts(" ".join(seg.text.splitlines()), voice, 0, wav_path, rate, google)

            audio, file_sr = sf.read(wav_path, dtype=np.float32)
            if file_sr != sr: audio = np.interp(np.linspace(0, len(audio) - 1, int(len(audio) * sr / file_sr)), np.arange(len(audio)), audio)

            # Clip to what still fits in the buffer, then mix in.
            adjusted = time_stretch(audio, sr, duration)[:available]
            final_audio[start_sample:start_sample + adjusted.shape[0]] += adjusted

    sf.write(out_file, final_audio, sr)
74
-
75
def TTS(prompt, voice, speed, output, pitch, google, srt_input):
    """Validate the inputs, then synthesize either a text prompt or an SRT file.

    An ``srt_input`` ending in ``.srt`` takes precedence over ``prompt``.

    Returns:
        The path of the written audio file, or ``None`` when validation fails
        (a warning is shown in that case).
    """
    if not srt_input: srt_input = ""

    if not prompt and not srt_input.endswith(".srt"):
        gr_warning(translations["enter_the_text"])
        return None

    if not voice:
        gr_warning(translations["choose_voice"])
        return None

    if not output:
        gr_warning(translations["output_not_valid"])
        return None

    if os.path.isdir(output): output = os.path.join(output, f"tts.wav")
    gr_info(translations["convert"].format(name=translations["text"]))

    # Create only the parent directory. A bare file name has no parent, and
    # falling back to ``output`` itself would create a directory at the file's
    # path and make the file impossible to write.
    output_dir = os.path.dirname(output)
    if output_dir and not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)

    if srt_input.endswith(".srt"): srt_tts(srt_input, output, voice, 0, 24000, google)
    else: synthesize_tts(prompt, voice, speed, output, pitch, google)

    gr_info(translations["success"])
    return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/ui.py DELETED
@@ -1,362 +0,0 @@
1
- import os
2
- import re
3
- import sys
4
- import json
5
- import torch
6
- import shutil
7
-
8
- import gradio as gr
9
- import sounddevice as sd
10
-
11
- sys.path.append(os.getcwd())
12
-
13
- from main.library.backends import directml, opencl
14
- from main.inference.realtime.audio import list_audio_device
15
- from main.app.variables import config, configs, configs_json, logger, translations, edgetts, google_tts_voice, method_f0, method_f0_full, vr_models, mdx_models, demucs_models, embedders_model, spin_model, whisper_model
16
-
17
def gr_info(message):
    """Send `message` to `gr.Info` (duration 2) and log it at INFO level."""
    gr.Info(message, duration=2)
    logger.info(message)


def gr_warning(message):
    """Send `message` to `gr.Warning` (duration 2) and log it at WARNING level."""
    gr.Warning(message, duration=2)
    logger.warning(message)


def gr_error(message):
    """Build a `gr.Error` (duration 6) for `message` and log it at ERROR level.

    NOTE(review): the `gr.Error` object is created but never raised. Gradio documents
    `gr.Error` as an exception to be raised, so this call may not surface anything in
    the UI - confirm before relying on it.
    """
    gr.Error(message=message, duration=6)
    logger.error(message)
28
-
29
def get_gpu_info():
    """Return a newline-separated description of the detected GPUs.

    CUDA devices are listed first (with memory size in GB); if none are found the
    DirectML backend, then the OpenCL backend, is queried. When nothing is found or
    `config.cpu_mode` is set, the translated "no_support_gpu" message is returned.
    """
    descriptions = []

    for i in range(torch.cuda.device_count()):
        # +0.4 before truncation nudges e.g. 7.6x GB up to 8 GB.
        size_gb = int(torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + 0.4)
        descriptions.append(f"{i}: {torch.cuda.get_device_name(i)} ({size_gb} GB)")

    if len(descriptions) == 0:
        if directml.torch_available:
            backend = directml
        elif opencl.torch_available:
            backend = opencl
        else:
            backend = None

        if backend is not None:
            descriptions = [f"{i}: {backend.device_name(i)}" for i in range(backend.device_count())]

    if len(descriptions) > 0 and not config.cpu_mode:
        return "\n".join(descriptions)

    return translations["no_support_gpu"]
49
-
50
def gpu_number_str():
    """Return the device indices joined by "-" (e.g. "0-1"), or "-" when no GPU is usable.

    "-" is returned in CPU mode and when none of CUDA, DirectML or OpenCL reports
    itself as available.
    """
    if config.cpu_mode: return "-"

    count = torch.cuda.device_count()
    if count == 0:
        count = directml.device_count() if directml.torch_available else opencl.device_count()

    any_backend = torch.cuda.is_available() or directml.is_available() or opencl.is_available()
    if not any_backend: return "-"

    return "-".join(str(i) for i in range(count))
57
-
58
def change_f0_choices():
    """Build a dropdown update listing every `.txt` file under `configs["f0_path"]`.

    Paths are absolute and sorted; the first one (or "" when there are none) is selected.
    """
    found = []

    for root, _, files in os.walk(configs["f0_path"]):
        for fname in files:
            if fname.endswith(".txt"):
                found.append(os.path.abspath(os.path.join(root, fname)))

    found.sort()
    selected = found[0] if found else ""

    return {"value": selected, "choices": found, "__type__": "update"}
61
-
62
def change_audios_choices(input_audio):
    """Build a dropdown update listing the audio files under `configs["audios_path"]`.

    Files are matched by (case-insensitive) extension. `input_audio` is kept as the
    selected value unless it is the empty string, in which case the first file found
    (or "") is selected.
    """
    audio_exts = (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")
    found = []

    for root, _, files in os.walk(configs["audios_path"]):
        for fname in files:
            if os.path.splitext(fname)[1].lower() in audio_exts:
                found.append(os.path.abspath(os.path.join(root, fname)))

    found.sort()

    if input_audio != "":
        selected = input_audio
    else:
        selected = found[0] if found else ""

    return {"value": selected, "choices": found, "__type__": "update"}
65
-
66
def change_reference_choices():
    """Build a dropdown update listing the sub-directories of `configs["reference_path"]`.

    A trailing `_v<digits>_<word>_<True|False>_<True|False>` suffix is stripped from
    each directory name before sorting.
    """
    base_dir = configs["reference_path"]
    suffix = re.compile(r'_v\d+_(?:[A-Za-z0-9_]+?)_(True|False)_(True|False)$')

    names = []
    for entry in os.listdir(base_dir):
        if os.path.isdir(os.path.join(base_dir, entry)):
            names.append(suffix.sub('', entry))

    names.sort()

    return {"value": names[0] if names else "", "choices": names, "__type__": "update"}
69
-
70
def change_models_choices():
    """Return two dropdown updates: model weights and index files.

    Weights are the `.pth`/`.onnx` files in `configs["weights_path"]` whose names do
    not start with "G_" or "D_". Index files are the `.index` files under
    `configs["logs_path"]` whose names do not contain "trained".
    """
    models = sorted(
        name for name in os.listdir(configs["weights_path"])
        if name.endswith((".pth", ".onnx")) and not name.startswith(("G_", "D_"))
    )

    indexes = sorted(
        os.path.join(root, name)
        for root, _, files in os.walk(configs["logs_path"], topdown=False)
        for name in files
        if name.endswith(".index") and "trained" not in name
    )

    return [
        {"value": models[0] if models else "", "choices": models, "__type__": "update"},
        {"value": indexes[0] if indexes else "", "choices": indexes, "__type__": "update"}
    ]
73
-
74
def change_pretrained_choices():
    """Return dropdown updates for custom pretrained D and G checkpoints.

    Both lists come from the `.pth` files in `configs["pretrained_custom_path"]`;
    a file is listed under D when its name contains "D" and under G when it contains "G".
    """
    checkpoints = [name for name in os.listdir(configs["pretrained_custom_path"]) if name.endswith(".pth")]

    def as_update(letter):
        matches = sorted(name for name in checkpoints if letter in name)
        return {"choices": matches, "value": matches[0] if matches else "", "__type__": "update"}

    return [as_update("D"), as_update("G")]
79
-
80
def change_choices_del():
    """Return dropdown updates (choices only) for deletable weights and log folders.

    Weights are `.pth` files in `configs["weights_path"]` not starting with "G_"/"D_";
    folders are the directories in `configs["logs_path"]` other than "mute" and "reference".
    """
    weights = sorted(
        name for name in os.listdir(configs["weights_path"])
        if name.endswith(".pth") and not name.startswith(("G_", "D_"))
    )

    log_dirs = []
    for entry in os.listdir(configs["logs_path"]):
        if entry in ["mute", "reference"]: continue
        if os.path.isdir(os.path.join(configs["logs_path"], entry)):
            log_dirs.append(os.path.join(configs["logs_path"], entry))

    log_dirs.sort()

    return [{"choices": weights, "__type__": "update"}, {"choices": log_dirs, "__type__": "update"}]
82
-
83
def change_preset_choices():
    """Build a dropdown update listing the `*.conversion.json` files in `configs["presets_path"]`."""
    presets = [name for name in os.listdir(configs["presets_path"]) if name.endswith(".conversion.json")]
    presets.sort()

    return {"value": "", "choices": presets, "__type__": "update"}


def change_effect_preset_choices():
    """Build a dropdown update listing the `*.effect.json` files in `configs["presets_path"]`."""
    presets = [name for name in os.listdir(configs["presets_path"]) if name.endswith(".effect.json")]
    presets.sort()

    return {"value": "", "choices": presets, "__type__": "update"}
88
-
89
def change_tts_voice_choices(google):
    """Build a dropdown update offering `google_tts_voice` when `google` is truthy, else `edgetts`.

    The first voice of the chosen list is selected.
    """
    voices = google_tts_voice if google else edgetts

    return {"choices": voices, "value": voices[0], "__type__": "update"}
91
-
92
def change_backing_choices(backing, merge):
    """Return the update for a checkbox that depends on `backing` and `merge`.

    When either flag is truthy the target is unchecked and locked; otherwise it is
    made interactive again.

    The original third branch (an "option_not_valid" warning) could never run: once
    `backing or merge` is false, `not backing or not merge` is necessarily true. It
    has been removed.
    """
    if backing or merge:
        return {"value": False, "interactive": False, "__type__": "update"}

    return {"interactive": True, "__type__": "update"}
96
-
97
def change_download_choices(select):
    """Return ten visibility updates for the model-download panel.

    `select` is compared against translated option labels; the matching option
    decides which of the ten slots are visible. An unknown option triggers an
    "option_not_valid" warning and hides everything.
    """
    option_slots = (
        ("download_url", (0, 1, 2)),
        ("download_from_csv", (3, 4)),
        ("search_models", (5, 6)),
        ("upload", (9,)),
    )

    shown = [False] * 10

    for label_key, slots in option_slots:
        if select == translations[label_key]:
            for slot in slots:
                shown[slot] = True
            break
    else:
        gr_warning(translations["option_not_valid"])

    return [{"visible": flag, "__type__": "update"} for flag in shown]


def change_download_pretrained_choices(select):
    """Return seven visibility updates for the pretrained-download panel.

    Works like `change_download_choices`, with the options "download_url",
    "list_model" and "upload".
    """
    option_slots = (
        ("download_url", (0, 1, 2)),
        ("list_model", (3, 4, 5)),
        ("upload", (6,)),
    )

    shown = [False] * 7

    for label_key, slots in option_slots:
        if select == translations[label_key]:
            for slot in slots:
                shown[slot] = True
            break
    else:
        gr_warning(translations["option_not_valid"])

    return [{"visible": flag, "__type__": "update"} for flag in shown]
117
-
118
def get_index(model):
    """Find the `.index` file that belongs to a model file.

    The model's base name up to the first "_" and first "." is used as a stem; the
    first `.index` file under `configs["logs_path"]` (excluding names containing
    "trained") whose path contains that stem is selected.

    Args:
        model: Model file name or path; may be ``None`` or empty.

    Returns:
        An update dict whose "value" is the matching index path ("" when nothing
        matches), or ``None`` when no usable model name was given.
    """
    # Guard before touching os.path: the original checked `if model` only after
    # calling os.path.basename(model), so a None value raised TypeError.
    if not model: return None

    prefix = os.path.basename(model).split("_")[0]
    if not prefix: return None

    stem = prefix.split(".")[0]
    # An empty stem (e.g. a file named ".pth") is a substring of every path and
    # would select an arbitrary index file, so treat it as "no model".
    if not stem: return None

    index_files = (
        os.path.join(root, name)
        for root, _, files in os.walk(configs["logs_path"], topdown=False)
        for name in files
        if name.endswith(".index") and "trained" not in name
    )

    return {"value": next((path for path in index_files if stem in path), ""), "__type__": "update"}
121
-
122
def index_strength_show(index):
    """Return an update that shows the index-strength control only for an existing index file.

    The control's value is always reset to 0.5.
    """
    has_index_file = index not in ("", None) and os.path.isfile(index)

    return {"visible": has_index_file, "value": 0.5, "__type__": "update"}
124
-
125
def hoplength_show(method, hybrid_method=None):
    """Return a visibility update for the hop-length control.

    The control is shown when `method`, or `hybrid_method` if given, contains one of
    the names "mangio-crepe", "fcpe", "yin", "piptrack" or "mangio-penn" as a substring.
    """
    hop_based = ["mangio-crepe", "fcpe", "yin", "piptrack", "mangio-penn"]

    show = any(
        name in method or (hybrid_method is not None and name in hybrid_method)
        for name in hop_based
    )

    return {"visible": show, "__type__": "update"}
136
-
137
def visible(value):
    """Return an update dict that sets a component's visibility to `value`."""
    return dict(visible=value, __type__="update")


def valueFalse_interactive(value):
    """Return an update dict that resets the value to False and sets interactivity to `value`."""
    return dict(value=False, interactive=value, __type__="update")


def valueEmpty_visible1(value):
    """Return an update dict that clears the value to "" and sets visibility to `value`."""
    return dict(value="", visible=value, __type__="update")
145
-
146
def pitch_guidance_lock(vocoders):
    """Force pitch guidance on; it stays editable only while the vocoder is "Default"."""
    editable = vocoders == "Default"
    return {"value": True, "interactive": editable, "__type__": "update"}


def vocoders_lock(pitch, vocoders):
    """Keep the chosen vocoder while `pitch` is truthy; otherwise reset to "Default" and lock it."""
    chosen = vocoders if pitch else "Default"
    return {"value": chosen, "interactive": pitch, "__type__": "update"}


def unlock_f0(value):
    """Offer `method_f0_full` when `value` is truthy, else `method_f0`; "rmvpe" is selected."""
    methods = method_f0_full if value else method_f0
    return {"choices": methods, "value": "rmvpe", "__type__": "update"}


def unlock_vocoder(value, vocoder):
    """Keep `vocoder` and leave it editable only for version "v2"; otherwise reset to "Default"."""
    is_v2 = value == "v2"
    return {"value": vocoder if is_v2 else "Default", "interactive": is_v2, "__type__": "update"}


def unlock_ver(value, vocoder):
    """Force version "v2" and leave it editable while the vocoder is "Default"; otherwise keep `value` locked."""
    default_vocoder = vocoder == "Default"
    return {"value": "v2" if default_vocoder else value, "interactive": default_vocoder, "__type__": "update"}
160
-
161
def change_embedders_mode(value):
    """Build a dropdown update with the embedder models for the given mode.

    "spin" uses `spin_model`, "whisper" uses `whisper_model`, and every other mode
    uses `embedders_model`; the first entry of the chosen list is selected.
    """
    if value == "spin":
        models = spin_model
    elif value == "whisper":
        models = whisper_model
    else:
        models = embedders_model

    return {"value": models[0], "choices": models, "__type__": "update"}
168
-
169
def change_fp(fp):
    """Switch the runtime precision and persist the choice to the config file.

    Args:
        fp: Requested precision; "fp16" enables half precision, anything else means fp32.

    Returns:
        "fp16" or "fp32" - the precision actually in effect. fp16 is refused (with a
        warning, nothing written) when `config.device` is "cpu", "mps" or "ocl:0".
    """
    fp16 = fp == "fp16"

    if fp16 and config.device in ["cpu", "mps", "ocl:0"]:
        gr_warning(translations["fp16_not_support"])
        return "fp32"

    gr_info(translations["start_update_precision"])

    # Read through a context manager so the handle is closed (the original used a
    # bare open() inside json.load), and use a distinct local name so the
    # module-level `configs` is not shadowed.
    with open(configs_json, "r") as f:
        saved_configs = json.load(f)

    saved_configs["fp16"] = config.is_half = fp16

    with open(configs_json, "w") as f:
        json.dump(saved_configs, f, indent=4)

    gr_info(translations["success"])
    return "fp16" if fp16 else "fp32"
186
-
187
def process_output(file_path):
    """Resolve a path that is safe to write to, according to the "delete_exists_file" setting.

    When the setting is on (the default) an existing file at `file_path` is removed
    and the same path is returned. When it is off, the first free
    `<name>_<n><ext>` in the same directory is returned instead (or `file_path`
    itself if nothing exists there).
    """
    if config.configs.get("delete_exists_file", True):
        if os.path.isfile(file_path): os.remove(file_path)
        return file_path

    if not os.path.exists(file_path): return file_path

    folder = os.path.dirname(file_path)
    stem, ext = os.path.splitext(os.path.basename(file_path))

    counter = 1
    while True:
        candidate = os.path.join(folder, f"{stem}_{counter}{ext}")
        if not os.path.exists(candidate): return candidate
        counter += 1
200
-
201
def shutil_move(input_path, output_path):
    """Move `input_path` to `output_path`, resolving name clashes via `process_output`.

    If `output_path` is a directory the file keeps its base name inside it. Returns
    whatever `shutil.move` returns.
    """
    if os.path.isdir(output_path):
        output_path = os.path.join(output_path, os.path.basename(input_path))

    # Only consult process_output when something already occupies the destination.
    destination = process_output(output_path) if os.path.exists(output_path) else output_path

    return shutil.move(input_path, destination)
205
-
206
def separate_change(model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise):
    """Return the 14 component updates for the separation tab.

    The engine family ("vr", "mdx" or "demucs") is worked out for the main model
    and, when enabled, for the karaoke and reverb models; each returned update
    shows or unlocks a component depending on which families are in use.
    """
    if model_name in list(vr_models.keys()):
        model_type = "vr"
    elif model_name in list(mdx_models.keys()):
        model_type = "mdx"
    elif model_name in list(demucs_models.keys()):
        model_type = "demucs"
    else:
        model_type = ""

    karaoke_type = None
    if separate_backing:
        karaoke_type = "vr" if karaoke_model.startswith("VR") else "mdx"

    reverb_type = None
    if separate_reverb:
        reverb_type = "mdx" if reverb_model.startswith("MDX") else "vr"

    engines = {model_type, karaoke_type, reverb_type}
    uses_vr = "vr" in engines
    uses_mdx = "mdx" in engines
    uses_demucs = "demucs" in engines

    updates = [visible(separate_backing), visible(separate_reverb)]
    updates += [visible(uses_mdx or uses_demucs), visible(uses_mdx or uses_demucs)]
    updates += [visible(uses_mdx), visible(uses_mdx or uses_vr), visible(uses_demucs)]
    updates += [visible(uses_vr), visible(uses_vr)]
    updates += [visible(uses_vr and enable_post_process), visible(uses_vr and enable_denoise)]
    updates += [valueFalse_interactive(uses_vr) for _ in range(3)]

    return updates
233
-
234
def create_dataset_change(model_name, reverb_model, enable_post_process, separate_reverb, enable_denoise):
    """Return the 13 component updates for the dataset-creation tab.

    The engine family ("vr", "mdx" or "demucs") is worked out for the main model
    and, when enabled, for the reverb model; each returned update shows or unlocks
    a component depending on which families are in use.
    """
    if model_name in list(vr_models.keys()):
        model_type = "vr"
    elif model_name in list(mdx_models.keys()):
        model_type = "mdx"
    elif model_name in list(demucs_models.keys()):
        model_type = "demucs"
    else:
        model_type = ""

    reverb_type = None
    if separate_reverb:
        reverb_type = "mdx" if reverb_model.startswith("MDX") else "vr"

    engines = {model_type, reverb_type}
    uses_vr = "vr" in engines
    uses_mdx = "mdx" in engines
    uses_demucs = "demucs" in engines

    updates = [visible(separate_reverb)]
    updates += [visible(uses_mdx or uses_demucs), visible(uses_mdx or uses_demucs)]
    updates += [visible(uses_mdx), visible(uses_mdx or uses_vr), visible(uses_demucs)]
    updates += [visible(uses_vr), visible(uses_vr)]
    updates += [visible(uses_vr and enable_post_process), visible(uses_vr and enable_denoise)]
    updates += [valueFalse_interactive(uses_vr) for _ in range(3)]

    return updates
258
-
259
def audio_device():
    """Enumerate audio devices as two label -> [device index, max channels] mappings.

    Output devices are ordered so that names containing "virtual" come first, then
    names containing "vb", then the rest; input devices use the reverse order.
    Labels have the form "<position>: <name> (<host api>)".

    Returns:
        A tuple ``(input_devices, output_devices)`` of dicts. Both are empty dicts
        when enumeration fails, so callers can always use mapping methods such as
        ``.get`` or ``.keys`` (the original returned lists on failure, which made
        ``.keys()`` callers crash).
    """
    try:
        input_devices, output_devices = list_audio_device()

        def priority(name):
            lowered = name.lower()
            if "virtual" in lowered:
                return 0
            if "vb" in lowered:
                return 1
            return 2

        output_sorted = sorted(output_devices, key=lambda d: priority(d.name))
        input_sorted = sorted(input_devices, key=lambda d: priority(d.name), reverse=True)

        # enumerate() gives each device its own position; list.index() rescanned the
        # list per device and gave equal devices the same number.
        input_device_list = {
            f"{position}: {d.name} ({d.host_api})": [d.index, d.max_input_channels]
            for position, d in enumerate(input_sorted, start=1)
        }
        output_device_list = {
            f"{position}: {d.name} ({d.host_api})": [d.index, d.max_output_channels]
            for position, d in enumerate(output_sorted, start=1)
        }

        return input_device_list, output_device_list
    except Exception:
        # Best effort: with no usable audio backend the UI simply gets no devices.
        return {}, {}
286
-
287
def update_audio_device(input_device, output_device, monitor_device, monitor):
    """Return the seven component updates that follow a device-selection change.

    The first two updates follow `monitor`; the next two depend on whether the
    monitor device, or any selected device, has "ASIO" in its label. The last three
    are channel controls for the input, output and monitor device, each shown only
    for an ASIO device and capped at that device's maximum channel count.

    If any channel lookup fails, all three maxima fall back to -1.
    """
    input_channels_map, output_channels_map = audio_device()

    input_is_asio = "ASIO" in input_device if input_device else False
    output_is_asio = "ASIO" in output_device if output_device else False
    monitor_is_asio = "ASIO" in monitor_device if monitor_device else False

    try:
        input_max_ch = input_channels_map.get(input_device, [])[1]
        output_max_ch = output_channels_map.get(output_device, [])[1]
        monitor_max_ch = output_channels_map.get(monitor_device, [])[1] if monitor else 128
    except Exception:
        # Unknown device label (IndexError on the empty default) or a non-mapping
        # result from audio_device(). Narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        input_max_ch = output_max_ch = monitor_max_ch = -1

    return [
        visible(monitor),
        visible(monitor),
        visible(monitor_is_asio),
        visible(input_is_asio or output_is_asio or monitor_is_asio),
        gr.update(visible=input_is_asio, maximum=input_max_ch),
        gr.update(visible=output_is_asio, maximum=output_max_ch),
        gr.update(visible=monitor_is_asio, maximum=monitor_max_ch)
    ]
310
-
311
def change_audio_device_choices():
    """Re-scan audio devices and return dropdown updates for input, output and monitor.

    `sounddevice` is terminated and re-initialised first (via `sd._terminate()` and
    `sd._initialize()`), then the device lists are rebuilt. The monitor dropdown
    offers the same choices as the output dropdown.
    """
    sd._terminate()
    sd._initialize()

    input_channels_map, output_channels_map = audio_device()
    # list(x) yields the labels for a dict and also accepts the empty-list fallback
    # that audio_device() returns on failure; calling .keys() crashed in that case.
    input_labels, output_labels = list(input_channels_map), list(output_channels_map)

    first_input = input_labels[0] if input_labels else ""
    first_output = output_labels[0] if output_labels else ""

    return [
        {"value": first_input, "choices": input_labels, "__type__": "update"},
        {"value": first_output, "choices": output_labels, "__type__": "update"},
        {"value": first_output, "choices": output_labels, "__type__": "update"}
    ]
323
-
324
- def replace_punctuation(filename):
325
- return filename.replace(" ", "_").replace("-", "").replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").replace("'", "").replace("|", "_").replace("{", "").replace("}", "").replace("-_-", "_").replace("_-_", "_").replace("-", "_").replace("---", "_").replace("___", "_").strip()
326
-
327
- def replace_url(url):
328
- return url.replace("/blob/", "/resolve/").replace("?download=true", "").strip()
329
-
330
- def replace_modelname(modelname):
331
- return replace_punctuation(modelname.replace(".onnx", "").replace(".pth", "").replace(".index", "").replace(".zip", ""))
332
-
333
def replace_export_format(audio_path, export_format = "wav"):
    """Return `audio_path` with its file extension swapped for `export_format`.

    Args:
        audio_path: Path of the audio file.
        export_format: Target extension without the leading dot.

    Returns:
        The path unchanged when it already ends with the target extension or has no
        extension at all; otherwise the path with only its final extension replaced.
    """
    suffix = f".{export_format}"
    if audio_path.endswith(suffix): return audio_path

    # os.path.splitext touches only the final extension. The original used
    # str.replace on ".<ext>", which also rewrote matching text earlier in the path
    # (e.g. "out.mp3/track.mp3" became "out.wav/track.wav").
    root, extension = os.path.splitext(audio_path)
    if not extension: return audio_path

    return root + suffix
337
-
338
def update_dropdowns_from_json(data):
    """Return three dropdown updates (input, output, monitor) from a device payload.

    The choices are the keys of `data["inputs"]` and `data["outputs"]`; the third
    dropdown reuses the output choices. Falsy `data` clears all three.
    """
    if not data:
        return [gr.update(choices=[], value=None) for _ in range(3)]

    inputs = list(data.get("inputs", {}).keys())
    outputs = list(data.get("outputs", {}).keys())

    first_input = inputs[0] if len(inputs) > 0 else None
    first_output = outputs[0] if len(outputs) > 0 else None

    return [
        gr.update(choices=inputs, value=first_input),
        gr.update(choices=outputs, value=first_output),
        gr.update(choices=outputs, value=first_output),
    ]


def update_button_from_json(data):
    """Return interactivity updates for the start and stop buttons.

    Reads "start_button" (default True) and "stop_button" (default False) from
    `data`; falsy `data` yields the defaults.
    """
    start_enabled, stop_enabled = True, False

    if data:
        start_enabled = data.get("start_button", True)
        stop_enabled = data.get("stop_button", False)

    return [gr.update(interactive=start_enabled), gr.update(interactive=stop_enabled)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/core/utils.py DELETED
@@ -1,61 +0,0 @@
1
- import os
2
- import sys
3
- import json
4
- import codecs
5
- import requests
6
-
7
- sys.path.append(os.getcwd())
8
-
9
- from main.app.core.ui import gr_info, gr_warning
10
- from main.app.variables import translations, configs
11
-
12
def stop_pid(pid_file, model_name=None, train=False):
    """Kill the processes recorded in a PID file and clean the file up.

    Args:
        pid_file: Base name (without ".txt") of the PID file.
        model_name: When given, the PID file is looked up in that model's folder
            under `configs["logs_path"]`; otherwise in "assets".
        train: When true (and `model_name` is given), PIDs stored under
            "process_pids" in the model's config.json are also killed and that key
            is removed from the file.

    Returns:
        The result of `gr_warning` when the PID file is missing, otherwise ``None``.
        Any unexpected error is swallowed: stopping is best effort.
    """
    def kill_all(pids):
        for pid in pids:
            try:
                os.kill(pid, 9)  # signal 9 (SIGKILL on POSIX)
            except OSError:
                # Already exited or not ours to kill: keep going so one stale PID
                # does not prevent the remaining processes from being stopped.
                continue

    try:
        if model_name is None:
            pid_file_path = os.path.join("assets", f"{pid_file}.txt")
        else:
            pid_file_path = os.path.join(configs["logs_path"], model_name, f"{pid_file}.txt")

        if not os.path.exists(pid_file_path): return gr_warning(translations["not_found_pid"])

        with open(pid_file_path, "r") as handle:
            # Skip blank lines (e.g. a trailing newline) instead of failing on int("").
            pids = [int(line) for line in handle if line.strip()]

        kill_all(pids)

        if os.path.exists(pid_file_path): os.remove(pid_file_path)

        # The config path is only built when a model name exists. The original built
        # it unconditionally, so with model_name=None os.path.join raised TypeError,
        # which the blanket except hid - and the success message below never fired.
        if train and model_name is not None:
            config_path = os.path.join(configs["logs_path"], model_name, "config.json")

            if os.path.exists(config_path):
                with open(config_path, "r") as handle:
                    pid_data = json.load(handle)

                train_pids = pid_data.pop("process_pids", [])

                with open(config_path, "w") as handle:
                    json.dump(pid_data, handle, indent=4)

                kill_all(train_pids)

        gr_info(translations["end_pid"])
    except Exception:
        # Deliberately best effort, as before; narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate.
        pass
44
-
45
def google_translate(text, source='auto', target='vi'):
    """Translate `text` through the Google Translate web endpoint.

    Args:
        text: Text to translate.
        source: Source language code ('auto' lets the service detect it).
        target: Target language code.

    Returns:
        The translated text. The original `text` is returned on any failure, and
        the result of `gr_warning` is returned when `text` is empty.

    Note:
        The text is split with `textwrap.wrap` into chunks of at most 5000
        characters; with its default settings that call also normalises whitespace.
    """
    if text == "": return gr_warning(translations["prompt_warning"])

    try:
        import textwrap

        def translate_chunk(chunk):
            # A timeout keeps an unresponsive endpoint from blocking the caller forever.
            response = requests.get(codecs.decode("uggcf://genafyngr.tbbtyrncvf.pbz/genafyngr_n/fvatyr", "rot13"), params={'client': 'gtx', 'sl': source, 'tl': target, 'dt': 't', 'q': chunk}, timeout=10)
            if response.status_code != 200: return chunk

            return ''.join(part[0] for part in response.json()[0] if part[0])

        chunks = textwrap.wrap(text, 5000, break_long_words=False, break_on_hyphens=False)

        # wrap() drops the whitespace at each break, so chunks are re-joined with a
        # space; plain concatenation fused the words on either side of a boundary.
        return " ".join(translate_chunk(chunk) for chunk in chunks)
    except Exception:
        # Network/parse failure: fall back to the untranslated text.
        return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/parser.py DELETED
@@ -1,369 +0,0 @@
1
- import os
2
- import sys
3
-
4
- sys.path.append(os.getcwd())
5
-
6
# The first command-line argument selects the sub-command; None when absent.
try:
    argv = sys.argv[1]
except IndexError:
    argv = None

# Existing positions are kept as they are because the dispatch below refers to
# entries by index. "--help_create_reference" is appended so the dashed spelling
# (consistent with every other flag) is accepted; entry 18 lacks the leading "--".
argv_is_allows = ["--audio_effects", "--convert", "--create_dataset", "--create_index", "--extract", "--preprocess", "--separator_music", "--train", "--help_audio_effects", "--help_convert", "--help_create_dataset", "--help_create_index", "--help_extract", "--help_preprocess", "--help_separate_music", "--help_train", "--help", "--create_reference", "help_create_reference", "--help_create_reference"]

if argv not in argv_is_allows:
    print("Cú pháp không hợp lệ! Sử dụng --help để biết thêm")
    quit()
16
-
17
- if argv_is_allows[0] in argv: from main.inference.audio_effects import main
18
- elif argv_is_allows[1] in argv: from main.inference.conversion.convert import main
19
- elif argv_is_allows[2] in argv: from main.inference.create_dataset import main
20
- elif argv_is_allows[3] in argv: from main.inference.create_index import main
21
- elif argv_is_allows[4] in argv: from main.inference.extracting.extract import main
22
- elif argv_is_allows[5] in argv: from main.inference.preprocess.preprocess import main
23
- elif argv_is_allows[6] in argv: from main.inference.separate_music import main
24
- elif argv_is_allows[7] in argv: from main.inference.training.train import main
25
- elif argv_is_allows[17] in argv: from main.inference.create_reference import main
26
- elif argv_is_allows[8] in argv:
27
- print("""Các tham số của `--audio_effects`:
28
- 1. Đường dẫn tệp:
29
- - `--input_path` (bắt buộc): Đường dẫn đến tệp âm thanh đầu vào.
30
- - `--output_path` (mặc định: `./audios/apply_effects.wav`): Đường dẫn lưu tệp đầu ra.
31
- - `--export_format` (mặc định: `wav`): Định dạng xuất tệp (`wav`, `mp3`, ...).
32
-
33
- 2. Lấy mẫu lại:
34
- - `--resample` (mặc định: `False`): Có lấy mẫu lại hay không.
35
- - `--resample_sr` (mặc định: `0`): Tần số lấy mẫu mới (Hz).
36
-
37
- 3. Hiệu ứng chorus:
38
- - `--chorus`: Bật/tắt chorus.
39
- - `--chorus_depth`, `--chorus_rate`, `--chorus_mix`, `--chorus_delay`, `--chorus_feedback`: Các thông số điều chỉnh chorus.
40
-
41
- 4. Hiệu ứng distortion:
42
- - `--distortion`: Bật/tắt distortion.
43
- - `--drive_db`: Mức độ méo âm thanh.
44
-
45
- 5. Hiệu ứng reverb:
46
- - `--reverb`: Bật/tắt hồi âm.
47
- - `--reverb_room_size`, `--reverb_damping`, `--reverb_wet_level`, `--reverb_dry_level`, `--reverb_width`, `--reverb_freeze_mode`: Điều chỉnh hồi âm.
48
-
49
- 6. Hiệu ứng pitch shift:
50
- - `--pitchshift`: Bật/tắt thay đổi cao độ.
51
- - `--pitch_shift`: Giá trị dịch cao độ.
52
-
53
- 7. Hiệu ứng delay:
54
- - `--delay`: Bật/tắt delay.
55
- - `--delay_seconds`, `--delay_feedback`, `--delay_mix`: Điều chỉnh thời gian trễ, phản hồi và hòa trộn.
56
-
57
- 8. Compressor:
58
- - `--compressor`: Bật/tắt compressor.
59
- - `--compressor_threshold`, `--compressor_ratio`, `--compressor_attack_ms`, `--compressor_release_ms`: Các thông số nén.
60
-
61
- 9. Limiter:
62
- - `--limiter`: Bật/tắt giới hạn mức âm thanh.
63
- - `--limiter_threshold`, `--limiter_release`: Ngưỡng giới hạn và thời gian nhả.
64
-
65
- 10. Gain (Khuếch đại):
66
- - `--gain`: Bật/tắt gain.
67
- - `--gain_db`: Mức gain (dB).
68
-
69
- 11. Bitcrush:
70
- - `--bitcrush`: Bật/tắt hiệu ứng giảm độ phân giải.
71
- - `--bitcrush_bit_depth`: Số bit của bitcrush.
72
-
73
- 12. Clipping:
74
- - `--clipping`: Bật/tắt cắt âm thanh.
75
- - `--clipping_threshold`: Ngưỡng clipping.
76
-
77
- 13. Phaser:
78
- - `--phaser`: Bật/tắt hiệu ứng phaser.
79
- - `--phaser_rate_hz`, `--phaser_depth`, `--phaser_centre_frequency_hz`, `--phaser_feedback`, `--phaser_mix`: Điều chỉnh hiệu ứng phaser.
80
-
81
- 14. Boost bass & treble:
82
- - `--treble_bass_boost`: Bật/tắt tăng cường âm bass và treble.
83
- - `--bass_boost_db`, `--bass_boost_frequency`, `--treble_boost_db`, `--treble_boost_frequency`: Các thông số tăng bass và treble.
84
-
85
- 15. Fade in & fade out:
86
- - `--fade_in_out`: Bật/tắt hiệu ứng fade.
87
- - `--fade_in_duration`, `--fade_out_duration`: Thời gian fade vào/ra.
88
-
89
- 16. Kết hợp âm thanh:
90
- - `--audio_combination`: Bật/tắt ghép nhiều tệp âm thanh.
91
- - `--audio_combination_input`: Đường dẫn tệp âm thanh bổ sung.
92
- - `--main_volume`: Âm lượng của âm thanh chính.
93
- - `--combination_volume`:: Âm lượng của âm thanh cần kết hợp.
94
- """)
95
- quit()
96
- elif argv_is_allows[9] in argv:
97
- print("""Các tham số của --convert:
98
- 1. Cấu hình xử lí giọng nói:
99
- - `--pitch` (mặc định: `0`): Điều chỉnh cao độ.
100
- - `--filter_radius` (mặc định: `3`): Độ mượt của đường F0.
101
- - `--index_rate` (mặc định: `0.5`): Tỷ lệ sử dụng chỉ mục giọng nói.
102
- - `--rms_mix_rate` (mặc định: `1`): Hệ số điều chỉnh biên độ âm lượng.
103
- - `--protect` (mặc định: `0.33`): Bảo vệ phụ âm.
104
- - `--hop_length` (mặc định: `64`): Bước nhảy khi xử lí âm thanh.
105
-
106
- 2. Cấu hình F0:
107
- - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`).
108
- - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không.
109
- - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0.
110
- - `--f0_file` (mặc định: ``): Đường dẫn tệp F0 có sẵn.
111
- - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không.
112
- - `--proposal_pitch` (mặc định: `False`): Đề xuất cao độ thay vì điều chỉnh thủ công.
113
- - `--proposal_pitch_threshold` (mặc định: `0.0`): Ngưỡng tần số ước tính cao độ.
114
- - `--alpha` (mặc định: `0.5`): Ngưỡng trộn cao độ khi ước tính cao độ hybrid.
115
-
116
- 3. Mô hình nhúng:
117
- - `--embedder_model` (mặc định: `hubert_base`): Mô hình nhúng sử dụng.
118
- - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`, `whisper`).
119
-
120
- 4. Đường dẫn tệp:
121
- - `--input_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào.
122
- - `--output_path` (mặc định: `./audios/output.wav`): Đường dẫn lưu tệp đầu ra.
123
- - `--export_format` (mặc định: `wav`): Định dạng xuất tệp.
124
- - `--pth_path` (bắt buộc): Đường dẫn đến tệp mô hình `.pth`.
125
- - `--index_path` (mặc định: `None`): Đường dẫn tệp chỉ mục (nếu có).
126
-
127
- 5. Làm sạch âm thanh:
128
- - `--clean_audio` (mặc định: `False`): Có áp dụng làm sạch âm thanh không.
129
- - `--clean_strength` (mặc định: `0.7`): Mức độ làm sạch.
130
-
131
- 6. Resampling & chia nhỏ âm thanh:
132
- - `--resample_sr` (mặc định: `0`): Tần số lấy mẫu mới (0 nghĩa là giữ nguyên).
133
- - `--split_audio` (mặc định: `False`): Có chia nhỏ audio trước khi xử lí không.
134
-
135
- 7. Kiểm tra & tối ưu hóa:
136
- - `--checkpointing` (mặc định: `False`): Bật/tắt checkpointing để tiết kiệm RAM.
137
-
138
- 8. Dịch formant:
139
- - `--formant_shifting` (mặc định: `False`): Có bật hiệu ứng dịch formant không.
140
- - `--formant_qfrency` (mặc định: `0.8`): Hệ số dịch formant theo tần số.
141
- - `--formant_timbre` (mặc định: `0.8`): Hệ số thay đổi màu sắc giọng.
142
- """)
143
- quit()
144
- elif argv_is_allows[10] in argv:
145
- print("""Các tham số của --create_dataset:
146
- 1. Đường dẫn & cấu hình dataset:
147
- - `--input_data` (bắt buộc): Đường dẫn liên kết đến âm thanh (Liên kết Youtube, có thể dùng dấu `,` để dùng nhiều liên kết).
148
- - `--output_dirs` (mặc định: `./dataset`): Thư mục xuất dữ liệu đầu ra.
149
- - `--sample_rate` (mặc định: `48000`): Tần số lấy mẫu cho âm thanh.
150
-
151
- 2. Làm sạch dữ liệu:
152
- - `--clean_dataset` (mặc định: `False`): Có áp dụng làm sạch dữ liệu hay không.
153
- - `--clean_strength` (mặc định: `0.7`): Mức độ làm sạch dữ liệu.
154
-
155
- 3. Tách giọng & hiệu ứng:
156
- - `--separate` (mặc định: `True`): có tách nhạc hay không.
157
- - `--separator_reverb` (mặc định: `False`): Có tách vang giọng không.
158
- - `--model_name` (mặc định: `MDXNET_Main`): Mô hình tách nhạc ('Main_340', 'Main_390', 'Main_406', 'Main_427', 'Main_438', 'Inst_full_292', 'Inst_HQ_1', 'Inst_HQ_2', 'Inst_HQ_3', 'Inst_HQ_4', 'Inst_HQ_5', 'Kim_Vocal_1', 'Kim_Vocal_2', 'Kim_Inst', 'Inst_187_beta', 'Inst_82_beta', 'Inst_90_beta', 'Voc_FT', 'Crowd_HQ', 'MDXNET_9482', 'Inst_1', 'Inst_2', 'Inst_3', 'MDXNET_1_9703', 'MDXNET_2_9682', 'MDXNET_3_9662', 'Inst_Main', 'MDXNET_Main', 'HT-Tuned', 'HT-Normal', 'HD_MMI', 'HT_6S', 'HP-1', 'HP-2', 'HP-Vocal-1', 'HP-Vocal-2', 'HP2-1', 'HP2-2', 'HP2-3', 'SP-2B-1', 'SP-2B-2', 'SP-3B-1', 'SP-4B-1', 'SP-4B-2', 'SP-MID-1', 'SP-MID-2').
159
- - `--reverb_model` (mặc định: `MDX-Reverb`): Mô hình tách nhạc ("MDX-Reverb", 'VR-Reverb', 'Echo-Aggressive', 'Echo-Normal').
160
- - `--denoise_model` (mặc định: `Normal`): Mô hình tách nhạc ('Lite', 'Normal').
161
-
162
- 4. Cấu hình xử lí âm thanh:
163
- - `--shifts` (mặc định: `2`): Số lượng dự đoán.
164
- - `--batch_size` (mặc định: `1`): Kích thước lô.
165
- - `--overlap` (mặc định: `0.25`): Mức độ chồng lấn giữa các đoạn.
166
- - `--aggression` (mặc định: `5`): Cường độ chiết xuất thân chính.
167
- - `--hop_length` (mặc định: `1024`): Bước nhảy MDX khi xử lí.
168
- - `--window_size` (mặc định: `512`): Kích thước cửa sổ.
169
- - `--segments_size` (mặc định: `256`): Kích thước phân đoạn âm thanh.
170
- - `--post_process_threshold` (mặc định: `0.2`): Mức độ xử lí hậu kỳ sau khi tách nhạc.
171
-
172
- 5. Cấu hình xử lí âm thanh khác:
173
- - `--enable_tta` (mặc định: `False`): Tăng cường suy luận.
174
- - `--enable_denoise` (mặc định: `False`): Khữ tách nhạc.
175
- - `--high_end_process` (mặc định: `False`): Xử lí dải cao.
176
- - `--enable_post_process` (mặc định: `False`): Hậu xử lí.
177
-
178
- 6. Bỏ qua phần âm thanh:
179
- - `--skip_seconds` (mặc định: `False`): Có bỏ qua giây âm thanh nào không.
180
- - `--skip_start_audios` (mặc định: `0`): Thời gian (giây) cần bỏ qua ở đầu audio.
181
- - `--skip_end_audios` (mặc định: `0`): Thời gian (giây) cần bỏ qua ở cuối audio.
182
- """)
183
- quit()
184
- elif argv_is_allows[11] in argv:
185
- print("""Các tham số của --create_index:
186
- 1. Thông tin mô hình:
187
- - `--model_name` (bắt buộc): Tên mô hình.
188
- - `--rvc_version` (mặc định: `v2`): Phiên bản (`v1`, `v2`).
189
- - `--index_algorithm` (mặc định: `Auto`): Thuật toán index sử dụng (`Auto`, `Faiss`, `KMeans`).
190
- """)
191
- quit()
192
- elif argv_is_allows[12] in argv:
193
- print("""Các tham số của --extract:
194
- 1. Thông tin mô hình:
195
- - `--model_name` (bắt buộc): Tên mô hình.
196
- - `--rvc_version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`).
197
-
198
- 2. Cấu hình F0:
199
- - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`).
200
- - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không.
201
- - `--pitch_guidance` (mặc định: `True`): Có sử dụng hướng dẫn cao độ hay không.
202
- - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không.
203
- - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0.
204
- - `--alpha` (mặc định: `0.5`): Ngưỡng trộn cao độ khi ước tính cao độ hybrid.
205
-
206
- 3. Cấu hình xử lí:
207
- - `--hop_length` (mặc định: `128`): Độ dài bước nhảy trong quá trình xử lí.
208
- - `--cpu_cores` (mặc định: `2`): Số lượng luồng CPU sử dụng.
209
- - `--gpu` (mặc định: `-`): Chỉ định GPU sử dụng (ví dụ: `0` cho GPU đầu tiên, `-` để tắt GPU).
210
- - `--sample_rate` (bắt buộc): Tần số lấy mẫu của âm thanh đầu vào.
211
-
212
- 4. Cấu hình nhúng:
213
- - `--embedder_model` (mặc định: `hubert_base`): Tên mô hình nhúng.
214
- - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`, `whisper`).
215
-
216
- 4. RMS:
217
- - `--rms_extract` (mặc định: False): Trích xuất thêm năng lượng rms.
218
- """)
219
- quit()
220
- elif argv_is_allows[13] in argv:
221
- print("""Các tham số của --preprocess:
222
- 1. Thông tin mô hình:
223
- - `--model_name` (bắt buộc): Tên mô hình.
224
-
225
- 2. Cấu hình dữ liệu:
226
- - `--dataset_path` (mặc định: `./dataset`): Đường dẫn thư mục chứa tệp dữ liệu.
227
- - `--sample_rate` (bắt buộc): Tần số lấy mẫu của dữ liệu âm thanh.
228
-
229
- 3. Cấu hình xử lí:
230
- - `--cpu_cores` (mặc định: `2`): Số lượng luồng CPU sử dụng.
231
- - `--cut_preprocess` (mặc định: `Automatic`): Cách cắt dữ liệu tiền xử lí (`Automatic`, `Simple`, `Skip`).
232
- - `--process_effects` (mặc định: `False`): Có áp dụng tiền xử lí hay không.
233
- - `--clean_dataset` (mặc định: `False`): Có làm sạch tệp dữ liệu hay không.
234
- - `--clean_strength` (mặc định: `0.7`): Độ mạnh của quá trình làm sạch dữ liệu.
235
-
236
- 4. Cấu hình khác:
237
- - `--chunk_len` (mặc định: `3.0`): Độ dài của đoạn âm thanh cho phương pháp 'Simple'.
238
- - `--overlap_len` (mặc định: `0.3`): Độ dài của phần chồng chéo giữa các lát cắt đối với phương pháp 'Simple'.
239
- - `--normalization_mode` (mặc định: `none`): Có xử lí chuẩn hóa âm thanh không (`none`, `pre`, `post`)
240
- """)
241
- quit()
242
- elif argv_is_allows[14] in argv:
243
- print("""Các tham số của --separate_music:
244
- 1. Cấu hình đầu vào, đầu ra:
245
- - `--input_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào.
246
- - `--output_dirs` (mặc định: `./audios`): Thư mục lưu tệp đầu ra.
247
- - `--export_format` (mặc định: `wav`): Định dạng xuất tệp (`wav`, `mp3`,...).
248
- - `--sample_rate` (mặc định: `44100`): Tần số lấy mẫu của âm thanh đầu ra.
249
-
250
- 2. Cấu hình mô hình:
251
- - `--model_name` (mặc định: `MDXNET_Main`): Mô hình tách nhạc ('Main_340', 'Main_390', 'Main_406', 'Main_427', 'Main_438', 'Inst_full_292', 'Inst_HQ_1', 'Inst_HQ_2', 'Inst_HQ_3', 'Inst_HQ_4', 'Inst_HQ_5', 'Kim_Vocal_1', 'Kim_Vocal_2', 'Kim_Inst', 'Inst_187_beta', 'Inst_82_beta', 'Inst_90_beta', 'Voc_FT', 'Crowd_HQ', 'MDXNET_9482', 'Inst_1', 'Inst_2', 'Inst_3', 'MDXNET_1_9703', 'MDXNET_2_9682', 'MDXNET_3_9662', 'Inst_Main', 'MDXNET_Main', 'HT-Tuned', 'HT-Normal', 'HD_MMI', 'HT_6S', 'HP-1', 'HP-2', 'HP-Vocal-1', 'HP-Vocal-2', 'HP2-1', 'HP2-2', 'HP2-3', 'SP-2B-1', 'SP-2B-2', 'SP-3B-1', 'SP-4B-1', 'SP-4B-2', 'SP-MID-1', 'SP-MID-2').
252
- - `--karaoke_model` (mặc định: `MDX-Version-1`): Mô hình tách nhạc ('MDX-Version-1', 'MDX-Version-2', 'VR-Version-1', 'VR-Version-2').
253
- - `--reverb_model` (mặc định: `MDX-Reverb`): Mô hình tách nhạc ("MDX-Reverb", 'VR-Reverb', 'Echo-Aggressive', 'Echo-Normal').
254
- - `--denoise_model` (mặc định: `Normal`): Mô hình tách nhạc ('Lite', 'Normal').
255
-
256
- 3. Cấu hình xử lí âm thanh:
257
- - `--shifts` (mặc định: `2`): Số lượng dự đoán.
258
- - `--batch_size` (mặc định: `1`): Kích thước lô.
259
- - `--overlap` (mặc định: `0.25`): Mức độ chồng lấn giữa các đoạn.
260
- - `--aggression` (mặc định: `5`): Cường độ chiết xuất thân chính.
261
- - `--hop_length` (mặc định: `1024`): Bước nhảy MDX khi xử lí.
262
- - `--window_size` (mặc định: `512`): Kích thước cửa sổ.
263
- - `--segments_size` (mặc định: `256`): Kích thước phân đoạn âm thanh.
264
- - `--post_process_threshold` (mặc định: `0.2`): Mức độ xử lí hậu kỳ sau khi tách nhạc.
265
-
266
- 4. Cấu hình xử lí âm thanh khác:
267
- - `--enable_tta` (mặc định: `False`): Tăng cường suy luận.
268
- - `--enable_denoise` (mặc định: `False`): Khữ tách nhạc.
269
- - `--high_end_process` (mặc định: `False`): Xử lí dải cao.
270
- - `--enable_post_process` (mặc định: `False`): Hậu xử lí.
271
- - `--separate_backing` (mặc định: `False`): Tách bè giọng.
272
- - `--separate_reverb` (mặc định: `False`): Tách vang giọng.
273
- """)
274
- quit()
275
- elif argv_is_allows[15] in argv:
276
- print("""Các tham số của --train:
277
- 1. Cấu hình mô hình:
278
- - `--model_name` (bắt buộc): Tên mô hình.
279
- - `--rvc_version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`).
280
- - `--model_author` (tùy chọn): Tác giả của mô hình.
281
-
282
- 2. Cấu hình lưu:
283
- - `--save_every_epoch` (bắt buộc): Số kỷ nguyên giữa mỗi lần lưu.
284
- - `--save_only_latest` (mặc định: `True`): Chỉ lưu điểm mới nhất.
285
- - `--save_every_weights` (mặc định: `True`): Lưu tất cả trọng số của mô hình.
286
-
287
- 3. Cấu hình huấn luyện:
288
- - `--total_epoch` (mặc định: `300`): Tổng số kỷ nguyên huấn luyện.
289
- - `--batch_size` (mặc định: `8`): Kích thước lô trong quá trình huấn luyện.
290
-
291
- 4. Cấu hình thiết bị:
292
- - `--gpu` (mặc định: `0`): Chỉ định GPU để sử dụng (số hoặc `-` nếu không dùng GPU).
293
- - `--cache_data_in_gpu` (mặc định: `False`): Lưu dữ liệu vào GPU để tăng tốc.
294
-
295
- 5. Cấu hình huấn luyện nâng cao:
296
- - `--pitch_guidance` (mặc định: `True`): Sử dụng hướng dẫn cao độ.
297
- - `--g_pretrained_path` (mặc định: ``): Đường dẫn đến trọng số G đã huấn luyện trước.
298
- - `--d_pretrained_path` (mặc định: ``): Đường dẫn đến trọng số D đã huấn luyện trước.
299
- - `--vocoder` (mặc định: `Default`): Bộ mã hóa được sử dụng (`Default`, `MRF-HiFi-GAN`, `RefineGAN`).
300
- - `--energy_use` (mặc định: `False`): Sử dụng năng lượng rms.
301
-
302
- 6. Phát hiện huấn luyện quá mức:
303
- - `--overtraining_detector` (mặc định: `False`): Bật/tắt chế độ phát hiện huấn luyện quá mức.
304
- - `--overtraining_threshold` (mặc định: `50`): Ngưỡng để xác định huấn luyện quá mức.
305
-
306
- 7. Xử lí dữ liệu:
307
- - `--cleanup` (mặc định: `False`): Dọn dẹp tệp huấn luyện cũ để tiến hành huấn luyện lại từ đầu.
308
-
309
- 8. Tối ưu:
310
- - `--checkpointing` (mặc định: `False`): Bật/tắt checkpointing để tiết kiệm RAM.
311
- - `--deterministic` (mặc định: `False`): Khi bật sẽ sử dụng các thuật toán có tính xác định cao, đảm bảo rằng mỗi lần chạy cùng một dữ liệu đầu vào sẽ cho kết quả giống nhau.
312
- - `--benchmark` (mặc định: `False`): Khi bật sẽ thử nghiệm và chọn thuật toán tối ưu nhất cho phần cứng và kích thước cụ thể.
313
- - `--optimizer` (mặc định: `AdamW`): Trình tối ưu hóa được sử dụng (`AdamW`, `RAdam`, `AnyPrecisionAdamW`).
314
- - `--multiscale_mel_loss` (mặc định: `False`): So sánh phổ Mel của âm thanh thật và âm thanh giả ở nhiều thang độ khác nhau. Giúp mô hình học được chi tiết âm sắc, độ sáng và cấu trúc tần số tốt hơn, từ đó cải thiện chất lượng và độ tự nhiên của giọng nói đầu ra.
315
-
316
- 9. Bộ tham chiếu:
317
- - `--use_custom_reference` (mặc định: `False`): Có tùy chỉnh bộ tham chiếu hay không.
318
- - `--reference_path` (mặc định: `False`): Đường dẫn đến bộ tham chiếu.
319
- """)
320
- quit()
321
- elif argv_is_allows[18] in argv:
322
- print("""Các tham số của --create_reference:
323
- 1. Đường dẫn tệp:
324
- - `--audio_path` (bắt buộc): Đường dẫn tệp âm thanh đầu vào.
325
- - `--reference_name` (mặc định: `reference`): Đường dẫn lưu bộ tham chiếu đầu ra.
326
-
327
- 2. Cấu hình bộ tham chiếu:
328
- - `--pitch_guidance` (mặc định: `True`): Sử dụng hướng dẫn cao độ.
329
- - `--energy_use` (mặc định: `False`): Sử dụng năng lượng rms.
330
- - `--version` (mặc định: `v2`): Phiên bản RVC (`v1`, `v2`).
331
-
332
- 3. Cấu hình nhúng:
333
- - `--embedder_model` (mặc định: `hubert_base`): Tên mô hình nhúng.
334
- - `--embedders_mode` (mặc định: `fairseq`): Chế độ nhúng (`fairseq`, `transformers`, `onnx`, `whisper`).
335
-
336
- 4. Cấu hình F0:
337
- - `--f0_method` (mặc định: `rmvpe`): Phương pháp dự đoán F0 (`pm`, `dio`, `mangio-crepe-tiny`, `mangio-crepe-small`, `mangio-crepe-medium`, `mangio-crepe-large`, `mangio-crepe-full`, `crepe-tiny`, `crepe-small`, `crepe-medium`, `crepe-large`, `crepe-full`, `fcpe`, `fcpe-legacy`, `rmvpe`, `rmvpe-legacy`, `harvest`, `yin`, `pyin`, `swipe`).
338
- - `--f0_onnx` (mặc định: `False`): Có sử dụng phiên bản ONNX của F0 hay không.
339
- - `--f0_up_key` (mặc định: `0`): Điều chỉnh cao độ.
340
- - `--filter_radius` (mặc định: `3`): Độ mượt của đường F0.
341
- - `--f0_autotune` (mặc định: `False`): Có tự động điều chỉnh F0 hay không.
342
- - `--f0_autotune_strength` (mặc định: `1`): Cường độ hiệu chỉnh tự động F0.
343
- - `--f0_file` (mặc định: ``): Đường dẫn tệp F0 có sẵn.
344
- - `--proposal_pitch` (mặc định: `False`): Đề xuất cao độ thay vì điều chỉnh thủ công.
345
- - `--proposal_pitch_threshold` (mặc định: `0.0`): Ngưỡng tần số ước tính cao độ.
346
- - `--alpha` (mặc định: `0.5`): Ngưỡng trộn cao độ khi ước tính cao độ hybrid.
347
- """)
348
- quit()
349
- elif argv_is_allows[16] in argv:
350
- print("""Sử dụng:
351
- 1. `--help_audio_effects`: Trợ giúp về phần thêm hiệu ứng âm thanh.
352
- 2. `--help_convert`: Trợ giúp về chuyển đổi âm thanh.
353
- 3. `--help_create_dataset`: Trợ giúp về tạo dữ liệu huấn luyện.
354
- 4. `--help_create_index`: Trợ giúp về tạo chỉ mục.
355
- 5. `--help_extract`: Trợ giúp về trích xuất dữ liệu huấn luyện.
356
- 6. `--help_preprocess`: Trợ giúp về xử lí trước dữ liệu.
357
- 7. `--help_separate_music`: Trợ giúp về tách nhạc.
358
- 8. `--help_train`: Trợ giúp về huấn luyện mô hình.
359
- 9. `--help_create_reference`: Trợ giúp về tạo bộ tham chiếu.
360
- """)
361
- quit()
362
-
363
- if __name__ == "__main__":
364
- import torch.multiprocessing as mp
365
-
366
- if "--train" in argv: mp.set_start_method("spawn")
367
- if "--preprocess" in argv or "--extract" in argv: mp.set_start_method("spawn", force=True)
368
-
369
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/run_tensorboard.py DELETED
@@ -1,32 +0,0 @@
1
- import os
2
- import sys
3
- import time
4
- import logging
5
- import warnings
6
- import webbrowser
7
-
8
- from tensorboard import program
9
-
10
- sys.path.append(os.getcwd())
11
-
12
- from main.app.variables import config, translations, logger
13
-
14
- def launch_tensorboard():
15
- warnings.filterwarnings("ignore")
16
- for l in ["root", "tensorboard"]:
17
- logging.getLogger(l).setLevel(logging.ERROR)
18
-
19
- tb = program.TensorBoard()
20
- tb.configure(argv=[None, "--logdir", config.configs["logs_path"], f"--port={config.configs['tensorboard_port']}"])
21
- url = tb.launch()
22
-
23
- logger.info(f"{translations['tensorboard_url']}: {url}")
24
- if "--open" in sys.argv: webbrowser.open(url)
25
-
26
- return f"{translations['tensorboard_url']}: {url}"
27
-
28
- if __name__ == "__main__":
29
- launch_tensorboard()
30
-
31
- while 1:
32
- time.sleep(5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/downloads/downloads.py DELETED
@@ -1,112 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.variables import translations, configs, models, model_options
9
- from main.app.core.downloads import download_model, search_models, download_pretrained_model
10
- from main.app.core.ui import change_download_choices, change_download_pretrained_choices, shutil_move
11
- from main.app.core.process import fetch_pretrained_data, save_drop_model, update_sample_rate_dropdown
12
-
13
- def download_tab():
14
- with gr.TabItem(translations["downloads"], visible=configs.get("downloads_tab", True)):
15
- gr.Markdown(translations["download_markdown"])
16
- with gr.Row():
17
- gr.Markdown(translations["download_markdown_2"])
18
- with gr.Row():
19
- with gr.Accordion(translations["model_download"], open=True):
20
- with gr.Row():
21
- downloadmodel = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["download_from_csv"], translations["search_models"], translations["upload"]], interactive=True, value=translations["download_url"])
22
- with gr.Row():
23
- gr.Markdown("___")
24
- with gr.Column():
25
- with gr.Row():
26
- url_input = gr.Textbox(label=translations["model_url"], value="", placeholder="https://...", scale=6)
27
- download_model_name = gr.Textbox(label=translations["modelname"], value="", placeholder=translations["modelname"], scale=2)
28
- url_download = gr.Button(value=translations["downloads"], scale=2)
29
- with gr.Column():
30
- model_browser = gr.Dropdown(choices=models.keys(), label=translations["model_warehouse"], scale=8, allow_custom_value=True, visible=False)
31
- download_from_browser = gr.Button(value=translations["get_model"], scale=2, variant="primary", visible=False)
32
- with gr.Column():
33
- search_name = gr.Textbox(label=translations["name_to_search"], placeholder=translations["modelname"], interactive=True, scale=8, visible=False)
34
- search = gr.Button(translations["search_2"], scale=2, visible=False)
35
- search_dropdown = gr.Dropdown(label=translations["select_download_model"], value="", choices=[], allow_custom_value=True, interactive=False, visible=False)
36
- download = gr.Button(translations["downloads"], variant="primary", visible=False)
37
- with gr.Column():
38
- model_upload = gr.Files(label=translations["drop_model"], file_types=[".pth", ".onnx", ".index", ".zip"], visible=False)
39
- with gr.Row():
40
- with gr.Accordion(translations["download_pretrained_2"], open=False):
41
- with gr.Row():
42
- pretrain_download_choices = gr.Radio(label=translations["model_download_select"], choices=[translations["download_url"], translations["list_model"], translations["upload"]], value=translations["download_url"], interactive=True)
43
- with gr.Row():
44
- gr.Markdown("___")
45
- with gr.Column():
46
- with gr.Row():
47
- pretrainD = gr.Textbox(label=translations["pretrained_url"].format(dg="D"), value="", placeholder="https://...", interactive=True, scale=4)
48
- pretrainG = gr.Textbox(label=translations["pretrained_url"].format(dg="G"), value="", placeholder="https://...", interactive=True, scale=4)
49
- download_pretrain_button = gr.Button(translations["downloads"], scale=2)
50
- with gr.Column():
51
- with gr.Row():
52
- pretrain_choices = gr.Dropdown(label=translations["select_pretrain"], info=translations["select_pretrain_info"], choices=list(fetch_pretrained_data().keys()), value="Titan_Medium", allow_custom_value=True, interactive=True, scale=6, visible=False)
53
- sample_rate_pretrain = gr.Dropdown(label=translations["pretrain_sr"], info=translations["pretrain_sr"], choices=["48k", "40k", "32k"], value="48k", interactive=True, visible=False)
54
- download_pretrain_choices_button = gr.Button(translations["downloads"], scale=2, variant="primary", visible=False)
55
- with gr.Row():
56
- pretrain_upload = gr.Files(label=translations["drop_pretrain"].format(dg="G, D"), file_types=[".pth"], visible=False)
57
- with gr.Row():
58
- url_download.click(
59
- fn=download_model,
60
- inputs=[
61
- url_input,
62
- download_model_name
63
- ],
64
- outputs=[url_input],
65
- api_name="download_model"
66
- )
67
- download_from_browser.click(
68
- fn=lambda model: download_model(models[model], model),
69
- inputs=[model_browser],
70
- outputs=[model_browser],
71
- api_name="download_browser"
72
- )
73
- with gr.Row():
74
- downloadmodel.change(fn=change_download_choices, inputs=[downloadmodel], outputs=[url_input, download_model_name, url_download, model_browser, download_from_browser, search_name, search, search_dropdown, download, model_upload])
75
- search.click(fn=search_models, inputs=[search_name], outputs=[search_dropdown, download])
76
- model_upload.upload(fn=save_drop_model, inputs=[model_upload], outputs=[model_upload])
77
- download.click(
78
- fn=lambda model: download_model(model_options[model], model),
79
- inputs=[search_dropdown],
80
- outputs=[search_dropdown],
81
- api_name="search_models"
82
- )
83
- with gr.Row():
84
- pretrain_download_choices.change(fn=change_download_pretrained_choices, inputs=[pretrain_download_choices], outputs=[pretrainD, pretrainG, download_pretrain_button, pretrain_choices, sample_rate_pretrain, download_pretrain_choices_button, pretrain_upload])
85
- pretrain_choices.change(fn=update_sample_rate_dropdown, inputs=[pretrain_choices], outputs=[sample_rate_pretrain])
86
- with gr.Row():
87
- download_pretrain_button.click(
88
- fn=download_pretrained_model,
89
- inputs=[
90
- pretrain_download_choices,
91
- pretrainD,
92
- pretrainG
93
- ],
94
- outputs=[pretrainD, pretrainG],
95
- api_name="download_pretrain_link"
96
- )
97
- download_pretrain_choices_button.click(
98
- fn=download_pretrained_model,
99
- inputs=[
100
- pretrain_download_choices,
101
- pretrain_choices,
102
- sample_rate_pretrain
103
- ],
104
- outputs=[pretrain_choices],
105
- api_name="download_pretrain_choices"
106
- )
107
- pretrain_upload.upload(
108
- fn=lambda pretrain_upload: [shutil_move(pretrain.name, configs["pretrained_custom_path"]) for pretrain in pretrain_upload],
109
- inputs=[pretrain_upload],
110
- outputs=[],
111
- api_name="upload_pretrain"
112
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/editing/child/audio_effects.py DELETED
@@ -1,393 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.editing import audio_effects
9
- from main.app.core.presets import audio_effect_load_presets, audio_effect_save_presets
10
- from main.app.core.ui import visible, change_audios_choices, change_effect_preset_choices, shutil_move
11
- from main.app.variables import translations, paths_for_files, sample_rate_choice, audio_effect_presets_file, configs, file_types, export_format_choices
12
-
13
- def audio_effects_tab():
14
- with gr.Row():
15
- gr.Markdown(translations["audio_effects_edit"])
16
- with gr.Row():
17
- with gr.Column():
18
- with gr.Row():
19
- reverb_check_box = gr.Checkbox(label=translations["reverb"], value=False, interactive=True)
20
- chorus_check_box = gr.Checkbox(label=translations["chorus"], value=False, interactive=True)
21
- delay_check_box = gr.Checkbox(label=translations["delay"], value=False, interactive=True)
22
- phaser_check_box = gr.Checkbox(label=translations["phaser"], value=False, interactive=True)
23
- compressor_check_box = gr.Checkbox(label=translations["compressor"], value=False, interactive=True)
24
- more_options = gr.Checkbox(label=translations["more_option"], value=False, interactive=True)
25
- with gr.Row():
26
- with gr.Accordion(translations["input_output"], open=False):
27
- with gr.Row():
28
- upload_audio = gr.Files(label=translations["drop_audio"], file_types=file_types)
29
- with gr.Row():
30
- audio_in_path = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True)
31
- audio_out_path = gr.Textbox(label=translations["output_audio"], value="audios/audio_effects.wav", placeholder="audios/audio_effects.wav", info=translations["provide_output"], interactive=True)
32
- with gr.Row():
33
- with gr.Column():
34
- audio_combination = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True)
35
- audio_combination_input = gr.Dropdown(label=translations["input_audio"], value="", choices=paths_for_files, info=translations["provide_audio"], interactive=True, allow_custom_value=True, visible=audio_combination.value)
36
- with gr.Row():
37
- main_vol = gr.Slider(minimum=-80, maximum=80, label=translations["main_volume"], info=translations["main_volume_info"], value=-4, step=1, interactive=True, visible=audio_combination.value)
38
- combine_vol = gr.Slider(minimum=-80, maximum=80, label=translations["combination_volume"], info=translations["combination_volume_info"], value=-7, step=1, interactive=True, visible=audio_combination.value)
39
- with gr.Row():
40
- audio_effects_refresh = gr.Button(translations["refresh"])
41
- with gr.Row():
42
- audio_output_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
43
- with gr.Row():
44
- with gr.Accordion(translations["use_presets"], open=False):
45
- with gr.Row():
46
- presets_name = gr.Dropdown(label=translations["file_preset"], choices=audio_effect_presets_file, value=audio_effect_presets_file[0] if len(audio_effect_presets_file) > 0 else '', interactive=True, allow_custom_value=True)
47
- with gr.Row():
48
- load_click = gr.Button(translations["load_file"], variant="primary")
49
- refresh_click = gr.Button(translations["refresh"])
50
- with gr.Accordion(translations["export_file"], open=False):
51
- with gr.Row():
52
- with gr.Column():
53
- name_to_save_file = gr.Textbox(label=translations["filename_to_save"])
54
- save_file_button = gr.Button(translations["export_file"])
55
- with gr.Row():
56
- upload_presets = gr.Files(label=translations["upload_presets"], file_types=[".effect.json"])
57
- with gr.Row():
58
- apply_effects_button = gr.Button(translations["apply"], variant="primary", scale=2)
59
- with gr.Row():
60
- with gr.Column():
61
- with gr.Row():
62
- with gr.Accordion(translations["reverb"], open=False, visible=reverb_check_box.value) as reverb_accordion:
63
- reverb_freeze_mode = gr.Checkbox(label=translations["reverb_freeze"], info=translations["reverb_freeze_info"], value=False, interactive=True)
64
- reverb_room_size = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.15, label=translations["room_size"], info=translations["room_size_info"], interactive=True)
65
- reverb_damping = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.7, label=translations["damping"], info=translations["damping_info"], interactive=True)
66
- reverb_wet_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.2, label=translations["wet_level"], info=translations["wet_level_info"], interactive=True)
67
- reverb_dry_level = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.8, label=translations["dry_level"], info=translations["dry_level_info"], interactive=True)
68
- reverb_width = gr.Slider(minimum=0, maximum=1, step=0.01, value=1, label=translations["width"], info=translations["width_info"], interactive=True)
69
- with gr.Row():
70
- with gr.Accordion(translations["chorus"], open=False, visible=chorus_check_box.value) as chorus_accordion:
71
- chorus_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_depth"], info=translations["chorus_depth_info"], interactive=True)
72
- chorus_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1.5, label=translations["chorus_rate_hz"], info=translations["chorus_rate_hz_info"], interactive=True)
73
- chorus_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["chorus_mix"], info=translations["chorus_mix_info"], interactive=True)
74
- chorus_centre_delay_ms = gr.Slider(minimum=0, maximum=50, step=1, value=10, label=translations["chorus_centre_delay_ms"], info=translations["chorus_centre_delay_ms_info"], interactive=True)
75
- chorus_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["chorus_feedback"], info=translations["chorus_feedback_info"], interactive=True)
76
- with gr.Row():
77
- with gr.Accordion(translations["delay"], open=False, visible=delay_check_box.value) as delay_accordion:
78
- delay_second = gr.Slider(minimum=0, maximum=5, step=0.01, value=0.5, label=translations["delay_seconds"], info=translations["delay_seconds_info"], interactive=True)
79
- delay_feedback = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_feedback"], info=translations["delay_feedback_info"], interactive=True)
80
- delay_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["delay_mix"], info=translations["delay_mix_info"], interactive=True)
81
- with gr.Column():
82
- with gr.Row():
83
- with gr.Accordion(translations["more_option"], open=False, visible=more_options.value) as more_accordion:
84
- with gr.Row():
85
- fade = gr.Checkbox(label=translations["fade"], value=False, interactive=True)
86
- bass_or_treble = gr.Checkbox(label=translations["bass_or_treble"], value=False, interactive=True)
87
- limiter = gr.Checkbox(label=translations["limiter"], value=False, interactive=True)
88
- resample_checkbox = gr.Checkbox(label=translations["resample"], value=False, interactive=True)
89
- with gr.Row():
90
- distortion_checkbox = gr.Checkbox(label=translations["distortion"], value=False, interactive=True)
91
- gain_checkbox = gr.Checkbox(label=translations["gain"], value=False, interactive=True)
92
- bitcrush_checkbox = gr.Checkbox(label=translations["bitcrush"], value=False, interactive=True)
93
- clipping_checkbox = gr.Checkbox(label=translations["clipping"], value=False, interactive=True)
94
- with gr.Accordion(translations["fade"], open=True, visible=fade.value) as fade_accordion:
95
- with gr.Row():
96
- fade_in = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_in"], info=translations["fade_in_info"], interactive=True)
97
- fade_out = gr.Slider(minimum=0, maximum=10000, step=100, value=0, label=translations["fade_out"], info=translations["fade_out_info"], interactive=True)
98
- with gr.Accordion(translations["bass_or_treble"], open=True, visible=bass_or_treble.value) as bass_treble_accordion:
99
- with gr.Row():
100
- bass_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["bass_boost"], info=translations["bass_boost_info"], interactive=True)
101
- bass_frequency = gr.Slider(minimum=20, maximum=200, step=10, value=100, label=translations["bass_frequency"], info=translations["bass_frequency_info"], interactive=True)
102
- with gr.Row():
103
- treble_boost = gr.Slider(minimum=0, maximum=20, step=1, value=0, label=translations["treble_boost"], info=translations["treble_boost_info"], interactive=True)
104
- treble_frequency = gr.Slider(minimum=1000, maximum=10000, step=500, value=3000, label=translations["treble_frequency"], info=translations["treble_frequency_info"], interactive=True)
105
- with gr.Accordion(translations["limiter"], open=True, visible=limiter.value) as limiter_accordion:
106
- with gr.Row():
107
- limiter_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["limiter_threshold_db"], info=translations["limiter_threshold_db_info"], interactive=True)
108
- limiter_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["limiter_release_ms"], info=translations["limiter_release_ms_info"], interactive=True)
109
- with gr.Column():
110
- pitch_shift_semitones = gr.Slider(minimum=-20, maximum=20, step=1, value=0, label=translations["pitch"], info=translations["pitch_info"], interactive=True)
111
- audio_effect_resample_sr = gr.Radio(choices=[0]+sample_rate_choice, value=0, label=translations["resample"], info=translations["resample_info"], interactive=True, visible=resample_checkbox.value)
112
- distortion_drive_db = gr.Slider(minimum=0, maximum=50, step=1, value=20, label=translations["distortion"], info=translations["distortion_info"], interactive=True, visible=distortion_checkbox.value)
113
- gain_db = gr.Slider(minimum=-60, maximum=60, step=1, value=0, label=translations["gain"], info=translations["gain_info"], interactive=True, visible=gain_checkbox.value)
114
- clipping_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-1, label=translations["clipping_threshold_db"], info=translations["clipping_threshold_db_info"], interactive=True, visible=clipping_checkbox.value)
115
- bitcrush_bit_depth = gr.Slider(minimum=1, maximum=24, step=1, value=16, label=translations["bitcrush_bit_depth"], info=translations["bitcrush_bit_depth_info"], interactive=True, visible=bitcrush_checkbox.value)
116
- with gr.Row():
117
- with gr.Accordion(translations["phaser"], open=False, visible=phaser_check_box.value) as phaser_accordion:
118
- phaser_depth = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_depth"], info=translations["phaser_depth_info"], interactive=True)
119
- phaser_rate_hz = gr.Slider(minimum=0.1, maximum=10, step=0.1, value=1, label=translations["phaser_rate_hz"], info=translations["phaser_rate_hz_info"], interactive=True)
120
- phaser_mix = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.5, label=translations["phaser_mix"], info=translations["phaser_mix_info"], interactive=True)
121
- phaser_centre_frequency_hz = gr.Slider(minimum=50, maximum=5000, step=10, value=1000, label=translations["phaser_centre_frequency_hz"], info=translations["phaser_centre_frequency_hz_info"], interactive=True)
122
- phaser_feedback = gr.Slider(minimum=-1, maximum=1, step=0.01, value=0, label=translations["phaser_feedback"], info=translations["phaser_feedback_info"], interactive=True)
123
- with gr.Row():
124
- with gr.Accordion(translations["compressor"], open=False, visible=compressor_check_box.value) as compressor_accordion:
125
- compressor_threshold_db = gr.Slider(minimum=-60, maximum=0, step=1, value=-20, label=translations["compressor_threshold_db"], info=translations["compressor_threshold_db_info"], interactive=True)
126
- compressor_ratio = gr.Slider(minimum=1, maximum=20, step=0.1, value=1, label=translations["compressor_ratio"], info=translations["compressor_ratio_info"], interactive=True)
127
- compressor_attack_ms = gr.Slider(minimum=0.1, maximum=100, step=0.1, value=10, label=translations["compressor_attack_ms"], info=translations["compressor_attack_ms_info"], interactive=True)
128
- compressor_release_ms = gr.Slider(minimum=10, maximum=1000, step=1, value=100, label=translations["compressor_release_ms"], info=translations["compressor_release_ms_info"], interactive=True)
129
- with gr.Row():
130
- gr.Markdown(translations["output_audio"])
131
- with gr.Row():
132
- audio_play_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
133
- audio_play_output = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"])
134
- with gr.Row():
135
- reverb_check_box.change(fn=visible, inputs=[reverb_check_box], outputs=[reverb_accordion])
136
- chorus_check_box.change(fn=visible, inputs=[chorus_check_box], outputs=[chorus_accordion])
137
- delay_check_box.change(fn=visible, inputs=[delay_check_box], outputs=[delay_accordion])
138
- with gr.Row():
139
- compressor_check_box.change(fn=visible, inputs=[compressor_check_box], outputs=[compressor_accordion])
140
- phaser_check_box.change(fn=visible, inputs=[phaser_check_box], outputs=[phaser_accordion])
141
- more_options.change(fn=visible, inputs=[more_options], outputs=[more_accordion])
142
- with gr.Row():
143
- fade.change(fn=visible, inputs=[fade], outputs=[fade_accordion])
144
- bass_or_treble.change(fn=visible, inputs=[bass_or_treble], outputs=[bass_treble_accordion])
145
- limiter.change(fn=visible, inputs=[limiter], outputs=[limiter_accordion])
146
- resample_checkbox.change(fn=visible, inputs=[resample_checkbox], outputs=[audio_effect_resample_sr])
147
- with gr.Row():
148
- distortion_checkbox.change(fn=visible, inputs=[distortion_checkbox], outputs=[distortion_drive_db])
149
- gain_checkbox.change(fn=visible, inputs=[gain_checkbox], outputs=[gain_db])
150
- clipping_checkbox.change(fn=visible, inputs=[clipping_checkbox], outputs=[clipping_threshold_db])
151
- bitcrush_checkbox.change(fn=visible, inputs=[bitcrush_checkbox], outputs=[bitcrush_bit_depth])
152
- with gr.Row():
153
- upload_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio], outputs=[audio_in_path])
154
- audio_in_path.change(fn=lambda audio: audio if audio else None, inputs=[audio_in_path], outputs=[audio_play_input])
155
- audio_effects_refresh.click(fn=lambda a, b: [change_audios_choices(a), change_audios_choices(b)], inputs=[audio_in_path, audio_combination_input], outputs=[audio_in_path, audio_combination_input])
156
- with gr.Row():
157
- more_options.change(fn=lambda: [False]*8, inputs=[], outputs=[fade, bass_or_treble, limiter, resample_checkbox, distortion_checkbox, gain_checkbox, clipping_checkbox, bitcrush_checkbox])
158
- audio_combination.change(fn=visible, inputs=[audio_combination], outputs=[audio_combination_input])
159
- audio_combination.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[audio_combination], outputs=[main_vol, combine_vol])
160
- with gr.Row():
161
- upload_presets.upload(fn=lambda presets_in: [shutil_move(preset.name, configs["presets_path"]) for preset in presets_in][0], inputs=[upload_presets], outputs=[presets_name])
162
- refresh_click.click(fn=change_effect_preset_choices, inputs=[], outputs=[presets_name])
163
- with gr.Row():
164
- load_click.click(
165
- fn=audio_effect_load_presets,
166
- inputs=[
167
- presets_name,
168
- resample_checkbox,
169
- audio_effect_resample_sr,
170
- chorus_depth,
171
- chorus_rate_hz,
172
- chorus_mix,
173
- chorus_centre_delay_ms,
174
- chorus_feedback,
175
- distortion_drive_db,
176
- reverb_room_size,
177
- reverb_damping,
178
- reverb_wet_level,
179
- reverb_dry_level,
180
- reverb_width,
181
- reverb_freeze_mode,
182
- pitch_shift_semitones,
183
- delay_second,
184
- delay_feedback,
185
- delay_mix,
186
- compressor_threshold_db,
187
- compressor_ratio,
188
- compressor_attack_ms,
189
- compressor_release_ms,
190
- limiter_threshold_db,
191
- limiter_release_ms,
192
- gain_db,
193
- bitcrush_bit_depth,
194
- clipping_threshold_db,
195
- phaser_rate_hz,
196
- phaser_depth,
197
- phaser_centre_frequency_hz,
198
- phaser_feedback,
199
- phaser_mix,
200
- bass_boost,
201
- bass_frequency,
202
- treble_boost,
203
- treble_frequency,
204
- fade_in,
205
- fade_out,
206
- chorus_check_box,
207
- distortion_checkbox,
208
- reverb_check_box,
209
- delay_check_box,
210
- compressor_check_box,
211
- limiter,
212
- gain_checkbox,
213
- bitcrush_checkbox,
214
- clipping_checkbox,
215
- phaser_check_box,
216
- bass_or_treble,
217
- fade
218
- ],
219
- outputs=[
220
- resample_checkbox,
221
- audio_effect_resample_sr,
222
- chorus_depth,
223
- chorus_rate_hz,
224
- chorus_mix,
225
- chorus_centre_delay_ms,
226
- chorus_feedback,
227
- distortion_drive_db,
228
- reverb_room_size,
229
- reverb_damping,
230
- reverb_wet_level,
231
- reverb_dry_level,
232
- reverb_width,
233
- reverb_freeze_mode,
234
- pitch_shift_semitones,
235
- delay_second,
236
- delay_feedback,
237
- delay_mix,
238
- compressor_threshold_db,
239
- compressor_ratio,
240
- compressor_attack_ms,
241
- compressor_release_ms,
242
- limiter_threshold_db,
243
- limiter_release_ms,
244
- gain_db,
245
- bitcrush_bit_depth,
246
- clipping_threshold_db,
247
- phaser_rate_hz,
248
- phaser_depth,
249
- phaser_centre_frequency_hz,
250
- phaser_feedback,
251
- phaser_mix,
252
- bass_boost,
253
- bass_frequency,
254
- treble_boost,
255
- treble_frequency,
256
- fade_in,
257
- fade_out,
258
- chorus_check_box,
259
- distortion_checkbox,
260
- reverb_check_box,
261
- delay_check_box,
262
- compressor_check_box,
263
- limiter,
264
- gain_checkbox,
265
- bitcrush_checkbox,
266
- clipping_checkbox,
267
- phaser_check_box,
268
- bass_or_treble,
269
- fade
270
- ],
271
- )
272
- save_file_button.click(
273
- fn=audio_effect_save_presets,
274
- inputs=[
275
- name_to_save_file,
276
- resample_checkbox,
277
- audio_effect_resample_sr,
278
- chorus_depth,
279
- chorus_rate_hz,
280
- chorus_mix,
281
- chorus_centre_delay_ms,
282
- chorus_feedback,
283
- distortion_drive_db,
284
- reverb_room_size,
285
- reverb_damping,
286
- reverb_wet_level,
287
- reverb_dry_level,
288
- reverb_width,
289
- reverb_freeze_mode,
290
- pitch_shift_semitones,
291
- delay_second,
292
- delay_feedback,
293
- delay_mix,
294
- compressor_threshold_db,
295
- compressor_ratio,
296
- compressor_attack_ms,
297
- compressor_release_ms,
298
- limiter_threshold_db,
299
- limiter_release_ms,
300
- gain_db,
301
- bitcrush_bit_depth,
302
- clipping_threshold_db,
303
- phaser_rate_hz,
304
- phaser_depth,
305
- phaser_centre_frequency_hz,
306
- phaser_feedback,
307
- phaser_mix,
308
- bass_boost,
309
- bass_frequency,
310
- treble_boost,
311
- treble_frequency,
312
- fade_in,
313
- fade_out,
314
- chorus_check_box,
315
- distortion_checkbox,
316
- reverb_check_box,
317
- delay_check_box,
318
- compressor_check_box,
319
- limiter,
320
- gain_checkbox,
321
- bitcrush_checkbox,
322
- clipping_checkbox,
323
- phaser_check_box,
324
- bass_or_treble,
325
- fade
326
- ],
327
- outputs=[presets_name]
328
- )
329
- with gr.Row():
330
- apply_effects_button.click(
331
- fn=audio_effects,
332
- inputs=[
333
- audio_in_path,
334
- audio_out_path,
335
- resample_checkbox,
336
- audio_effect_resample_sr,
337
- chorus_depth,
338
- chorus_rate_hz,
339
- chorus_mix,
340
- chorus_centre_delay_ms,
341
- chorus_feedback,
342
- distortion_drive_db,
343
- reverb_room_size,
344
- reverb_damping,
345
- reverb_wet_level,
346
- reverb_dry_level,
347
- reverb_width,
348
- reverb_freeze_mode,
349
- pitch_shift_semitones,
350
- delay_second,
351
- delay_feedback,
352
- delay_mix,
353
- compressor_threshold_db,
354
- compressor_ratio,
355
- compressor_attack_ms,
356
- compressor_release_ms,
357
- limiter_threshold_db,
358
- limiter_release_ms,
359
- gain_db,
360
- bitcrush_bit_depth,
361
- clipping_threshold_db,
362
- phaser_rate_hz,
363
- phaser_depth,
364
- phaser_centre_frequency_hz,
365
- phaser_feedback,
366
- phaser_mix,
367
- bass_boost,
368
- bass_frequency,
369
- treble_boost,
370
- treble_frequency,
371
- fade_in,
372
- fade_out,
373
- audio_output_format,
374
- chorus_check_box,
375
- distortion_checkbox,
376
- reverb_check_box,
377
- delay_check_box,
378
- compressor_check_box,
379
- limiter,
380
- gain_checkbox,
381
- bitcrush_checkbox,
382
- clipping_checkbox,
383
- phaser_check_box,
384
- bass_or_treble,
385
- fade,
386
- audio_combination,
387
- audio_combination_input,
388
- main_vol,
389
- combine_vol
390
- ],
391
- outputs=[audio_play_output],
392
- api_name="audio_effects"
393
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/editing/child/quirk.py DELETED
@@ -1,48 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.editing import apply_voice_quirk
9
- from main.app.core.ui import change_audios_choices, shutil_move
10
- from main.app.variables import translations, paths_for_files, configs, file_types, export_format_choices
11
-
12
- def quirk_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["quirk_markdown"])
15
- with gr.Row():
16
- input_audio_play = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
17
- with gr.Row():
18
- quirk_choice = gr.Radio(label=translations["quirk_label"], info=translations["quirk_label_info"], choices=list(translations["quirk_choice"].keys()), interactive=True, value=list(translations["quirk_choice"].keys())[0])
19
- with gr.Row():
20
- apply_quirk_button = gr.Button(translations["apply"], variant="primary")
21
- with gr.Row():
22
- with gr.Accordion(translations["input_output"], open=False):
23
- with gr.Row():
24
- quirk_upload_audio = gr.Files(label=translations["drop_audio"], file_types=file_types)
25
- with gr.Column():
26
- quirk_export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
27
- quirk_input_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
28
- quirk_output_path = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
29
- with gr.Column():
30
- quirk_refresh = gr.Button(translations["refresh"])
31
- with gr.Row():
32
- output_audio_play = gr.Audio(show_download_button=True, interactive=False, label=translations["output_audio"])
33
- with gr.Row():
34
- quirk_upload_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[quirk_upload_audio], outputs=[quirk_input_path])
35
- quirk_input_path.change(fn=lambda audio: audio if audio else None, inputs=[quirk_input_path], outputs=[input_audio_play])
36
- quirk_refresh.click(fn=change_audios_choices, inputs=[quirk_input_path], outputs=[quirk_input_path])
37
- with gr.Row():
38
- apply_quirk_button.click(
39
- fn=apply_voice_quirk,
40
- inputs=[
41
- quirk_input_path,
42
- quirk_choice,
43
- quirk_output_path,
44
- quirk_export_format
45
- ],
46
- outputs=[output_audio_play],
47
- api_name="quirk"
48
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/editing/editing.py DELETED
@@ -1,20 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.variables import configs, translations
9
- from main.app.tabs.editing.child.quirk import quirk_tab
10
- from main.app.tabs.editing.child.audio_effects import audio_effects_tab
11
-
12
- def editing_tab():
13
- with gr.TabItem(translations["editing"], visible=configs.get("editing_tab", True)):
14
- with gr.TabItem(translations["audio_effects"], visible=configs.get("effects_tab", True)):
15
- gr.Markdown(translations["apply_audio_effects"])
16
- audio_effects_tab()
17
-
18
- with gr.TabItem(translations["quirk"], visible=configs.get("quirk", True)):
19
- gr.Markdown(translations["quirk_info"])
20
- quirk_tab()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/extra/child/convert_model.py DELETED
@@ -1,31 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.ui import visible, shutil_move
9
- from main.app.core.model_utils import onnx_export
10
- from main.app.variables import translations, configs
11
-
12
- def convert_model_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["pytorch2onnx_markdown"])
15
- with gr.Row():
16
- model_pth_upload = gr.File(label=translations["drop_model"], file_types=[".pth"])
17
- with gr.Row():
18
- convert_onnx = gr.Button(translations["convert_model"], variant="primary", scale=2)
19
- with gr.Row():
20
- model_pth_path = gr.Textbox(label=translations["model_path"], value="", placeholder="assets/weights/Model.pth", info=translations["model_path_info"], interactive=True)
21
- with gr.Row():
22
- output_model2 = gr.File(label=translations["output_model_path"], file_types=[".pth", ".onnx"], interactive=False, visible=False)
23
- with gr.Row():
24
- model_pth_upload.upload(fn=lambda model_pth_upload: shutil_move(model_pth_upload.name, configs["weights_path"]), inputs=[model_pth_upload], outputs=[model_pth_path])
25
- convert_onnx.click(
26
- fn=onnx_export,
27
- inputs=[model_pth_path],
28
- outputs=[output_model2],
29
- api_name="model_onnx_export"
30
- )
31
- convert_onnx.click(fn=lambda: visible(True), inputs=[], outputs=[output_model2])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/extra/child/create_srt.py DELETED
@@ -1,56 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.csrt import create_srt
9
- from main.app.core.ui import shutil_move, change_audios_choices
10
- from main.app.variables import translations, file_types, configs, paths_for_files
11
-
12
- def create_srt_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["create_srt_markdown_2"])
15
- with gr.Row():
16
- with gr.Column():
17
- srt_content = gr.Textbox(label=translations["srt_content"], value="", lines=9, max_lines=9, interactive=False)
18
- with gr.Column():
19
- word_timestamps = gr.Checkbox(label=translations["word_timestamps"], info=translations["word_timestamps_info"], value=False, interactive=True)
20
- model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"], value="medium", interactive=True)
21
- with gr.Row():
22
- convert_button = gr.Button(translations["convert_audio"], variant="primary")
23
- with gr.Row():
24
- with gr.Accordion(translations["input_output"], open=False):
25
- with gr.Column():
26
- input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
27
- output_file = gr.Textbox(label=translations["srt_output_file"], value="srt/output.srt", placeholder="srt/output.srt", interactive=True)
28
- with gr.Column():
29
- refresh = gr.Button(translations["refresh"])
30
- with gr.Row():
31
- input_file = gr.Files(label=translations["drop_audio"], file_types=file_types)
32
- with gr.Row():
33
- play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
34
- with gr.Row():
35
- output_srt = gr.File(label=translations["srt_output_file"], file_types=[".srt"], interactive=False, visible=False)
36
- with gr.Row():
37
- input_file.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input_file], outputs=[input_audio])
38
- input_audio.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio], outputs=[play_audio])
39
- refresh.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio])
40
- with gr.Row():
41
- convert_button.click(
42
- fn=create_srt,
43
- inputs=[
44
- model_size,
45
- input_audio,
46
- output_file,
47
- word_timestamps
48
- ],
49
- outputs=[
50
- output_srt,
51
- srt_content
52
- ],
53
- api_name="create_srt"
54
- )
55
-
56
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/extra/child/f0_extract.py DELETED
@@ -1,51 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.f0_extract import f0_extract
9
- from main.app.core.ui import change_audios_choices, unlock_f0, shutil_move
10
- from main.app.variables import translations, paths_for_files, method_f0, configs, file_types
11
-
12
- def f0_extract_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["f0_extractor_markdown_2"])
15
- with gr.Row():
16
- extractor_button = gr.Button(translations["extract_button"].replace("2. ", ""), variant="primary")
17
- with gr.Row():
18
- with gr.Column():
19
- upload_audio_file = gr.Files(label=translations["drop_audio"], file_types=file_types)
20
- audioplay = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
21
- with gr.Column():
22
- with gr.Accordion(translations["f0_method"], open=False):
23
- with gr.Group():
24
- with gr.Row():
25
- onnx_f0_mode3 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
26
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
27
- f0_method_extract = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True)
28
- with gr.Accordion(translations["audio_path"], open=True):
29
- input_audio_path = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True)
30
- refresh_audio_button = gr.Button(translations["refresh"])
31
- with gr.Row():
32
- gr.Markdown("___")
33
- with gr.Row():
34
- file_output = gr.File(label="", file_types=[".txt"], interactive=False)
35
- image_output = gr.Image(label="", interactive=False, show_download_button=True)
36
- with gr.Row():
37
- upload_audio_file.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio_file], outputs=[input_audio_path])
38
- input_audio_path.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio_path], outputs=[audioplay])
39
- refresh_audio_button.click(fn=change_audios_choices, inputs=[input_audio_path], outputs=[input_audio_path])
40
- with gr.Row():
41
- unlock_full_method.change(fn=lambda method: {"choices": [m for m in unlock_f0(method)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"}, inputs=[unlock_full_method], outputs=[f0_method_extract])
42
- extractor_button.click(
43
- fn=f0_extract,
44
- inputs=[
45
- input_audio_path,
46
- f0_method_extract,
47
- onnx_f0_mode3
48
- ],
49
- outputs=[file_output, image_output],
50
- api_name="f0_extract"
51
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/extra/child/fushion.py DELETED
@@ -1,45 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.ui import visible, shutil_move
9
- from main.app.core.model_utils import fushion_model
10
- from main.app.variables import translations, configs
11
-
12
def fushion_tab():
    """Lay out the model-fusion panel and wire its events.

    Creates the save-name textbox, the fusion button, two model upload
    slots with matching path textboxes, the blend-ratio slider and a
    hidden output file slot. Intended to be called while a Gradio layout
    context is open; nothing is returned.
    """
    def store_in_weights(uploaded):
        # Hand the uploaded file to shutil_move together with the configured
        # weights folder; the result is shown in the matching path textbox.
        return shutil_move(uploaded.name, configs["weights_path"])

    with gr.Row():
        gr.Markdown(translations["fushion_markdown_2"])

    with gr.Row():
        name_to_save = gr.Textbox(
            label=translations["modelname"],
            placeholder="Model.pth",
            value="",
            max_lines=1,
            interactive=True,
        )

    with gr.Row():
        fushion_button = gr.Button(translations["fushion"], variant="primary", scale=4)

    with gr.Column():
        with gr.Row():
            model_a = gr.File(label=f"{translations['model_name']} 1", file_types=[".pth", ".onnx"])
            model_b = gr.File(label=f"{translations['model_name']} 2", file_types=[".pth", ".onnx"])
        with gr.Row():
            model_path_a = gr.Textbox(
                label=f"{translations['model_path']} 1",
                value="",
                placeholder="assets/weights/Model_1.pth",
            )
            model_path_b = gr.Textbox(
                label=f"{translations['model_path']} 2",
                value="",
                placeholder="assets/weights/Model_2.pth",
            )

    with gr.Row():
        ratio = gr.Slider(
            minimum=0,
            maximum=1,
            label=translations["model_ratio"],
            info=translations["model_ratio_info"],
            value=0.5,
            interactive=True,
        )

    with gr.Row():
        output_model = gr.File(
            label=translations["output_model_path"],
            file_types=[".pth", ".onnx"],
            interactive=False,
            visible=False,
        )

    # Each upload slot feeds its own path textbox.
    with gr.Row():
        model_a.upload(fn=store_in_weights, inputs=[model_a], outputs=[model_path_a])
        model_b.upload(fn=store_in_weights, inputs=[model_b], outputs=[model_path_b])

    with gr.Row():
        # First listener performs the fusion; the second one only reveals
        # the (initially hidden) output file slot.
        fushion_button.click(
            fn=fushion_model,
            inputs=[name_to_save, model_path_a, model_path_b, ratio],
            outputs=[name_to_save, output_model],
            api_name="fushion_model",
        )
        fushion_button.click(fn=lambda: visible(True), inputs=[], outputs=[output_model])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/extra/child/read_model.py DELETED
@@ -1,29 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.ui import shutil_move
9
- from main.app.core.model_utils import model_info
10
- from main.app.variables import translations, configs
11
-
12
def read_model_tab():
    """Lay out the "read model" panel and wire its events.

    Creates a model upload slot, a read button, an editable model-path
    textbox and a read-only info textbox. Intended to be called while a
    Gradio layout context is open; nothing is returned.
    """
    def store_in_weights(uploaded):
        # Pass the uploaded file and the configured weights folder to
        # shutil_move; the result is written to the model-path textbox.
        return shutil_move(uploaded.name, configs["weights_path"])

    with gr.Row():
        gr.Markdown(translations["read_model_markdown_2"])

    with gr.Row():
        model = gr.File(label=translations["drop_model"], file_types=[".pth", ".onnx"])

    with gr.Row():
        read_button = gr.Button(translations["readmodel"], variant="primary", scale=2)

    with gr.Column():
        model_path = gr.Textbox(
            label=translations["model_path"],
            value="",
            placeholder="assets/weights/Model.pth",
            info=translations["model_path_info"],
            interactive=True,
        )
        output_info = gr.Textbox(
            label=translations["modelinfo"],
            value="",
            interactive=False,
            scale=6,
        )

    with gr.Row():
        model.upload(fn=store_in_weights, inputs=[model], outputs=[model_path])
        # model_info receives the path textbox value; its result fills the info box.
        read_button.click(
            fn=model_info,
            inputs=[model_path],
            outputs=[output_info],
            api_name="read_model",
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/extra/child/settings.py DELETED
@@ -1,61 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.ui import change_fp
9
- from main.app.core.utils import stop_pid
10
- from main.app.core.restart import change_font, change_language, change_theme
11
- from main.app.variables import translations, theme, font, configs, language, config
12
-
13
def settings_tab(app):
    """Lay out the settings panel and wire its events.

    The panel offers a light/dark toggle, language / theme / font selection,
    a precision (fp16/fp32) switch and buttons that stop running jobs.

    Args:
        app: object forwarded unchanged to change_language, change_theme and
            change_font when the corresponding button is clicked.

    Intended to be called while a Gradio layout context is open; nothing is
    returned.
    """
    with gr.Row():
        gr.Markdown(translations["settings_markdown_2"])
    with gr.Row():
        toggle_button = gr.Button(translations["change_light_dark"], variant="secondary", scale=2)
    with gr.Row():
        with gr.Column():
            # The fallback must be a list: a bare string would be iterated
            # character by character when used as dropdown choices.
            language_dropdown = gr.Dropdown(label=translations["lang"], interactive=True, info=translations["lang_restart"], choices=configs.get("support_language", ["vi-VN"]), value=language)
            change_lang = gr.Button(translations["change_lang"], variant="primary", scale=2)
        with gr.Column():
            # Same reasoning as above: wrap the single current theme in a list.
            theme_dropdown = gr.Dropdown(label=translations["theme"], interactive=True, info=translations["theme_restart"], choices=configs.get("themes", [theme]), value=theme, allow_custom_value=True)
            changetheme = gr.Button(translations["theme_button"], variant="primary", scale=2)
    with gr.Row():
        with gr.Column():
            # The precision radio is locked on the cpu, mps and ocl:0 devices.
            fp_choice = gr.Radio(choices=["fp16","fp32"], value="fp16" if configs.get("fp16", False) else "fp32", label=translations["precision"], info=translations["precision_info"], interactive=config.device not in ["cpu", "mps", "ocl:0"])
            fp_button = gr.Button(translations["update_precision"], variant="secondary", scale=2)
        with gr.Column():
            font_choice = gr.Textbox(label=translations["font"], info=translations["font_info"], value=font, interactive=True)
            font_button = gr.Button(translations["change_font"])
    with gr.Row():
        with gr.Column():
            with gr.Accordion(translations["stop"], open=False, visible=True):
                separate_stop = gr.Button(translations["stop_separate"])
                convert_stop = gr.Button(translations["stop_convert"])
                create_dataset_stop = gr.Button(translations["stop_create_dataset"])
                with gr.Accordion(translations["stop_training"], open=False):
                    model_name_stop = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True)
                    preprocess_stop = gr.Button(translations["stop_preprocess"])
                    extract_stop = gr.Button(translations["stop_extract"])
                    train_stop = gr.Button(translations["stop_training"])
    with gr.Row():
        # Pure front-end toggle: no Python callback is involved.
        toggle_button.click(fn=None, js="() => {document.body.classList.toggle('dark')}")
        fp_button.click(fn=change_fp, inputs=[fp_choice], outputs=[fp_choice])
    with gr.Row():
        change_lang.click(fn=lambda a: change_language(a, app), inputs=[language_dropdown], outputs=[])
        changetheme.click(fn=lambda a: change_theme(a, app), inputs=[theme_dropdown], outputs=[])
        font_button.click(fn=lambda a: change_font(a, app), inputs=[font_choice], outputs=[])
    with gr.Row():
        # Second listener on the same buttons: schedule a page reload after 30 s.
        # NOTE(review): unlike the toggle handler above, these js strings are call
        # expressions rather than function literals - confirm Gradio runs them as intended.
        change_lang.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[])
        changetheme.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[])
        font_button.click(fn=None, js="setTimeout(function() {location.reload()}, 30000)", inputs=[], outputs=[])
    with gr.Row():
        # Job-level stop buttons: no model name is passed.
        separate_stop.click(fn=lambda: stop_pid("separate_pid", None, False), inputs=[], outputs=[])
        convert_stop.click(fn=lambda: stop_pid("convert_pid", None, False), inputs=[], outputs=[])
        create_dataset_stop.click(fn=lambda: stop_pid("create_dataset_pid", None, False), inputs=[], outputs=[])
    with gr.Row():
        # Training-stage stop buttons: the model name typed above is forwarded;
        # only the training stop passes True as the last argument.
        preprocess_stop.click(fn=lambda model_name_stop: stop_pid("preprocess_pid", model_name_stop, False), inputs=[model_name_stop], outputs=[])
        extract_stop.click(fn=lambda model_name_stop: stop_pid("extract_pid", model_name_stop, False), inputs=[model_name_stop], outputs=[])
        train_stop.click(fn=lambda model_name_stop: stop_pid("train_pid", model_name_stop, True), inputs=[model_name_stop], outputs=[])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/extra/extra.py DELETED
@@ -1,40 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.variables import translations, configs
9
- from main.app.tabs.extra.child.fushion import fushion_tab
10
- from main.app.tabs.extra.child.settings import settings_tab
11
- from main.app.tabs.extra.child.read_model import read_model_tab
12
- from main.app.tabs.extra.child.f0_extract import f0_extract_tab
13
- from main.app.tabs.extra.child.create_srt import create_srt_tab
14
- from main.app.tabs.extra.child.convert_model import convert_model_tab
15
-
16
def extra_tab(app):
    """Build the "extra" tab and its six sub-tabs.

    Each sub-tab shows an introductory Markdown block followed by the UI
    produced by its builder. Visibility of the outer tab and of every
    sub-tab is read from configs and defaults to True.

    Args:
        app: forwarded to settings_tab, the only builder that takes an argument.
    """
    # (tab title key, configs visibility key, intro markdown key, builder)
    sections = (
        ("fushion", "fushion_tab", "fushion_markdown", fushion_tab),
        ("read_model", "read_tab", "read_model_markdown", read_model_tab),
        ("convert_model", "onnx_tab", "pytorch2onnx", convert_model_tab),
        ("f0_extractor_tab", "f0_extractor_tab", "f0_extractor_markdown", f0_extract_tab),
        ("create_srt_tab", "create_srt_tab", "create_srt_markdown", create_srt_tab),
        ("settings", "settings_tab", "settings_markdown", lambda: settings_tab(app)),
    )

    with gr.TabItem(translations["extra"], visible=configs.get("extra_tab", True)):
        for title_key, visibility_key, intro_key, build in sections:
            with gr.TabItem(translations[title_key], visible=configs.get(visibility_key, True)):
                gr.Markdown(translations[intro_key])
                build()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/inference/child/convert.py DELETED
@@ -1,328 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.presets import load_presets, save_presets
9
- from main.app.core.inference import convert_audio, convert_selection
10
- from main.app.variables import translations, paths_for_files, sample_rate_choice, model_name, index_path, method_f0, f0_file, embedders_mode, embedders_model, presets_file, configs, file_types, export_format_choices, hybrid_f0_method
11
- from main.app.core.ui import visible, valueFalse_interactive, change_audios_choices, change_f0_choices, unlock_f0, change_preset_choices, change_backing_choices, hoplength_show, change_models_choices, get_index, index_strength_show, change_embedders_mode, shutil_move
12
-
13
def convert_tab():
    """Lay out the audio-conversion panel and wire its events.

    The panel contains the conversion switches, model / index selection,
    input and output paths, F0 and embedder settings, preset load / export
    controls and the output audio players. Intended to be called while a
    Gradio layout context is open; nothing is returned.
    """
    def existing_audio(path):
        # Preview only paths that point at a real file. The truthiness check
        # keeps a cleared dropdown (None or "") from reaching os.path.isfile,
        # which raises TypeError on None.
        return path if path and os.path.isfile(path) else None

    with gr.Row():
        gr.Markdown(translations["convert_info"])
    with gr.Row():
        with gr.Column():
            with gr.Group():
                with gr.Row():
                    cleaner0 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
                    autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
                    use_audio = gr.Checkbox(label=translations["use_audio"], value=False, interactive=True)
                    checkpointing = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
                with gr.Row():
                    # These four options start hidden and follow the use_audio switch.
                    use_original = gr.Checkbox(label=translations["convert_original"], value=False, interactive=True, visible=use_audio.value)
                    convert_backing = gr.Checkbox(label=translations["convert_backing"], value=False, interactive=True, visible=use_audio.value)
                    not_merge_backing = gr.Checkbox(label=translations["not_merge_backing"], value=False, interactive=True, visible=use_audio.value)
                    merge_instrument = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True, visible=use_audio.value)
            with gr.Row():
                pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
                clean_strength0 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner0.value)
    with gr.Row():
        with gr.Column():
            audio_select = gr.Dropdown(label=translations["select_separate"], choices=[], value="", interactive=True, allow_custom_value=True, visible=False)
            convert_button_2 = gr.Button(translations["convert_audio"], visible=False)
    with gr.Row():
        with gr.Column():
            convert_button = gr.Button(translations["convert_audio"], variant="primary")
    with gr.Row():
        with gr.Column():
            input0 = gr.Files(label=translations["drop_audio"], file_types=file_types)
            play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
        with gr.Column():
            with gr.Accordion(translations["model_accordion"], open=True):
                with gr.Row():
                    model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
                    model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
                with gr.Row():
                    refresh = gr.Button(translations["refresh"])
                with gr.Row():
                    index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "")
            with gr.Accordion(translations["input_output"], open=False):
                with gr.Column():
                    export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
                    input_audio0 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
                    output_audio = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
                with gr.Column():
                    refresh0 = gr.Button(translations["refresh"])
            with gr.Accordion(translations["setting"], open=False):
                with gr.Accordion(translations["f0_method"], open=False):
                    with gr.Group():
                        with gr.Row():
                            onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
                            unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
                        method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
                        hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=method.value == "hybrid")
                    hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
                    alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
                with gr.Accordion(translations["f0_file"], open=False):
                    upload_f0_file = gr.File(label=translations["upload_f0"], file_types=[".txt"])
                    f0_file_dropdown = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True)
                    refresh_f0_file = gr.Button(translations["refresh"])
                with gr.Accordion(translations["hubert_model"], open=False):
                    embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
                    embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
                    custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
                with gr.Accordion(translations["use_presets"], open=False):
                    with gr.Row():
                        presets_name = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True)
                    with gr.Row():
                        load_click = gr.Button(translations["load_file"], variant="primary")
                        refresh_click = gr.Button(translations["refresh"])
                    with gr.Accordion(translations["export_file"], open=False):
                        with gr.Row():
                            with gr.Column():
                                with gr.Group():
                                    with gr.Row():
                                        # One checkbox per setting that can be written into an exported preset.
                                        cleaner_chbox = gr.Checkbox(label=translations["save_clean"], value=True, interactive=True)
                                        autotune_chbox = gr.Checkbox(label=translations["save_autotune"], value=True, interactive=True)
                                        pitch_chbox = gr.Checkbox(label=translations["save_pitch"], value=True, interactive=True)
                                        index_strength_chbox = gr.Checkbox(label=translations["save_index_2"], value=True, interactive=True)
                                        resample_sr_chbox = gr.Checkbox(label=translations["save_resample"], value=True, interactive=True)
                                        filter_radius_chbox = gr.Checkbox(label=translations["save_filter"], value=True, interactive=True)
                                        rms_mix_rate_chbox = gr.Checkbox(label=translations["save_envelope"], value=True, interactive=True)
                                        protect_chbox = gr.Checkbox(label=translations["save_protect"], value=True, interactive=True)
                                        split_audio_chbox = gr.Checkbox(label=translations["save_split"], value=True, interactive=True)
                                        formant_shifting_chbox = gr.Checkbox(label=translations["formantshift"], value=True, interactive=True)
                        with gr.Row():
                            with gr.Column():
                                name_to_save_file = gr.Textbox(label=translations["filename_to_save"])
                                save_file_button = gr.Button(translations["export_file"])
                    with gr.Row():
                        upload_presets = gr.Files(label=translations["upload_presets"], file_types=[".conversion.json"])
                with gr.Column():
                    with gr.Group():
                        with gr.Row():
                            split_audio = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
                            formant_shifting = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
                        with gr.Row():
                            proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
                            audio_processing = gr.Checkbox(label=translations["audio_processing"], value=False, interactive=True)
                    resample_sr = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True)
                    proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
                    f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value)
                    filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
                    rms_mix_rate = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
                    protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
                with gr.Row():
                    formant_qfrency = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
                    formant_timbre = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
    with gr.Row():
        gr.Markdown(translations["output_convert"])
    with gr.Row():
        main_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["main_convert"])
        backing_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_backing"], visible=convert_backing.value)
        main_backing = gr.Audio(show_download_button=True, interactive=False, label=translations["main_or_backing"], visible=convert_backing.value)
    with gr.Row():
        original_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_original"], visible=use_original.value)
        vocal_instrument = gr.Audio(show_download_button=True, interactive=False, label=translations["voice_or_instruments"], visible=merge_instrument.value)

    # Components read and written back by load_presets, in the order it expects.
    preset_fields = [
        cleaner0,
        autotune,
        pitch,
        clean_strength0,
        index_strength,
        resample_sr,
        filter_radius,
        rms_mix_rate,
        protect,
        split_audio,
        f0_autotune_strength,
        formant_shifting,
        formant_qfrency,
        formant_timbre,
        proposal_pitch,
        proposal_pitch_threshold,
    ]

    # Arguments shared by convert_selection and convert_audio. convert_audio
    # additionally receives audio_select between these two groups.
    convert_head = [
        cleaner0,
        autotune,
        use_audio,
        use_original,
        convert_backing,
        not_merge_backing,
        merge_instrument,
        pitch,
        clean_strength0,
        model_pth,
        model_index,
        index_strength,
        input_audio0,
        output_audio,
        export_format,
        method,
        hybrid_method,
        hop_length,
        embedders,
        custom_embedders,
        resample_sr,
        filter_radius,
        rms_mix_rate,
        protect,
        split_audio,
        f0_autotune_strength,
    ]
    convert_tail = [
        checkpointing,
        onnx_f0_mode,
        formant_shifting,
        formant_qfrency,
        formant_timbre,
        f0_file_dropdown,
        embed_mode,
        proposal_pitch,
        proposal_pitch_threshold,
        audio_processing,
        alpha,
    ]

    with gr.Row():
        upload_f0_file.upload(fn=lambda inp: shutil_move(inp.name, configs["f0_path"]), inputs=[upload_f0_file], outputs=[f0_file_dropdown])
        refresh_f0_file.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown])
        unlock_full_method.change(fn=unlock_f0, inputs=[unlock_full_method], outputs=[method])
    with gr.Row():
        load_click.click(
            fn=load_presets,
            inputs=[presets_name] + preset_fields,
            outputs=preset_fields
        )
        refresh_click.click(fn=change_preset_choices, inputs=[], outputs=[presets_name])
        save_file_button.click(
            fn=save_presets,
            inputs=[
                name_to_save_file,
                cleaner0,
                autotune,
                pitch,
                clean_strength0,
                index_strength,
                resample_sr,
                filter_radius,
                rms_mix_rate,
                protect,
                split_audio,
                f0_autotune_strength,
                cleaner_chbox,
                autotune_chbox,
                pitch_chbox,
                index_strength_chbox,
                resample_sr_chbox,
                filter_radius_chbox,
                rms_mix_rate_chbox,
                protect_chbox,
                split_audio_chbox,
                formant_shifting_chbox,
                formant_shifting,
                formant_qfrency,
                formant_timbre,
                proposal_pitch,
                proposal_pitch_threshold
            ],
            outputs=[presets_name]
        )
    with gr.Row():
        # Every uploaded preset is moved; the first result is sent to the preset dropdown.
        upload_presets.upload(fn=lambda presets_in: [shutil_move(preset.name, configs["presets_path"]) for preset in presets_in][0], inputs=[upload_presets], outputs=[presets_name])
        autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength])
        # Thirteen updates matched positionally to the outputs: five visibility
        # updates, four valueFalse_interactive updates for the same four option
        # checkboxes, then four inverse-visibility updates for the manual
        # input / output widgets.
        use_audio.change(fn=lambda a: [visible(a), visible(a), visible(a), visible(a), visible(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), visible(not a), visible(not a), visible(not a), visible(not a)], inputs=[use_audio], outputs=[main_backing, use_original, convert_backing, not_merge_backing, merge_instrument, use_original, convert_backing, not_merge_backing, merge_instrument, input_audio0, output_audio, input0, play_audio])
    with gr.Row():
        convert_backing.change(fn=lambda a,b: [change_backing_choices(a, b), visible(a)], inputs=[convert_backing, not_merge_backing], outputs=[use_original, backing_convert])
        use_original.change(fn=lambda audio, original: [visible(original), visible(not original), visible(audio and not original), valueFalse_interactive(not original), valueFalse_interactive(not original)], inputs=[use_audio, use_original], outputs=[original_convert, main_convert, main_backing, convert_backing, not_merge_backing])
        cleaner0.change(fn=visible, inputs=[cleaner0], outputs=[clean_strength0])
    with gr.Row():
        merge_instrument.change(fn=visible, inputs=[merge_instrument], outputs=[vocal_instrument])
        not_merge_backing.change(fn=lambda audio, merge, cvb: [visible(audio and not merge), change_backing_choices(cvb, merge)], inputs=[use_audio, not_merge_backing, convert_backing], outputs=[main_backing, use_original])
        # The hybrid selector and alpha slider are shown only for the "hybrid" method.
        method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method, hybrid_method], outputs=[hybrid_method, alpha, hop_length])
    with gr.Row():
        hybrid_method.change(fn=hoplength_show, inputs=[method, hybrid_method], outputs=[hop_length])
        refresh.click(fn=change_models_choices, inputs=[], outputs=[model_pth, model_index])
        model_pth.change(fn=get_index, inputs=[model_pth], outputs=[model_index])
    with gr.Row():
        # Every uploaded audio file is moved; the first result is sent to the path dropdown.
        input0.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input0], outputs=[input_audio0])
        input_audio0.change(fn=existing_audio, inputs=[input_audio0], outputs=[play_audio])
        formant_shifting.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[formant_shifting], outputs=[formant_qfrency, formant_timbre])
    with gr.Row():
        embedders.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders], outputs=[custom_embedders])
        refresh0.click(fn=change_audios_choices, inputs=[input_audio0], outputs=[input_audio0])
        model_index.change(fn=index_strength_show, inputs=[model_index], outputs=[index_strength])
    with gr.Row():
        # Registered before the conversion listeners below: hide the button that was clicked.
        convert_button.click(fn=lambda: visible(False), inputs=[], outputs=[convert_button])
        convert_button_2.click(fn=lambda: [visible(False), visible(False)], inputs=[], outputs=[audio_select, convert_button_2])
    with gr.Row():
        proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
        embed_mode.change(fn=change_embedders_mode, inputs=[embed_mode], outputs=[embedders])
    with gr.Row():
        convert_button.click(
            fn=convert_selection,
            inputs=convert_head + convert_tail,
            outputs=[audio_select, main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button, convert_button_2],
            api_name="convert_selection"
        )
        convert_button_2.click(
            fn=convert_audio,
            inputs=convert_head + [audio_select] + convert_tail,
            outputs=[main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button],
            api_name="convert_audio"
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/inference/child/convert_tts.py DELETED
@@ -1,280 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.tts import TTS
9
- from main.app.core.process import process_input
10
- from main.app.core.inference import convert_tts
11
- from main.app.core.utils import google_translate
12
- from main.app.core.presets import save_presets, load_presets
13
- from main.app.core.ui import visible, change_f0_choices, unlock_f0, hoplength_show, change_models_choices, get_index, index_strength_show, change_embedders_mode, change_tts_voice_choices, shutil_move, change_preset_choices
14
- from main.app.variables import translations, sample_rate_choice, model_name, index_path, method_f0, f0_file, embedders_mode, embedders_model, edgetts, google_tts_voice, configs, presets_file, export_format_choices, hybrid_f0_method
15
-
16
- def convert_tts_tab():
17
- with gr.Row():
18
- gr.Markdown(translations["convert_text_markdown_2"])
19
- with gr.Row():
20
- with gr.Column():
21
- with gr.Group():
22
- with gr.Row():
23
- use_txt = gr.Checkbox(label=translations["input_txt"], value=False, interactive=True)
24
- google_tts_check_box = gr.Checkbox(label=translations["googletts"], value=False, interactive=True)
25
- prompt = gr.Textbox(label=translations["text_to_speech"], value="", placeholder="Hello Words", lines=3)
26
- with gr.Column():
27
- speed = gr.Slider(label=translations["voice_speed"], info=translations["voice_speed_info"], minimum=-100, maximum=100, value=0, step=1)
28
- pitch0 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
29
- with gr.Row():
30
- tts_button = gr.Button(translations["tts_1"], variant="primary", scale=2)
31
- convert_button0 = gr.Button(translations["tts_2"], variant="secondary", scale=2)
32
- with gr.Row():
33
- with gr.Column():
34
- txt_input = gr.File(label=translations["drop_text"], file_types=[".txt", ".srt", ".docx"], visible=use_txt.value)
35
- tts_voice = gr.Dropdown(label=translations["voice"], choices=edgetts, interactive=True, value="vi-VN-NamMinhNeural")
36
- tts_pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info_2"], label=translations["pitch"], value=0, interactive=True)
37
- with gr.Accordion(translations["translate"], open=False):
38
- with gr.Row():
39
- source_lang = gr.Dropdown(label=translations["source_lang"], choices=["auto"]+google_tts_voice, interactive=True, value="auto")
40
- target_lang = gr.Dropdown(label=translations["target_lang"], choices=google_tts_voice, interactive=True, value="en")
41
- translate_button = gr.Button(translations["translate"])
42
- with gr.Column():
43
- with gr.Accordion(translations["model_accordion"], open=True):
44
- with gr.Row():
45
- model_pth0 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
46
- model_index0 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
47
- with gr.Row():
48
- refresh1 = gr.Button(translations["refresh"])
49
- with gr.Row():
50
- index_strength0 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index0.value != "")
51
- with gr.Accordion(translations["output_path"], open=False):
52
- export_format0 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
53
- output_audio0 = gr.Textbox(label=translations["output_tts"], value="audios/tts.wav", placeholder="audios/tts.wav", info=translations["tts_output"], interactive=True)
54
- output_audio1 = gr.Textbox(label=translations["output_tts_convert"], value="audios/tts-convert.wav", placeholder="audios/tts-convert.wav", info=translations["tts_output"], interactive=True)
55
- with gr.Accordion(translations["setting"], open=False):
56
- with gr.Accordion(translations["f0_method"], open=False):
57
- with gr.Group():
58
- with gr.Row():
59
- onnx_f0_mode1 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
60
- unlock_full_method3 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
61
- method0 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
62
- hybrid_method0 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=method0.value == "hybrid")
63
- hop_length0 = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
64
- alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
65
- with gr.Accordion(translations["f0_file"], open=False):
66
- upload_f0_file0 = gr.File(label=translations["upload_f0"], file_types=[".txt"])
67
- f0_file_dropdown0 = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True)
68
- refresh_f0_file0 = gr.Button(translations["refresh"])
69
- with gr.Accordion(translations["hubert_model"], open=False):
70
- embed_mode1 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
71
- embedders0 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
72
- custom_embedders0 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders0.value == "custom")
73
- with gr.Accordion(translations["use_presets"], open=False):
74
- with gr.Row():
75
- presets_name = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True)
76
- with gr.Row():
77
- load_click = gr.Button(translations["load_file"], variant="primary")
78
- refresh_click = gr.Button(translations["refresh"])
79
- with gr.Accordion(translations["export_file"], open=False):
80
- with gr.Row():
81
- with gr.Column():
82
- with gr.Group():
83
- with gr.Row():
84
- cleaner_chbox = gr.Checkbox(label=translations["save_clean"], value=True, interactive=True)
85
- autotune_chbox = gr.Checkbox(label=translations["save_autotune"], value=True, interactive=True)
86
- pitch_chbox = gr.Checkbox(label=translations["save_pitch"], value=True, interactive=True)
87
- index_strength_chbox = gr.Checkbox(label=translations["save_index_2"], value=True, interactive=True)
88
- resample_sr_chbox = gr.Checkbox(label=translations["save_resample"], value=True, interactive=True)
89
- filter_radius_chbox = gr.Checkbox(label=translations["save_filter"], value=True, interactive=True)
90
- rms_mix_rate_chbox = gr.Checkbox(label=translations["save_envelope"], value=True, interactive=True)
91
- protect_chbox = gr.Checkbox(label=translations["save_protect"], value=True, interactive=True)
92
- split_audio_chbox = gr.Checkbox(label=translations["save_split"], value=True, interactive=True)
93
- formant_shifting_chbox = gr.Checkbox(label=translations["formantshift"], value=True, interactive=True)
94
- with gr.Row():
95
- with gr.Column():
96
- name_to_save_file = gr.Textbox(label=translations["filename_to_save"])
97
- save_file_button = gr.Button(translations["export_file"])
98
- with gr.Row():
99
- upload_presets = gr.Files(label=translations["upload_presets"], file_types=[".conversion.json"])
100
- with gr.Group():
101
- with gr.Row():
102
- audio_processing = gr.Checkbox(label=translations["audio_processing"], value=False, interactive=True)
103
- with gr.Row():
104
- formant_shifting1 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
105
- split_audio0 = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
106
- cleaner1 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
107
- with gr.Row():
108
- autotune3 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
109
- checkpointing0 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
110
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
111
- with gr.Column():
112
- resample_sr0 = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True)
113
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
114
- f0_autotune_strength0 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune3.value)
115
- clean_strength1 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner1.value)
116
- filter_radius0 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
117
- rms_mix_rate0 = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
118
- protect0 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
119
- with gr.Row():
120
- formant_qfrency1 = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
121
- formant_timbre1 = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
122
- with gr.Row():
123
- gr.Markdown(translations["output_tts_markdown"])
124
- with gr.Row():
125
- tts_voice_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["output_text_to_speech"])
126
- tts_voice_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])
127
- with gr.Row():
128
- load_click.click(
129
- fn=load_presets,
130
- inputs=[
131
- presets_name,
132
- cleaner1,
133
- autotune3,
134
- pitch0,
135
- clean_strength1,
136
- index_strength0,
137
- resample_sr0,
138
- filter_radius0,
139
- rms_mix_rate0,
140
- protect0,
141
- split_audio0,
142
- f0_autotune_strength0,
143
- formant_shifting1,
144
- formant_qfrency1,
145
- formant_timbre1,
146
- proposal_pitch,
147
- proposal_pitch_threshold
148
- ],
149
- outputs=[
150
- cleaner1,
151
- autotune3,
152
- pitch0,
153
- clean_strength1,
154
- index_strength0,
155
- resample_sr0,
156
- filter_radius0,
157
- rms_mix_rate0,
158
- protect0,
159
- split_audio0,
160
- f0_autotune_strength0,
161
- formant_shifting1,
162
- formant_qfrency1,
163
- formant_timbre1,
164
- proposal_pitch,
165
- proposal_pitch_threshold
166
- ]
167
- )
168
- refresh_click.click(fn=change_preset_choices, inputs=[], outputs=[presets_name])
169
- save_file_button.click(
170
- fn=save_presets,
171
- inputs=[
172
- name_to_save_file,
173
- cleaner1,
174
- autotune3,
175
- pitch0,
176
- clean_strength1,
177
- index_strength0,
178
- resample_sr0,
179
- filter_radius0,
180
- rms_mix_rate0,
181
- protect0,
182
- split_audio0,
183
- f0_autotune_strength0,
184
- cleaner_chbox,
185
- autotune_chbox,
186
- pitch_chbox,
187
- index_strength_chbox,
188
- resample_sr_chbox,
189
- filter_radius_chbox,
190
- rms_mix_rate_chbox,
191
- protect_chbox,
192
- split_audio_chbox,
193
- formant_shifting_chbox,
194
- formant_shifting1,
195
- formant_qfrency1,
196
- formant_timbre1,
197
- proposal_pitch,
198
- proposal_pitch_threshold
199
- ],
200
- outputs=[presets_name]
201
- )
202
- with gr.Row():
203
- proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
204
- upload_presets.upload(fn=lambda presets_in: [shutil_move(preset.name, configs["presets_path"]) for preset in presets_in][0], inputs=[upload_presets], outputs=[presets_name])
205
- translate_button.click(fn=google_translate, inputs=[prompt, source_lang, target_lang], outputs=[prompt], api_name="google_translate")
206
- with gr.Row():
207
- unlock_full_method3.change(fn=unlock_f0, inputs=[unlock_full_method3], outputs=[method0])
208
- upload_f0_file0.upload(fn=lambda inp: shutil_move(inp.name, configs["f0_path"]), inputs=[upload_f0_file0], outputs=[f0_file_dropdown0])
209
- refresh_f0_file0.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown0])
210
- with gr.Row():
211
- embed_mode1.change(fn=change_embedders_mode, inputs=[embed_mode1], outputs=[embedders0])
212
- autotune3.change(fn=visible, inputs=[autotune3], outputs=[f0_autotune_strength0])
213
- model_pth0.change(fn=get_index, inputs=[model_pth0], outputs=[model_index0])
214
- with gr.Row():
215
- cleaner1.change(fn=visible, inputs=[cleaner1], outputs=[clean_strength1])
216
- method0.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method0, hybrid_method0], outputs=[hybrid_method0, alpha, hop_length0])
217
- hybrid_method0.change(fn=hoplength_show, inputs=[method0, hybrid_method0], outputs=[hop_length0])
218
- with gr.Row():
219
- refresh1.click(fn=change_models_choices, inputs=[], outputs=[model_pth0, model_index0])
220
- embedders0.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders0], outputs=[custom_embedders0])
221
- formant_shifting1.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[formant_shifting1], outputs=[formant_qfrency1, formant_timbre1])
222
- with gr.Row():
223
- model_index0.change(fn=index_strength_show, inputs=[model_index0], outputs=[index_strength0])
224
- txt_input.upload(fn=process_input, inputs=[txt_input], outputs=[prompt])
225
- use_txt.change(fn=visible, inputs=[use_txt], outputs=[txt_input])
226
- with gr.Row():
227
- google_tts_check_box.change(fn=change_tts_voice_choices, inputs=[google_tts_check_box], outputs=[tts_voice])
228
- tts_button.click(
229
- fn=TTS,
230
- inputs=[
231
- prompt,
232
- tts_voice,
233
- speed,
234
- output_audio0,
235
- tts_pitch,
236
- google_tts_check_box,
237
- txt_input
238
- ],
239
- outputs=[tts_voice_audio],
240
- api_name="text-to-speech"
241
- )
242
- convert_button0.click(
243
- fn=convert_tts,
244
- inputs=[
245
- cleaner1,
246
- autotune3,
247
- pitch0,
248
- clean_strength1,
249
- model_pth0,
250
- model_index0,
251
- index_strength0,
252
- output_audio0,
253
- output_audio1,
254
- export_format0,
255
- method0,
256
- hybrid_method0,
257
- hop_length0,
258
- embedders0,
259
- custom_embedders0,
260
- resample_sr0,
261
- filter_radius0,
262
- rms_mix_rate0,
263
- protect0,
264
- split_audio0,
265
- f0_autotune_strength0,
266
- checkpointing0,
267
- onnx_f0_mode1,
268
- formant_shifting1,
269
- formant_qfrency1,
270
- formant_timbre1,
271
- f0_file_dropdown0,
272
- embed_mode1,
273
- proposal_pitch,
274
- proposal_pitch_threshold,
275
- audio_processing,
276
- alpha
277
- ],
278
- outputs=[tts_voice_convert],
279
- api_name="convert_tts"
280
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/inference/child/convert_with_whisper.py DELETED
@@ -1,164 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.inference import convert_with_whisper
9
- from main.app.core.ui import visible, change_audios_choices, unlock_f0, hoplength_show, change_models_choices, get_index, index_strength_show, change_embedders_mode, shutil_move
10
- from main.app.variables import translations, paths_for_files, sample_rate_choice, model_name, index_path, method_f0, embedders_mode, embedders_model, configs, file_types, export_format_choices, whisper_model, hybrid_f0_method
11
-
12
- def convert_with_whisper_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["convert_with_whisper_info"])
15
- with gr.Row():
16
- with gr.Column():
17
- with gr.Group():
18
- with gr.Row():
19
- cleaner2 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
20
- autotune2 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
21
- checkpointing2 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
22
- formant_shifting2 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
23
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
24
- audio_processing = gr.Checkbox(label=translations["audio_processing"], value=False, interactive=True)
25
- with gr.Row():
26
- num_spk = gr.Slider(minimum=2, maximum=8, step=1, info=translations["num_spk_info"], label=translations["num_spk"], value=2, interactive=True)
27
- with gr.Row():
28
- with gr.Column():
29
- convert_button3 = gr.Button(translations["convert_audio"], variant="primary")
30
- with gr.Row():
31
- with gr.Column():
32
- with gr.Accordion(translations["model_accordion"] + " 1", open=True):
33
- with gr.Row():
34
- model_pth2 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
35
- model_index2 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
36
- with gr.Row():
37
- refresh2 = gr.Button(translations["refresh"])
38
- with gr.Row():
39
- pitch3 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
40
- index_strength2 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index2.value != "")
41
- with gr.Accordion(translations["input_output"], open=False):
42
- with gr.Column():
43
- export_format2 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
44
- input_audio1 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
45
- output_audio2 = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
46
- with gr.Column():
47
- refresh4 = gr.Button(translations["refresh"])
48
- with gr.Row():
49
- input2 = gr.Files(label=translations["drop_audio"], file_types=file_types)
50
- with gr.Column():
51
- with gr.Accordion(translations["model_accordion"] + " 2", open=True):
52
- with gr.Row():
53
- model_pth3 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
54
- model_index3 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
55
- with gr.Row():
56
- refresh3 = gr.Button(translations["refresh"])
57
- with gr.Row():
58
- pitch4 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
59
- index_strength3 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index3.value != "")
60
- with gr.Accordion(translations["setting"], open=False):
61
- with gr.Row():
62
- model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=whisper_model, value="medium", interactive=True)
63
- with gr.Accordion(translations["f0_method"], open=False):
64
- with gr.Group():
65
- with gr.Row():
66
- onnx_f0_mode4 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
67
- unlock_full_method2 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
68
- method3 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
69
- hybrid_method3 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=method3.value == "hybrid")
70
- hop_length3 = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
71
- alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
72
- with gr.Accordion(translations["hubert_model"], open=False):
73
- embed_mode3 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
74
- embedders3 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
75
- custom_embedders3 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders3.value == "custom")
76
- with gr.Column():
77
- resample_sr3 = gr.Radio(choices=[0]+sample_rate_choice, label=translations["resample"], info=translations["resample_info"], value=0, interactive=True)
78
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
79
- clean_strength3 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner2.value)
80
- f0_autotune_strength3 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune2.value)
81
- filter_radius3 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
82
- rms_mix_rate3 = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
83
- protect3 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
84
- with gr.Row():
85
- formant_qfrency3 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 1", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
86
- formant_timbre3 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 1", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
87
- with gr.Row():
88
- formant_qfrency4 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 2", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
89
- formant_timbre4 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 2", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
90
- with gr.Row():
91
- gr.Markdown(translations["input_output"])
92
- with gr.Row():
93
- play_audio2 = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
94
- play_audio3 = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])
95
- with gr.Row():
96
- autotune2.change(fn=visible, inputs=[autotune2], outputs=[f0_autotune_strength3])
97
- cleaner2.change(fn=visible, inputs=[cleaner2], outputs=[clean_strength3])
98
- method3.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method3, hybrid_method3], outputs=[hybrid_method3, alpha, hop_length3])
99
- with gr.Row():
100
- hybrid_method3.change(fn=hoplength_show, inputs=[method3, hybrid_method3], outputs=[hop_length3])
101
- refresh2.click(fn=change_models_choices, inputs=[], outputs=[model_pth2, model_index2])
102
- model_pth2.change(fn=get_index, inputs=[model_pth2], outputs=[model_index2])
103
- with gr.Row():
104
- refresh3.click(fn=change_models_choices, inputs=[], outputs=[model_pth3, model_index3])
105
- model_pth3.change(fn=get_index, inputs=[model_pth3], outputs=[model_index3])
106
- input2.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[input2], outputs=[input_audio1])
107
- with gr.Row():
108
- input_audio1.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio1], outputs=[play_audio2])
109
- formant_shifting2.change(fn=lambda a: [visible(a) for _ in range(4)], inputs=[formant_shifting2], outputs=[formant_qfrency3, formant_timbre3, formant_qfrency4, formant_timbre4])
110
- embedders3.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders3], outputs=[custom_embedders3])
111
- with gr.Row():
112
- refresh4.click(fn=change_audios_choices, inputs=[input_audio1], outputs=[input_audio1])
113
- model_index2.change(fn=index_strength_show, inputs=[model_index2], outputs=[index_strength2])
114
- model_index3.change(fn=index_strength_show, inputs=[model_index3], outputs=[index_strength3])
115
- with gr.Row():
116
- unlock_full_method2.change(fn=unlock_f0, inputs=[unlock_full_method2], outputs=[method3])
117
- embed_mode3.change(fn=change_embedders_mode, inputs=[embed_mode3], outputs=[embedders3])
118
- proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
119
- with gr.Row():
120
- convert_button3.click(
121
- fn=convert_with_whisper,
122
- inputs=[
123
- num_spk,
124
- model_size,
125
- cleaner2,
126
- clean_strength3,
127
- autotune2,
128
- f0_autotune_strength3,
129
- checkpointing2,
130
- model_pth2,
131
- model_pth3,
132
- model_index2,
133
- model_index3,
134
- pitch3,
135
- pitch4,
136
- index_strength2,
137
- index_strength3,
138
- export_format2,
139
- input_audio1,
140
- output_audio2,
141
- onnx_f0_mode4,
142
- method3,
143
- hybrid_method3,
144
- hop_length3,
145
- embed_mode3,
146
- embedders3,
147
- custom_embedders3,
148
- resample_sr3,
149
- filter_radius3,
150
- rms_mix_rate3,
151
- protect3,
152
- formant_shifting2,
153
- formant_qfrency3,
154
- formant_timbre3,
155
- formant_qfrency4,
156
- formant_timbre4,
157
- proposal_pitch,
158
- proposal_pitch_threshold,
159
- audio_processing,
160
- alpha
161
- ],
162
- outputs=[play_audio3],
163
- api_name="convert_with_whisper"
164
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/inference/child/separate.py DELETED
@@ -1,263 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.downloads import download_url
9
- from main.app.core.separate import separate_music
10
- from main.app.core.ui import visible, valueFalse_interactive, change_audios_choices, shutil_move, separate_change
11
- from main.app.variables import translations, uvr_model, karaoke_models, reverb_models, vr_models, denoise_models, mdx_models, paths_for_files, sample_rate_choice, configs, file_types, export_format_choices
12
-
13
- def separate_tab():
14
- with gr.Row():
15
- gr.Markdown(translations["4_part"])
16
- with gr.Row():
17
- with gr.Column():
18
- with gr.Group():
19
- with gr.Row():
20
- enable_denoise = gr.Checkbox(label=translations["denoise_mdx"], value=False, interactive=False)
21
- separate_backing = gr.Checkbox(label=translations["separator_backing"], value=False, interactive=True)
22
- separate_reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=True)
23
- enable_tta = gr.Checkbox(label=translations["enable_tta"], value=False, interactive=False)
24
- high_end_process = gr.Checkbox(label=translations["high_end_process"], value=False, interactive=False)
25
- enable_post_process = gr.Checkbox(label=translations["enable_post_process"], value=False, interactive=False)
26
- with gr.Row():
27
- model_name = gr.Dropdown(label=translations["separator_model"], value=uvr_model[0], choices=uvr_model, interactive=True)
28
- karaoke_model = gr.Dropdown(label=translations["separator_backing_model"], value=list(karaoke_models.keys())[0], choices=list(karaoke_models.keys()), interactive=True, visible=separate_backing.value)
29
- reverb_model = gr.Dropdown(label=translations["dereveb_model"], value=list(reverb_models.keys())[0], choices=list(reverb_models.keys()), interactive=True, visible=separate_reverb.value)
30
- denoise_model = gr.Dropdown(label=translations["denoise_model"], value=list(denoise_models.keys())[0], choices=list(denoise_models.keys()), interactive=True, visible=enable_denoise.value and model_name.value in list(vr_models.keys()))
31
- with gr.Row():
32
- with gr.Column():
33
- separate_button = gr.Button(translations["separator_tab"], variant="primary")
34
- with gr.Row():
35
- with gr.Column():
36
- with gr.Group():
37
- with gr.Row():
38
- shifts = gr.Slider(label=translations["shift"], info=translations["shift_info"], minimum=1, maximum=20, value=2, step=1, interactive=True)
39
- batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True, visible=False)
40
- with gr.Row():
41
- segments_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True)
42
- aggression = gr.Slider(label=translations['aggression'], info=translations["aggression_info"], minimum=1, maximum=50, value=5, step=1, interactive=True, visible=False)
43
- drop_audio = gr.Files(label=translations["drop_audio"], file_types=file_types)
44
- with gr.Accordion(translations["use_url"], open=False):
45
- url = gr.Textbox(label=translations["url_audio"], value="", placeholder="https://www.youtube.com/...", scale=6)
46
- download_button = gr.Button(translations["downloads"])
47
- with gr.Column():
48
- with gr.Group():
49
- with gr.Row():
50
- overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True)
51
- with gr.Row():
52
- window_size = gr.Slider(label=translations["window_size"], info=translations["window_size_info"], minimum=320, maximum=1024, value=512, step=32, interactive=True, visible=False)
53
- hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=8192, value=1024, step=1, interactive=True, visible=False)
54
- post_process_threshold = gr.Slider(label=translations['post_process_threshold'], info=translations["post_process_threshold_info"], minimum=0.1, maximum=0.3, value=0.2, step=0.1, interactive=True, visible=False)
55
- sample_rate = gr.Radio(choices=sample_rate_choice, value=44100, label=translations["sr"], info=translations["sr_info"], interactive=True)
56
- with gr.Accordion(translations["input_output"], open=False):
57
- export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=export_format_choices, value="wav", interactive=True)
58
- input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, allow_custom_value=True, interactive=True)
59
- refresh_audio = gr.Button(translations["refresh"])
60
- output_dirs = gr.Textbox(label=translations["output_folder"], value="audios", placeholder="audios", info=translations["output_folder_info"], interactive=True)
61
- audio_input = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
62
- with gr.Row():
63
- gr.Markdown(translations["output_separator"])
64
- with gr.Row():
65
- instruments_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["instruments"])
66
- original_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["original_vocal"])
67
- main_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["main_vocal"], visible=separate_backing.value)
68
- backing_vocals = gr.Audio(show_download_button=True, interactive=False, label=translations["backing_vocal"], visible=separate_backing.value)
69
- with gr.Row():
70
- model_name.change(fn=lambda a: valueFalse_interactive(a in list(mdx_models.keys()) + list(vr_models.keys())), inputs=[model_name], outputs=[enable_denoise])
71
- separate_backing.change(fn=lambda a, b: valueFalse_interactive(a or b), inputs=[separate_backing, separate_reverb], outputs=[enable_denoise])
72
- separate_reverb.change(fn=lambda a, b: valueFalse_interactive(a or b), inputs=[separate_backing, separate_reverb], outputs=[enable_denoise])
73
- with gr.Row():
74
- input_audio.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio], outputs=[audio_input])
75
- drop_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[drop_audio], outputs=[input_audio])
76
- refresh_audio.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio])
77
- with gr.Row():
78
- separate_backing.change(fn=lambda a: [visible(a) for _ in range(2)], inputs=[separate_backing], outputs=[main_vocals, backing_vocals])
79
- download_button.click(
80
- fn=download_url,
81
- inputs=[url],
82
- outputs=[input_audio, audio_input, url],
83
- api_name='download_url'
84
- )
85
- with gr.Row():
86
- model_name.change(
87
- fn=separate_change,
88
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
89
- outputs=[
90
- karaoke_model,
91
- reverb_model,
92
- overlap,
93
- segments_size,
94
- hop_length,
95
- batch_size,
96
- shifts,
97
- window_size,
98
- aggression,
99
- post_process_threshold,
100
- denoise_model,
101
- enable_tta,
102
- high_end_process,
103
- enable_post_process,
104
- ]
105
- )
106
- with gr.Row():
107
- karaoke_model.change(
108
- fn=separate_change,
109
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
110
- outputs=[
111
- karaoke_model,
112
- reverb_model,
113
- overlap,
114
- segments_size,
115
- hop_length,
116
- batch_size,
117
- shifts,
118
- window_size,
119
- aggression,
120
- post_process_threshold,
121
- denoise_model,
122
- enable_tta,
123
- high_end_process,
124
- enable_post_process,
125
- ]
126
- )
127
- separate_backing.change(
128
- fn=separate_change,
129
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
130
- outputs=[
131
- karaoke_model,
132
- reverb_model,
133
- overlap,
134
- segments_size,
135
- hop_length,
136
- batch_size,
137
- shifts,
138
- window_size,
139
- aggression,
140
- post_process_threshold,
141
- denoise_model,
142
- enable_tta,
143
- high_end_process,
144
- enable_post_process,
145
- ]
146
- )
147
- with gr.Row():
148
- reverb_model.change(
149
- fn=separate_change,
150
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
151
- outputs=[
152
- karaoke_model,
153
- reverb_model,
154
- overlap,
155
- segments_size,
156
- hop_length,
157
- batch_size,
158
- shifts,
159
- window_size,
160
- aggression,
161
- post_process_threshold,
162
- denoise_model,
163
- enable_tta,
164
- high_end_process,
165
- enable_post_process,
166
- ]
167
- )
168
- separate_reverb.change(
169
- fn=separate_change,
170
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
171
- outputs=[
172
- karaoke_model,
173
- reverb_model,
174
- overlap,
175
- segments_size,
176
- hop_length,
177
- batch_size,
178
- shifts,
179
- window_size,
180
- aggression,
181
- post_process_threshold,
182
- denoise_model,
183
- enable_tta,
184
- high_end_process,
185
- enable_post_process,
186
- ]
187
- )
188
- with gr.Row():
189
- enable_denoise.change(
190
- fn=separate_change,
191
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
192
- outputs=[
193
- karaoke_model,
194
- reverb_model,
195
- overlap,
196
- segments_size,
197
- hop_length,
198
- batch_size,
199
- shifts,
200
- window_size,
201
- aggression,
202
- post_process_threshold,
203
- denoise_model,
204
- enable_tta,
205
- high_end_process,
206
- enable_post_process,
207
- ]
208
- )
209
- enable_post_process.change(
210
- fn=separate_change,
211
- inputs=[model_name, karaoke_model, reverb_model, enable_post_process, separate_backing, separate_reverb, enable_denoise],
212
- outputs=[
213
- karaoke_model,
214
- reverb_model,
215
- overlap,
216
- segments_size,
217
- hop_length,
218
- batch_size,
219
- shifts,
220
- window_size,
221
- aggression,
222
- post_process_threshold,
223
- denoise_model,
224
- enable_tta,
225
- high_end_process,
226
- enable_post_process,
227
- ]
228
- )
229
- with gr.Row():
230
- separate_button.click(
231
- fn=separate_music,
232
- inputs=[
233
- input_audio,
234
- output_dirs,
235
- export_format,
236
- model_name,
237
- karaoke_model,
238
- reverb_model,
239
- denoise_model,
240
- sample_rate,
241
- shifts,
242
- batch_size,
243
- overlap,
244
- aggression,
245
- hop_length,
246
- window_size,
247
- segments_size,
248
- post_process_threshold,
249
- enable_tta,
250
- enable_denoise,
251
- high_end_process,
252
- enable_post_process,
253
- separate_backing,
254
- separate_reverb
255
- ],
256
- outputs=[
257
- original_vocals,
258
- instruments_audio,
259
- main_vocals,
260
- backing_vocals
261
- ],
262
- api_name="separate_music"
263
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/inference/inference.py DELETED
@@ -1,30 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.variables import translations, configs
9
- from main.app.tabs.inference.child.convert import convert_tab
10
- from main.app.tabs.inference.child.separate import separate_tab
11
- from main.app.tabs.inference.child.convert_tts import convert_tts_tab
12
- from main.app.tabs.inference.child.convert_with_whisper import convert_with_whisper_tab
13
-
14
- def inference_tab():
15
- with gr.TabItem(translations["inference"], visible=configs.get("inference_tab", True)):
16
- with gr.TabItem(translations["separator_tab"], visible=configs.get("separator_tab", True)):
17
- gr.Markdown(f"## {translations['separator_tab']}")
18
- separate_tab()
19
-
20
- with gr.TabItem(translations["convert_audio"], visible=configs.get("convert_tab", True)):
21
- gr.Markdown(f"## {translations['convert_audio']}")
22
- convert_tab()
23
-
24
- with gr.TabItem(translations["convert_with_whisper"], visible=configs.get("convert_with_whisper", True)):
25
- gr.Markdown(f"## {translations['convert_with_whisper']}")
26
- convert_with_whisper_tab()
27
-
28
- with gr.TabItem(translations["convert_text"], visible=configs.get("tts_tab", True)):
29
- gr.Markdown(translations["convert_text_markdown"])
30
- convert_tts_tab()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/realtime/realtime.py DELETED
@@ -1,226 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.realtime import realtime_start, realtime_stop
9
- from main.app.variables import translations, configs, model_name, index_path, method_f0, embedders_mode, embedders_model
10
- from main.app.core.ui import change_models_choices, get_index, index_strength_show, unlock_f0, hoplength_show, change_embedders_mode, visible, audio_device, change_audio_device_choices, update_audio_device
11
-
12
- input_channels_map, output_channels_map = audio_device()
13
-
14
- def realtime_tab():
15
- with gr.TabItem(translations["realtime"], visible=configs.get("realtime_tab", True)):
16
- gr.Markdown(translations["realtime_markdown"])
17
- with gr.Row():
18
- gr.Markdown(translations["realtime_markdown_2"])
19
- with gr.Row():
20
- status = gr.Label(label=translations["realtime_latency"], value=translations["realtime_not_startup"])
21
- with gr.Row():
22
- monitor = gr.Checkbox(label=translations["monitor"], value=False, interactive=True)
23
- exclusive_mode = gr.Checkbox(label=translations["exclusive_mode"], value=False, interactive=True)
24
- vad_enabled = gr.Checkbox(label=translations["vad_enabled"], value=False, interactive=True)
25
- clean_audio = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
26
- with gr.Row():
27
- with gr.Accordion(translations["audio_device"], open=True):
28
- with gr.Row():
29
- input_audio_device = gr.Dropdown(label=translations["input_audio_device_label"], info=translations["input_audio_device_info"], choices=list(input_channels_map.keys()), value=list(input_channels_map.keys())[0] if len(list(input_channels_map.keys())) >= 1 else "", interactive=True)
30
- output_audio_device = gr.Dropdown(label=translations["output_audio_device_label"], info=translations["output_audio_device_info"], choices=list(output_channels_map.keys()), value=list(output_channels_map.keys())[0] if len(list(output_channels_map.keys())) >= 1 else "", interactive=True)
31
- monitor_output_device = gr.Dropdown(label=translations["monitor_output_device_label"], info=translations["monitor_output_device_info"], choices=list(output_channels_map.keys()), value=list(output_channels_map.keys())[0] if len(list(output_channels_map.keys())) >= 1 else "", interactive=True, visible=False)
32
- with gr.Row():
33
- input_audio_gain = gr.Slider(minimum=0, maximum=2500, label=translations["input_audio_gain_label"], info=translations["input_audio_gain_info"], value=100, step=1, interactive=True)
34
- output_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["output_audio_gain_label"], info=translations["output_audio_gain_info"], value=100, step=1, interactive=True)
35
- monitor_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["monitor_audio_gain_label"], info=translations["monitor_audio_gain_info"], value=100, step=1, interactive=True, visible=False)
36
- with gr.Row(visible=False) as asio_row:
37
- input_asio_channels = gr.Slider(minimum=-1, maximum=128, label=translations["input_asio_channels_label"], info=translations["input_asio_channels_info"], value=-1, step=1, interactive=True, visible=False)
38
- output_asio_channels = gr.Slider(minimum=-1, maximum=128, label=translations["output_asio_channels_label"], info=translations["output_asio_channels_info"], value=-1, step=1, interactive=True, visible=False)
39
- monitor_asio_channels = gr.Slider(minimum=-1, maximum=128, label=translations["monitor_asio_channels_label"], info=translations["monitor_asio_channels_info"], value=-1, step=1, interactive=True, visible=False)
40
- with gr.Row():
41
- refresh_audio_device = gr.Button(value=translations["refresh_audio_device"], variant="secondary")
42
- with gr.Row():
43
- start_realtime = gr.Button(value=translations["start_realtime_button"], variant="primary", interactive=True)
44
- stop_realtime = gr.Button(value=translations["stop_realtime_button"], variant="stop", interactive=False)
45
- with gr.Row():
46
- chunk_size = gr.Slider(minimum=2.7, maximum=2730.7, step=0.1, label=translations["chunk_size"], info=translations["chunk_size_info"], value=1024, interactive=True)
47
- pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
48
- with gr.Row():
49
- with gr.Column():
50
- with gr.Accordion(translations["model_accordion"], open=True):
51
- with gr.Row():
52
- model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
53
- model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
54
- with gr.Row():
55
- model_refresh = gr.Button(translations["refresh"])
56
- with gr.Row():
57
- index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "")
58
- with gr.Column():
59
- with gr.Accordion(translations["f0_method"], open=True):
60
- with gr.Group():
61
- with gr.Row():
62
- onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
63
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
64
- f0_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True)
65
- hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
66
- with gr.Column():
67
- with gr.Accordion(translations["hubert_model"], open=True):
68
- embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
69
- embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
70
- custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
71
- with gr.Row():
72
- with gr.Accordion(translations["setting"], open=True):
73
- with gr.Row():
74
- f0_autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
75
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
76
- with gr.Group():
77
- with gr.Row():
78
- f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=f0_autotune.value)
79
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
80
- with gr.Row():
81
- rms_mix_rate = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
82
- protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
83
- with gr.Row():
84
- clean_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
85
- filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
86
- with gr.Column():
87
- silent_threshold = gr.Slider(minimum=-90, maximum=-60, label=translations["silent_threshold_label"], info=translations["silent_threshold_info"], value=-90, step=1, interactive=True)
88
- extra_convert_size = gr.Slider(minimum=0.1, maximum=5, label=translations["extra_convert_size_label"], info=translations["extra_convert_size_info"], value=0.5, step=0.1, interactive=True)
89
- cross_fade_overlap_size = gr.Slider(minimum=0.05, maximum=0.2, label=translations["cross_fade_overlap_size_label"], info=translations["cross_fade_overlap_size_info"], value=0.1, step=0.01, interactive=True)
90
- with gr.Row():
91
- vad_sensitivity = gr.Slider(minimum=0, maximum=3, label=translations["vad_sensitivity_label"], info=translations["vad_sensitivity_info"], value=3, step=1, interactive=True, visible=vad_enabled.value)
92
- vad_frame_ms = gr.Slider(minimum=10, maximum=30, label=translations["vad_frame_ms_label"], info=translations["vad_frame_ms_info"], value=30, step=10, interactive=True, visible=vad_enabled.value)
93
- with gr.Row():
94
- model_pth.change(
95
- fn=get_index,
96
- inputs=[model_pth],
97
- outputs=[model_index]
98
- )
99
- model_index.change(
100
- fn=index_strength_show,
101
- inputs=[model_index],
102
- outputs=[index_strength]
103
- )
104
- model_refresh.click(
105
- fn=change_models_choices,
106
- inputs=[],
107
- outputs=[model_pth, model_index]
108
- )
109
- with gr.Row():
110
- unlock_full_method.change(
111
- fn=lambda f0_method: {"choices": [m for m in unlock_f0(f0_method)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"},
112
- inputs=[unlock_full_method],
113
- outputs=[f0_method]
114
- )
115
- f0_method.change(
116
- fn=lambda f0_method: hoplength_show(f0_method, None),
117
- inputs=[f0_method],
118
- outputs=[hop_length]
119
- )
120
- embed_mode.change(
121
- fn=change_embedders_mode,
122
- inputs=[embed_mode],
123
- outputs=[embedders]
124
- )
125
- with gr.Row():
126
- embedders.change(
127
- fn=lambda embedders: visible(embedders == "custom"),
128
- inputs=[embedders],
129
- outputs=[custom_embedders]
130
- )
131
- input_audio_device.change(
132
- fn=update_audio_device,
133
- inputs=[input_audio_device, output_audio_device, monitor_output_device, monitor],
134
- outputs=[monitor_output_device, monitor_audio_gain, monitor_asio_channels, asio_row, input_asio_channels, output_asio_channels, monitor_asio_channels]
135
- )
136
- output_audio_device.change(
137
- fn=update_audio_device,
138
- inputs=[input_audio_device, output_audio_device, monitor_output_device, monitor],
139
- outputs=[monitor_output_device, monitor_audio_gain, monitor_asio_channels, asio_row, input_asio_channels, output_asio_channels, monitor_asio_channels]
140
- )
141
- with gr.Row():
142
- monitor_output_device.change(
143
- fn=update_audio_device,
144
- inputs=[input_audio_device, output_audio_device, monitor_output_device, monitor],
145
- outputs=[monitor_output_device, monitor_audio_gain, monitor_asio_channels, asio_row, input_asio_channels, output_asio_channels, monitor_asio_channels]
146
- )
147
- monitor.change(
148
- fn=update_audio_device,
149
- inputs=[input_audio_device, output_audio_device, monitor_output_device, monitor],
150
- outputs=[monitor_output_device, monitor_audio_gain, monitor_asio_channels, asio_row, input_asio_channels, output_asio_channels, monitor_asio_channels]
151
- )
152
- f0_autotune.change(
153
- fn=visible,
154
- inputs=[f0_autotune],
155
- outputs=[f0_autotune_strength]
156
- )
157
- with gr.Row():
158
- proposal_pitch.change(
159
- fn=visible,
160
- inputs=[proposal_pitch],
161
- outputs=[proposal_pitch_threshold]
162
- )
163
- vad_enabled.change(
164
- fn=lambda a: [visible(a) for _ in range(2)],
165
- inputs=[vad_enabled],
166
- outputs=[vad_sensitivity, vad_frame_ms]
167
- )
168
- refresh_audio_device.click(
169
- fn=change_audio_device_choices,
170
- inputs=[],
171
- outputs=[input_audio_device, output_audio_device, monitor_output_device]
172
- )
173
- with gr.Row():
174
- clean_audio.change(
175
- fn=visible,
176
- inputs=[clean_audio],
177
- outputs=[clean_strength]
178
- )
179
- start_realtime.click(
180
- fn=realtime_start,
181
- inputs=[
182
- monitor,
183
- exclusive_mode,
184
- vad_enabled,
185
- input_audio_device,
186
- output_audio_device,
187
- monitor_output_device,
188
- input_audio_gain,
189
- output_audio_gain,
190
- monitor_audio_gain,
191
- input_asio_channels,
192
- output_asio_channels,
193
- monitor_asio_channels,
194
- chunk_size,
195
- pitch,
196
- model_pth,
197
- model_index,
198
- index_strength,
199
- onnx_f0_mode,
200
- f0_method,
201
- hop_length,
202
- embed_mode,
203
- embedders,
204
- custom_embedders,
205
- f0_autotune,
206
- proposal_pitch,
207
- f0_autotune_strength,
208
- proposal_pitch_threshold,
209
- rms_mix_rate,
210
- protect,
211
- filter_radius,
212
- silent_threshold,
213
- extra_convert_size,
214
- cross_fade_overlap_size,
215
- vad_sensitivity,
216
- vad_frame_ms,
217
- clean_audio,
218
- clean_strength
219
- ],
220
- outputs=[status, start_realtime, stop_realtime]
221
- )
222
- stop_realtime.click(
223
- fn=realtime_stop,
224
- inputs=[],
225
- outputs=[status, start_realtime, stop_realtime]
226
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/realtime/realtime_client.py DELETED
@@ -1,210 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.variables import translations, configs, model_name, index_path, method_f0, embedders_mode, embedders_model
9
- from main.app.core.ui import change_models_choices, get_index, index_strength_show, unlock_f0, hoplength_show, change_embedders_mode, visible, update_dropdowns_from_json, update_button_from_json
10
-
11
- def realtime_client_tab():
12
- with gr.TabItem(translations["realtime_client"], visible=configs.get("realtime_client_tab", True)):
13
- gr.Markdown(translations["realtime_markdown"])
14
- with gr.Row():
15
- gr.Markdown(translations["realtime_markdown_2"])
16
- with gr.Row():
17
- gr.Label(label=translations["realtime_latency"], value=translations["realtime_not_startup"], elem_id="realtime-status-info")
18
- with gr.Row():
19
- monitor = gr.Checkbox(label=translations["monitor"], value=False, interactive=True)
20
- vad_enabled = gr.Checkbox(label=translations["vad_enabled"], value=False, interactive=True)
21
- clean_audio = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
22
- with gr.Row():
23
- with gr.Accordion(translations["audio_device"], open=True):
24
- with gr.Row():
25
- input_audio_device = gr.Dropdown(label=translations["input_audio_device_label"], info=translations["input_audio_device_info"], choices=[], value=None, interactive=True)
26
- output_audio_device = gr.Dropdown(label=translations["output_audio_device_label"], info=translations["output_audio_device_info"], choices=[], value=None, interactive=True)
27
- monitor_output_device = gr.Dropdown(label=translations["monitor_output_device_label"], info=translations["monitor_output_device_info"], choices=[], value=None, interactive=True, visible=False)
28
- with gr.Row():
29
- input_audio_gain = gr.Slider(minimum=0, maximum=2500, label=translations["input_audio_gain_label"], info=translations["input_audio_gain_info"], value=100, step=1, interactive=True)
30
- output_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["output_audio_gain_label"], info=translations["output_audio_gain_info"], value=100, step=1, interactive=True)
31
- monitor_audio_gain = gr.Slider(minimum=0, maximum=4000, label=translations["monitor_audio_gain_label"], info=translations["monitor_audio_gain_info"], value=100, step=1, interactive=True, visible=False)
32
- with gr.Row():
33
- refresh_audio_device = gr.Button(value=translations["refresh_audio_device"], variant="secondary")
34
- with gr.Row():
35
- start_realtime = gr.Button(value=translations["start_realtime_button"], variant="primary", interactive=True)
36
- stop_realtime = gr.Button(value=translations["stop_realtime_button"], variant="stop", interactive=False)
37
- with gr.Row():
38
- chunk_size = gr.Slider(minimum=2.7, maximum=2730.7, step=0.1, label=translations["chunk_size"], info=translations["chunk_size_info"], value=1024, interactive=True)
39
- pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
40
- with gr.Row():
41
- with gr.Column():
42
- with gr.Accordion(translations["model_accordion"], open=True):
43
- with gr.Row():
44
- model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
45
- model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
46
- with gr.Row():
47
- model_refresh = gr.Button(translations["refresh"])
48
- with gr.Row():
49
- index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "")
50
- with gr.Column():
51
- with gr.Accordion(translations["f0_method"], open=True):
52
- with gr.Group():
53
- with gr.Row():
54
- onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
55
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
56
- f0_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=[m for m in method_f0 if m != "hybrid"], value="rmvpe", interactive=True)
57
- hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
58
- with gr.Column():
59
- with gr.Accordion(translations["hubert_model"], open=True):
60
- embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
61
- embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
62
- custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
63
- with gr.Row():
64
- with gr.Accordion(translations["setting"], open=True):
65
- with gr.Row():
66
- f0_autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
67
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
68
- with gr.Group():
69
- with gr.Row():
70
- f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=f0_autotune.value)
71
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
72
- with gr.Row():
73
- rms_mix_rate = gr.Slider(minimum=0, maximum=1, label=translations["rms_mix_rate"], info=translations["rms_mix_rate_info"], value=1, step=0.1, interactive=True)
74
- protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
75
- with gr.Row():
76
- clean_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
77
- filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
78
- with gr.Column():
79
- silent_threshold = gr.Slider(minimum=-90, maximum=-60, label=translations["silent_threshold_label"], info=translations["silent_threshold_info"], value=-90, step=1, interactive=True)
80
- extra_convert_size = gr.Slider(minimum=0.1, maximum=5, label=translations["extra_convert_size_label"], info=translations["extra_convert_size_info"], value=0.5, step=0.1, interactive=True)
81
- cross_fade_overlap_size = gr.Slider(minimum=0.05, maximum=0.2, label=translations["cross_fade_overlap_size_label"], info=translations["cross_fade_overlap_size_info"], value=0.1, step=0.01, interactive=True)
82
- with gr.Row():
83
- vad_sensitivity = gr.Slider(minimum=0, maximum=3, label=translations["vad_sensitivity_label"], info=translations["vad_sensitivity_info"], value=3, step=1, interactive=True, visible=vad_enabled.value)
84
- vad_frame_ms = gr.Slider(minimum=10, maximum=30, label=translations["vad_frame_ms_label"], info=translations["vad_frame_ms_info"], value=30, step=10, interactive=True, visible=vad_enabled.value)
85
- with gr.Row():
86
- json_audio_hidden = gr.JSON(visible=False)
87
- json_button_hidden = gr.JSON(visible=False)
88
- with gr.Row():
89
- model_pth.change(
90
- fn=get_index,
91
- inputs=[model_pth],
92
- outputs=[model_index]
93
- )
94
- model_index.change(
95
- fn=index_strength_show,
96
- inputs=[model_index],
97
- outputs=[index_strength]
98
- )
99
- model_refresh.click(
100
- fn=change_models_choices,
101
- inputs=[],
102
- outputs=[model_pth, model_index]
103
- )
104
- with gr.Row():
105
- unlock_full_method.change(
106
- fn=lambda f0_method: {"choices": [m for m in unlock_f0(f0_method)["choices"] if m != "hybrid"], "value": "rmvpe", "__type__": "update"},
107
- inputs=[unlock_full_method],
108
- outputs=[f0_method]
109
- )
110
- f0_method.change(
111
- fn=lambda f0_method: hoplength_show(f0_method, None),
112
- inputs=[f0_method],
113
- outputs=[hop_length]
114
- )
115
- embed_mode.change(
116
- fn=change_embedders_mode,
117
- inputs=[embed_mode],
118
- outputs=[embedders]
119
- )
120
- with gr.Row():
121
- embedders.change(
122
- fn=lambda embedders: visible(embedders == "custom"),
123
- inputs=[embedders],
124
- outputs=[custom_embedders]
125
- )
126
- f0_autotune.change(
127
- fn=visible,
128
- inputs=[f0_autotune],
129
- outputs=[f0_autotune_strength]
130
- )
131
- clean_audio.change(
132
- fn=visible,
133
- inputs=[clean_audio],
134
- outputs=[clean_strength]
135
- )
136
- with gr.Row():
137
- proposal_pitch.change(
138
- fn=visible,
139
- inputs=[proposal_pitch],
140
- outputs=[proposal_pitch_threshold]
141
- )
142
- vad_enabled.change(
143
- fn=lambda a: [visible(a) for _ in range(2)],
144
- inputs=[vad_enabled],
145
- outputs=[vad_sensitivity, vad_frame_ms]
146
- )
147
- refresh_audio_device.click(
148
- fn=None,
149
- js="getAudioDevices",
150
- inputs=[],
151
- outputs=json_audio_hidden
152
- )
153
- with gr.Row():
154
- json_audio_hidden.change(
155
- fn=update_dropdowns_from_json,
156
- inputs=[json_audio_hidden],
157
- outputs=[input_audio_device, output_audio_device, monitor_output_device]
158
- )
159
- json_button_hidden.change(
160
- fn=update_button_from_json,
161
- inputs=[json_button_hidden],
162
- outputs=[start_realtime, stop_realtime]
163
- )
164
- with gr.Row():
165
- start_realtime.click(
166
- fn=None,
167
- js="StreamAudioRealtime",
168
- inputs=[
169
- monitor,
170
- vad_enabled,
171
- input_audio_device,
172
- output_audio_device,
173
- monitor_output_device,
174
- input_audio_gain,
175
- output_audio_gain,
176
- monitor_audio_gain,
177
- chunk_size,
178
- pitch,
179
- model_pth,
180
- model_index,
181
- index_strength,
182
- onnx_f0_mode,
183
- f0_method,
184
- hop_length,
185
- embed_mode,
186
- embedders,
187
- custom_embedders,
188
- f0_autotune,
189
- proposal_pitch,
190
- f0_autotune_strength,
191
- proposal_pitch_threshold,
192
- rms_mix_rate,
193
- protect,
194
- filter_radius,
195
- silent_threshold,
196
- extra_convert_size,
197
- cross_fade_overlap_size,
198
- vad_sensitivity,
199
- vad_frame_ms,
200
- clean_audio,
201
- clean_strength
202
- ],
203
- outputs=[json_button_hidden]
204
- )
205
- stop_realtime.click(
206
- fn=None,
207
- js="StopAudioStream",
208
- inputs=[],
209
- outputs=[json_button_hidden]
210
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/training/child/create_dataset.py DELETED
@@ -1,282 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.training import create_dataset
9
- from main.app.core.ui import visible, valueFalse_interactive, create_dataset_change
10
- from main.app.variables import translations, sample_rate_choice, uvr_model, reverb_models, denoise_models, vr_models, mdx_models
11
-
12
- def create_dataset_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["create_dataset_markdown_2"])
15
- with gr.Group():
16
- with gr.Row():
17
- separate = gr.Checkbox(label=translations["separator_tab"], value=False, interactive=True)
18
- clean_dataset = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
19
- skip_seconds = gr.Checkbox(label=translations["skip"], value=False, interactive=True)
20
- separate_reverb = gr.Checkbox(label=translations["dereveb_audio"], value=False, interactive=False)
21
- with gr.Row(visible=False) as row:
22
- enable_tta = gr.Checkbox(label=translations["enable_tta"], value=False, interactive=False)
23
- high_end_process = gr.Checkbox(label=translations["high_end_process"], value=False, interactive=False)
24
- enable_post_process = gr.Checkbox(label=translations["enable_post_process"], value=False, interactive=False)
25
- enable_denoise = gr.Checkbox(label=translations["denoise_mdx"], value=False, interactive=False)
26
- with gr.Row():
27
- dataset_url = gr.Textbox(label=translations["url_audio"], info=translations["create_dataset_url"], value="", placeholder="https://www.youtube.com/...", interactive=True, scale=5)
28
- output_dataset = gr.Textbox(label=translations["output_data"], info=translations["output_data_info"], value="dataset", placeholder="dataset", interactive=True)
29
- with gr.Row():
30
- create_dataset_button = gr.Button(translations["createdataset"], variant="primary", scale=2, min_width=4000)
31
- with gr.Row(visible=False) as row_2:
32
- model_name = gr.Dropdown(label=translations["separator_model"], value=uvr_model[0], choices=uvr_model, interactive=True)
33
- reverb_model = gr.Dropdown(label=translations["dereveb_model"], value=list(reverb_models.keys())[0], choices=list(reverb_models.keys()), interactive=True)
34
- denoise_model = gr.Dropdown(label=translations["denoise_model"], value=list(denoise_models.keys())[0], choices=list(denoise_models.keys()), interactive=True, visible=False)
35
- with gr.Row():
36
- with gr.Column(visible=False) as row_3:
37
- with gr.Group():
38
- with gr.Row():
39
- overlap = gr.Radio(label=translations["overlap"], info=translations["overlap_info"], choices=["0.25", "0.5", "0.75", "0.99"], value="0.25", interactive=True)
40
- with gr.Row():
41
- window_size = gr.Slider(label=translations["window_size"], info=translations["window_size_info"], minimum=320, maximum=1024, value=512, step=32, interactive=True, visible=False)
42
- with gr.Row():
43
- shifts = gr.Slider(label=translations["shift"], info=translations["shift_info"], minimum=1, maximum=20, value=2, step=1, interactive=True)
44
- segments_size = gr.Slider(label=translations["segments_size"], info=translations["segments_size_info"], minimum=32, maximum=3072, value=256, step=32, interactive=True)
45
- with gr.Row():
46
- batch_size = gr.Slider(label=translations["batch_size"], info=translations["mdx_batch_size_info"], minimum=1, maximum=64, value=1, step=1, interactive=True, visible=False)
47
- hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=8192, value=1024, step=1, interactive=True, visible=False)
48
- with gr.Row():
49
- post_process_threshold = gr.Slider(label=translations['post_process_threshold'], info=translations["post_process_threshold_info"], minimum=0.1, maximum=0.3, value=0.2, step=0.1, interactive=True, visible=False)
50
- aggression = gr.Slider(label=translations['aggression'], info=translations["aggression_info"], minimum=1, maximum=50, value=5, step=1, interactive=True, visible=False)
51
- with gr.Column():
52
- sample_rate = gr.Radio(choices=sample_rate_choice, value=48000, label=translations["sr"], info=translations["sr_info"], interactive=True)
53
- clean_strength = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.5, label=translations["clean_strength"], info=translations["clean_strength_info"], interactive=True, visible=False)
54
- with gr.Row():
55
- skip_start = gr.Textbox(label=translations["skip_start"], info=translations["skip_start_info"], value="", placeholder="0,...", interactive=True, visible=skip_seconds.value)
56
- skip_end = gr.Textbox(label=translations["skip_end"], info=translations["skip_end_info"], value="", placeholder="0,...", interactive=True, visible=skip_seconds.value)
57
- create_dataset_info = gr.Textbox(label=translations["create_dataset_info"], value="", interactive=False, lines=2)
58
- with gr.Row():
59
- separate.change(
60
- fn=lambda a: [visible(a) for _ in range(3)],
61
- inputs=[separate],
62
- outputs=[
63
- row,
64
- row_2,
65
- row_3
66
- ]
67
- )
68
- separate.change(
69
- fn=valueFalse_interactive,
70
- inputs=[separate],
71
- outputs=[separate_reverb]
72
- )
73
- separate.change(
74
- fn=create_dataset_change,
75
- inputs=[
76
- model_name,
77
- reverb_model,
78
- enable_post_process,
79
- separate_reverb,
80
- enable_denoise
81
- ],
82
- outputs=[
83
- reverb_model,
84
- overlap,
85
- segments_size,
86
- hop_length,
87
- batch_size,
88
- shifts,
89
- window_size,
90
- aggression,
91
- post_process_threshold,
92
- denoise_model,
93
- enable_tta,
94
- high_end_process,
95
- enable_post_process,
96
- ]
97
- )
98
- with gr.Row():
99
- model_name.change(
100
- fn=create_dataset_change,
101
- inputs=[
102
- model_name,
103
- reverb_model,
104
- enable_post_process,
105
- separate_reverb,
106
- enable_denoise
107
- ],
108
- outputs=[
109
- reverb_model,
110
- overlap,
111
- segments_size,
112
- hop_length,
113
- batch_size,
114
- shifts,
115
- window_size,
116
- aggression,
117
- post_process_threshold,
118
- denoise_model,
119
- enable_tta,
120
- high_end_process,
121
- enable_post_process,
122
- ]
123
- )
124
- reverb_model.change(
125
- fn=create_dataset_change,
126
- inputs=[
127
- model_name,
128
- reverb_model,
129
- enable_post_process,
130
- separate_reverb,
131
- enable_denoise
132
- ],
133
- outputs=[
134
- reverb_model,
135
- overlap,
136
- segments_size,
137
- hop_length,
138
- batch_size,
139
- shifts,
140
- window_size,
141
- aggression,
142
- post_process_threshold,
143
- denoise_model,
144
- enable_tta,
145
- high_end_process,
146
- enable_post_process,
147
- ]
148
- )
149
- denoise_model.change(
150
- fn=create_dataset_change,
151
- inputs=[
152
- model_name,
153
- reverb_model,
154
- enable_post_process,
155
- separate_reverb,
156
- enable_denoise
157
- ],
158
- outputs=[
159
- reverb_model,
160
- overlap,
161
- segments_size,
162
- hop_length,
163
- batch_size,
164
- shifts,
165
- window_size,
166
- aggression,
167
- post_process_threshold,
168
- denoise_model,
169
- enable_tta,
170
- high_end_process,
171
- enable_post_process,
172
- ]
173
- )
174
- with gr.Row():
175
- separate_reverb.change(
176
- fn=create_dataset_change,
177
- inputs=[
178
- model_name,
179
- reverb_model,
180
- enable_post_process,
181
- separate_reverb,
182
- enable_denoise
183
- ],
184
- outputs=[
185
- reverb_model,
186
- overlap,
187
- segments_size,
188
- hop_length,
189
- batch_size,
190
- shifts,
191
- window_size,
192
- aggression,
193
- post_process_threshold,
194
- denoise_model,
195
- enable_tta,
196
- high_end_process,
197
- enable_post_process,
198
- ]
199
- )
200
- enable_denoise.change(
201
- fn=create_dataset_change,
202
- inputs=[
203
- model_name,
204
- reverb_model,
205
- enable_post_process,
206
- separate_reverb,
207
- enable_denoise
208
- ],
209
- outputs=[
210
- reverb_model,
211
- overlap,
212
- segments_size,
213
- hop_length,
214
- batch_size,
215
- shifts,
216
- window_size,
217
- aggression,
218
- post_process_threshold,
219
- denoise_model,
220
- enable_tta,
221
- high_end_process,
222
- enable_post_process,
223
- ]
224
- )
225
- with gr.Row():
226
- skip_seconds.change(
227
- fn=lambda a: [visible(a) for _ in range(2)],
228
- inputs=[skip_seconds],
229
- outputs=[
230
- skip_start,
231
- skip_end
232
- ]
233
- )
234
- clean_dataset.change(
235
- fn=visible,
236
- inputs=[clean_dataset],
237
- outputs=[clean_strength]
238
- )
239
- with gr.Row():
240
- model_name.change(
241
- fn=lambda a: valueFalse_interactive(a in list(mdx_models.keys()) + list(vr_models.keys())),
242
- inputs=[model_name],
243
- outputs=[enable_denoise]
244
- )
245
- separate_reverb.change(
246
- fn=valueFalse_interactive,
247
- inputs=[separate_reverb],
248
- outputs=[enable_denoise]
249
- )
250
- with gr.Row():
251
- create_dataset_button.click(
252
- fn=create_dataset,
253
- inputs=[
254
- dataset_url,
255
- output_dataset,
256
- skip_seconds,
257
- skip_start,
258
- skip_end,
259
- separate,
260
- model_name,
261
- reverb_model,
262
- denoise_model,
263
- sample_rate,
264
- shifts,
265
- batch_size,
266
- overlap,
267
- aggression,
268
- hop_length,
269
- window_size,
270
- segments_size,
271
- post_process_threshold,
272
- enable_tta,
273
- enable_denoise,
274
- high_end_process,
275
- enable_post_process,
276
- separate_reverb,
277
- clean_dataset,
278
- clean_strength
279
- ],
280
- outputs=[create_dataset_info],
281
- api_name="create_dataset"
282
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/training/child/create_reference.py DELETED
@@ -1,97 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.training import create_reference
9
- from main.app.core.ui import visible, change_audios_choices, unlock_f0, shutil_move, change_embedders_mode
10
- from main.app.variables import translations, paths_for_files, method_f0, hybrid_f0_method, file_types, configs, embedders_model, embedders_mode
11
-
12
- def create_reference_tab():
13
- with gr.Row():
14
- gr.Markdown(translations["create_reference_markdown_2"])
15
- with gr.Row():
16
- pitch_guidance = gr.Checkbox(label=translations["training_pitch"], value=True, interactive=True)
17
- use_energy = gr.Checkbox(label=translations["train&energy"], value=False, interactive=True)
18
- f0_autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
19
- proposal_pitch = gr.Checkbox(label=translations["proposal_pitch"], value=False, interactive=True)
20
- with gr.Row():
21
- create_reference_button = gr.Button(translations["create_reference"], variant="primary")
22
- with gr.Row():
23
- f0_up_key = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
24
- proposal_pitch_threshold = gr.Slider(minimum=50.0, maximum=1200.0, label=translations["proposal_pitch_threshold"], info=translations["proposal_pitch_threshold_info"], value=255.0, step=0.1, interactive=True, visible=proposal_pitch.value)
25
- with gr.Row():
26
- filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
27
- f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=f0_autotune.value)
28
- with gr.Row():
29
- with gr.Column():
30
- with gr.Accordion(translations["input_output"], open=False):
31
- with gr.Column():
32
- input_audio = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
33
- reference_name = gr.Textbox(label=translations["reference_name"], value="reference", placeholder="reference", info=translations["reference_name_info"], interactive=True)
34
- with gr.Column():
35
- refresh_audio = gr.Button(translations["refresh"])
36
- with gr.Column():
37
- upload_audio = gr.Files(label=translations["drop_audio"], file_types=file_types)
38
- with gr.Column():
39
- play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
40
- with gr.Column() as f0_method_column:
41
- with gr.Accordion(label=translations["f0_method"], open=False):
42
- with gr.Group():
43
- with gr.Row():
44
- onnx_f0 = gr.Checkbox(label=translations["f0_onnx_mode"], value=False, interactive=True)
45
- unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], value=False, interactive=True)
46
- f0_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
47
- f0_hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=f0_method.value == "hybrid")
48
- with gr.Row():
49
- alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
50
- with gr.Column():
51
- with gr.Accordion(label=translations["hubert_model"], open=False):
52
- with gr.Row():
53
- version = gr.Radio(label=translations["training_version"], info=translations["training_version_info"], choices=["v1", "v2"], value="v2", interactive=True)
54
- with gr.Group():
55
- embedder_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
56
- embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
57
- with gr.Row():
58
- embedders_custom = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
59
- with gr.Row():
60
- create_reference_info = gr.Textbox(label=translations["reference_info"], value="", interactive=False, lines=2)
61
- with gr.Row():
62
- f0_autotune.change(fn=visible, inputs=[f0_autotune], outputs=[f0_autotune_strength])
63
- proposal_pitch.change(fn=visible, inputs=[proposal_pitch], outputs=[proposal_pitch_threshold])
64
- unlock_full_method.change(fn=unlock_f0, inputs=[unlock_full_method], outputs=[f0_method])
65
- with gr.Row():
66
- input_audio.change(fn=lambda audio: audio, inputs=[input_audio], outputs=[play_audio])
67
- refresh_audio.click(fn=change_audios_choices, inputs=[input_audio], outputs=[input_audio])
68
- f0_method.change(fn=lambda method: [visible(method == "hybrid") for _ in range(2)], inputs=[f0_method], outputs=[f0_hybrid_method, alpha])
69
- with gr.Row():
70
- upload_audio.upload(fn=lambda audio_in: [shutil_move(audio.name, configs["audios_path"]) for audio in audio_in][0], inputs=[upload_audio], outputs=[input_audio])
71
- embedder_mode.change(fn=change_embedders_mode, inputs=[embedder_mode], outputs=[embedders])
72
- embedders.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders], outputs=[embedders_custom])
73
- with gr.Row():
74
- pitch_guidance.change(fn=visible, inputs=[pitch_guidance], outputs=[f0_method_column])
75
- create_reference_button.click(
76
- fn=create_reference,
77
- inputs=[
78
- input_audio,
79
- reference_name,
80
- pitch_guidance,
81
- use_energy,
82
- version,
83
- embedders,
84
- embedder_mode,
85
- f0_method,
86
- onnx_f0,
87
- f0_up_key,
88
- filter_radius,
89
- f0_autotune,
90
- f0_autotune_strength,
91
- proposal_pitch,
92
- proposal_pitch_threshold,
93
- alpha
94
- ],
95
- outputs=[create_reference_info],
96
- api_name="create_reference"
97
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/training/child/training.py DELETED
@@ -1,259 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.core.process import zip_file
9
- from main.app.core.training import preprocess, extract, create_index, training
10
- from main.app.variables import translations, model_name, index_path, method_f0, embedders_mode, embedders_model, pretrainedD, pretrainedG, config, file_types, hybrid_f0_method, reference_list
11
- from main.app.core.ui import gr_warning, visible, unlock_f0, hoplength_show, change_models_choices, get_gpu_info, change_embedders_mode, pitch_guidance_lock, vocoders_lock, unlock_ver, unlock_vocoder, change_pretrained_choices, gpu_number_str, shutil_move, change_reference_choices
12
-
13
- def training_model_tab():
14
- with gr.Row():
15
- gr.Markdown(translations["training_markdown"])
16
- with gr.Row():
17
- with gr.Column():
18
- with gr.Row():
19
- with gr.Column():
20
- training_name = gr.Textbox(label=translations["modelname"], info=translations["training_model_name"], value="", placeholder=translations["modelname"], interactive=True)
21
- training_sr = gr.Radio(label=translations["sample_rate"], info=translations["sample_rate_info"], choices=["32k", "40k", "48k"], value="48k", interactive=True)
22
- training_ver = gr.Radio(label=translations["training_version"], info=translations["training_version_info"], choices=["v1", "v2"], value="v2", interactive=True)
23
- with gr.Row():
24
- clean_dataset = gr.Checkbox(label=translations["clear_dataset"], value=False, interactive=True)
25
- process_effects = gr.Checkbox(label=translations["preprocess_effect"], value=False, interactive=True)
26
- training_f0 = gr.Checkbox(label=translations["training_pitch"], value=True, interactive=True)
27
- custom_reference = gr.Checkbox(label=translations["custom_reference"], value=False, interactive=True)
28
- checkpointing1 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
29
- upload = gr.Checkbox(label=translations["upload_dataset"], value=False, interactive=True)
30
- with gr.Row():
31
- preprocess_split_audio_mode = gr.Radio(label=translations["split_audio_mode"], info=translations["split_audio_mode_info"], value="Automatic", choices=["Automatic", "Simple", "Skip"], interactive=True)
32
- preprocess_normalization_mode = gr.Radio(label=translations["normalization_mode"], info=translations["normalization_mode_info"], value="none", choices=["none", "pre", "post"], interactive=True)
33
- with gr.Row(visible=custom_reference.value) as custom_reference_row:
34
- with gr.Accordion(translations["custom_reference"], open=True):
35
- reference_name = gr.Dropdown(label=translations["reference_name"], info=translations["reference_name_info"], choices=reference_list, value=reference_list[0] if len(reference_list) >= 1 else "", allow_custom_value=True, interactive=True)
36
- reference_refresh = gr.Button(translations["refresh"], scale=2)
37
- with gr.Row(visible=clean_dataset.value) as clean_dataset_row:
38
- clean_dataset_strength = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.7, step=0.1, interactive=True)
39
- with gr.Column():
40
- preprocess_button = gr.Button(translations["preprocess_button"], scale=2)
41
- upload_dataset = gr.Files(label=translations["drop_audio"], file_types=file_types, visible=upload.value)
42
- preprocess_info = gr.Textbox(label=translations["preprocess_info"], value="", interactive=False, container=True, lines=2)
43
- with gr.Column():
44
- with gr.Row():
45
- with gr.Column():
46
- with gr.Accordion(label=translations["f0_method"], open=False):
47
- with gr.Group():
48
- with gr.Row():
49
- onnx_f0_mode2 = gr.Checkbox(label=translations["f0_onnx_mode"], value=False, interactive=True)
50
- unlock_full_method4 = gr.Checkbox(label=translations["f0_unlock"], value=False, interactive=True)
51
- autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
52
- extract_method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0, value="rmvpe", interactive=True)
53
- extract_hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=hybrid_f0_method, value=hybrid_f0_method[0], interactive=True, allow_custom_value=True, visible=extract_method.value == "hybrid")
54
- extract_hop_length = gr.Slider(label=translations['hop_length'], info=translations["hop_length_info"], minimum=64, maximum=512, value=160, step=1, interactive=True, visible=False)
55
- f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value)
56
- alpha = gr.Slider(label=translations["alpha_label"], info=translations["alpha_info"], minimum=0.1, maximum=1, value=0.5, step=0.1, interactive=True, visible=False)
57
- with gr.Accordion(label=translations["hubert_model"], open=False):
58
- with gr.Group():
59
- embed_mode2 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
60
- extract_embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
61
- with gr.Row():
62
- extract_embedders_custom = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=extract_embedders.value == "custom")
63
- with gr.Column():
64
- extract_button = gr.Button(translations["extract_button"], scale=2)
65
- extract_info = gr.Textbox(label=translations["extract_info"], value="", interactive=False, lines=2)
66
- with gr.Column():
67
- with gr.Row():
68
- with gr.Column():
69
- total_epochs = gr.Slider(label=translations["total_epoch"], info=translations["total_epoch_info"], minimum=1, maximum=10000, value=300, step=1, interactive=True)
70
- save_epochs = gr.Slider(label=translations["save_epoch"], info=translations["save_epoch_info"], minimum=1, maximum=10000, value=50, step=1, interactive=True)
71
- with gr.Column():
72
- index_button = gr.Button(f"3. {translations['create_index']}", variant="primary", scale=2)
73
- training_button = gr.Button(f"4. {translations['training_model']}", variant="primary", scale=2)
74
- with gr.Row():
75
- with gr.Accordion(label=translations["setting"], open=False):
76
- with gr.Row():
77
- index_algorithm = gr.Radio(label=translations["index_algorithm"], info=translations["index_algorithm_info"], choices=["Auto", "Faiss", "KMeans"], value="Auto", interactive=True)
78
- with gr.Row():
79
- cache_in_gpu = gr.Checkbox(label=translations["cache_in_gpu"], info=translations["cache_in_gpu_info"], value=True, interactive=True)
80
- rms_extract = gr.Checkbox(label=translations["train&energy"], info=translations["train&energy_info"], value=False, interactive=True)
81
- overtraining_detector = gr.Checkbox(label=translations["overtraining_detector"], info=translations["overtraining_detector_info"], value=False, interactive=True)
82
- with gr.Row():
83
- custom_dataset = gr.Checkbox(label=translations["custom_dataset"], info=translations["custom_dataset_info"], value=False, interactive=True)
84
- save_only_latest = gr.Checkbox(label=translations["save_only_latest"], info=translations["save_only_latest_info"], value=True, interactive=True)
85
- save_every_weights = gr.Checkbox(label=translations["save_every_weights"], info=translations["save_every_weights_info"], value=True, interactive=True)
86
- with gr.Row():
87
- clean_up = gr.Checkbox(label=translations["cleanup_training"], info=translations["cleanup_training_info"], value=False, interactive=True)
88
- not_use_pretrain = gr.Checkbox(label=translations["not_use_pretrain_2"], info=translations["not_use_pretrain_info"], value=False, interactive=True)
89
- custom_pretrain = gr.Checkbox(label=translations["custom_pretrain"], info=translations["custom_pretrain_info"], value=False, interactive=True)
90
- with gr.Column():
91
- dataset_path = gr.Textbox(label=translations["dataset_folder"], value="dataset", interactive=True, visible=custom_dataset.value)
92
- with gr.Column():
93
- with gr.Row(visible=False) as simple_option:
94
- chunk_len = gr.Slider(minimum=0.5, maximum=5.0, value=3.0, step=0.1, label=translations["chunk_length"], info=translations["chunk_length_info"], interactive=True)
95
- overlap_len = gr.Slider(minimum=0.0, maximum=0.4, value=0.3, step=0.1, label=translations["overlap_length"], info=translations["overlap_length_info"], interactive=True)
96
- threshold = gr.Slider(minimum=1, maximum=100, value=50, step=1, label=translations["threshold"], interactive=True, visible=overtraining_detector.value)
97
- with gr.Accordion(translations["setting_cpu_gpu"], open=False):
98
- with gr.Column():
99
- gpu_number = gr.Textbox(label=translations["gpu_number"], value=gpu_number_str(), info=translations["gpu_number_info"], interactive=True)
100
- gpu_info = gr.Textbox(label=translations["gpu_info"], value=get_gpu_info(), info=translations["gpu_info_2"], interactive=False)
101
- cpu_core = gr.Slider(label=translations["cpu_core"], info=translations["cpu_core_info"], minimum=1, maximum=os.cpu_count(), value=os.cpu_count(), step=1, interactive=True)
102
- train_batch_size = gr.Slider(label=translations["batch_size"], info=translations["batch_size_info"], minimum=1, maximum=64, value=8, step=1, interactive=True)
103
- with gr.Group():
104
- multiscale_mel_loss = gr.Checkbox(label=translations["multiscale_mel_loss"], info=translations["multiscale_mel_loss_info"], value=False, interactive=True)
105
- vocoders = gr.Radio(label=translations["vocoder"], info=translations["vocoder_info"], choices=["Default", "MRF-HiFi-GAN", "RefineGAN"], value="Default", interactive=True)
106
- with gr.Row():
107
- deterministic = gr.Checkbox(label=translations["deterministic"], info=translations["deterministic_info"], value=False, interactive=config.device.startswith("cuda"))
108
- benchmark = gr.Checkbox(label=translations["benchmark"], info=translations["benchmark_info"], value=False, interactive=config.device.startswith("cuda"))
109
- with gr.Row():
110
- optimizer = gr.Radio(label=translations["optimizer"], info=translations["optimizer_info"], value="AdamW", choices=["AdamW", "RAdam", "AnyPrecisionAdamW"], interactive=True)
111
- with gr.Row():
112
- model_author = gr.Textbox(label=translations["training_author"], info=translations["training_author_info"], value="", placeholder=translations["training_author"], interactive=True)
113
- with gr.Row():
114
- with gr.Column():
115
- with gr.Accordion(translations["custom_pretrain_info"], open=False, visible=custom_pretrain.value and not not_use_pretrain.value) as pretrain_setting:
116
- pretrained_D = gr.Dropdown(label=translations["pretrain_file"].format(dg="D"), choices=pretrainedD, value=pretrainedD[0] if len(pretrainedD) > 0 else '', interactive=True, allow_custom_value=True)
117
- pretrained_G = gr.Dropdown(label=translations["pretrain_file"].format(dg="G"), choices=pretrainedG, value=pretrainedG[0] if len(pretrainedG) > 0 else '', interactive=True, allow_custom_value=True)
118
- refresh_pretrain = gr.Button(translations["refresh"], scale=2)
119
- with gr.Row():
120
- training_info = gr.Textbox(label=translations["train_info"], value="", interactive=False, lines=3)
121
- with gr.Row():
122
- with gr.Column():
123
- with gr.Accordion(translations["export_model"], open=False):
124
- with gr.Row():
125
- model_file = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
126
- index_file = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
127
- with gr.Row():
128
- refresh_file = gr.Button(f"1. {translations['refresh']}", scale=2)
129
- zip_model = gr.Button(translations["zip_model"], variant="primary", scale=2)
130
- with gr.Row():
131
- zip_output = gr.File(label=translations["output_zip"], file_types=[".zip"], interactive=False, visible=False)
132
- with gr.Row():
133
- vocoders.change(fn=pitch_guidance_lock, inputs=[vocoders], outputs=[training_f0])
134
- training_f0.change(fn=vocoders_lock, inputs=[training_f0, vocoders], outputs=[vocoders])
135
- unlock_full_method4.change(fn=unlock_f0, inputs=[unlock_full_method4], outputs=[extract_method])
136
- with gr.Row():
137
- refresh_file.click(fn=change_models_choices, inputs=[], outputs=[model_file, index_file])
138
- zip_model.click(fn=zip_file, inputs=[training_name, model_file, index_file], outputs=[zip_output])
139
- dataset_path.change(fn=lambda folder: os.makedirs(folder, exist_ok=True), inputs=[dataset_path], outputs=[])
140
- with gr.Row():
141
- upload.change(fn=visible, inputs=[upload], outputs=[upload_dataset])
142
- overtraining_detector.change(fn=visible, inputs=[overtraining_detector], outputs=[threshold])
143
- clean_dataset.change(fn=visible, inputs=[clean_dataset], outputs=[clean_dataset_row])
144
- with gr.Row():
145
- custom_dataset.change(fn=lambda custom_dataset: [visible(custom_dataset), "dataset"],inputs=[custom_dataset], outputs=[dataset_path, dataset_path])
146
- training_ver.change(fn=unlock_vocoder, inputs=[training_ver, vocoders], outputs=[vocoders])
147
- vocoders.change(fn=unlock_ver, inputs=[training_ver, vocoders], outputs=[training_ver])
148
- with gr.Row():
149
- custom_reference.change(fn=visible, inputs=[custom_reference], outputs=[custom_reference_row])
150
- extract_method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[extract_method, extract_hybrid_method], outputs=[extract_hybrid_method, alpha, extract_hop_length])
151
- extract_hybrid_method.change(fn=hoplength_show, inputs=[extract_method, extract_hybrid_method], outputs=[extract_hop_length])
152
- with gr.Row():
153
- autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength])
154
- preprocess_split_audio_mode.change(fn=lambda a: visible(a == "Simple"), inputs=[preprocess_split_audio_mode], outputs=[simple_option])
155
- upload_dataset.upload(
156
- fn=lambda files, folder: [shutil_move(f.name, os.path.join(folder, os.path.split(f.name)[1])) for f in files] if folder != "" else gr_warning(translations["dataset_folder1"]),
157
- inputs=[upload_dataset, dataset_path],
158
- outputs=[],
159
- api_name="upload_dataset"
160
- )
161
- with gr.Row():
162
- not_use_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting])
163
- custom_pretrain.change(fn=lambda a, b: visible(a and not b), inputs=[custom_pretrain, not_use_pretrain], outputs=[pretrain_setting])
164
- refresh_pretrain.click(fn=change_pretrained_choices, inputs=[], outputs=[pretrained_D, pretrained_G])
165
- with gr.Row():
166
- preprocess_button.click(
167
- fn=preprocess,
168
- inputs=[
169
- training_name,
170
- training_sr,
171
- cpu_core,
172
- preprocess_split_audio_mode,
173
- process_effects,
174
- dataset_path,
175
- clean_dataset,
176
- clean_dataset_strength,
177
- chunk_len,
178
- overlap_len,
179
- preprocess_normalization_mode
180
- ],
181
- outputs=[preprocess_info],
182
- api_name="preprocess"
183
- )
184
- with gr.Row():
185
- embed_mode2.change(fn=change_embedders_mode, inputs=[embed_mode2], outputs=[extract_embedders])
186
- extract_embedders.change(fn=lambda extract_embedders: visible(extract_embedders == "custom"), inputs=[extract_embedders], outputs=[extract_embedders_custom])
187
- reference_refresh.click(fn=change_reference_choices, inputs=[], outputs=[reference_name])
188
- with gr.Row():
189
- extract_button.click(
190
- fn=extract,
191
- inputs=[
192
- training_name,
193
- training_ver,
194
- extract_method,
195
- training_f0,
196
- extract_hop_length,
197
- cpu_core,
198
- gpu_number,
199
- training_sr,
200
- extract_embedders,
201
- extract_embedders_custom,
202
- onnx_f0_mode2,
203
- embed_mode2,
204
- autotune,
205
- f0_autotune_strength,
206
- extract_hybrid_method,
207
- rms_extract,
208
- alpha
209
- ],
210
- outputs=[extract_info],
211
- api_name="extract"
212
- )
213
- with gr.Row():
214
- index_button.click(
215
- fn=create_index,
216
- inputs=[
217
- training_name,
218
- training_ver,
219
- index_algorithm
220
- ],
221
- outputs=[training_info],
222
- api_name="create_index"
223
- )
224
- with gr.Row():
225
- training_button.click(
226
- fn=training,
227
- inputs=[
228
- training_name,
229
- training_ver,
230
- save_epochs,
231
- save_only_latest,
232
- save_every_weights,
233
- total_epochs,
234
- training_sr,
235
- train_batch_size,
236
- gpu_number,
237
- training_f0,
238
- not_use_pretrain,
239
- custom_pretrain,
240
- pretrained_G,
241
- pretrained_D,
242
- overtraining_detector,
243
- threshold,
244
- clean_up,
245
- cache_in_gpu,
246
- model_author,
247
- vocoders,
248
- checkpointing1,
249
- deterministic,
250
- benchmark,
251
- optimizer,
252
- rms_extract,
253
- custom_reference,
254
- reference_name,
255
- multiscale_mel_loss
256
- ],
257
- outputs=[training_info],
258
- api_name="training_model"
259
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/tabs/training/training.py DELETED
@@ -1,25 +0,0 @@
1
- import os
2
- import sys
3
-
4
- import gradio as gr
5
-
6
- sys.path.append(os.getcwd())
7
-
8
- from main.app.variables import translations, configs
9
- from main.app.tabs.training.child.training import training_model_tab
10
- from main.app.tabs.training.child.create_dataset import create_dataset_tab
11
- from main.app.tabs.training.child.create_reference import create_reference_tab
12
-
13
- def training_tab():
14
- with gr.TabItem(translations["training_model"], visible=configs.get("create_and_training_tab", True)):
15
- with gr.TabItem(translations["createdataset"], visible=configs.get("create_dataset_tab", True)):
16
- gr.Markdown(translations["create_dataset_markdown"])
17
- create_dataset_tab()
18
-
19
- with gr.TabItem(translations["create_reference"], visible=configs.get("create_reference_tab", True)):
20
- gr.Markdown(translations["create_reference_markdown"])
21
- create_reference_tab()
22
-
23
- with gr.TabItem(translations["training_model"], visible=configs.get("training_tab", True)):
24
- gr.Markdown(f"## {translations['training_model']}")
25
- training_model_tab()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/app/variables.py DELETED
@@ -1,117 +0,0 @@
1
- import os
2
- import sys
3
- import csv
4
- import json
5
- import codecs
6
- import logging
7
- import urllib.request
8
- import logging.handlers
9
-
10
- sys.path.append(os.getcwd())
11
-
12
- from main.configs.config import Config
13
-
14
- logger = logging.getLogger(__name__)
15
- logger.propagate = False
16
-
17
- config = Config()
18
- python = sys.executable
19
- translations = config.translations
20
- configs_json = os.path.join("main", "configs", "config.json")
21
- configs = json.load(open(configs_json, "r"))
22
-
23
- if not logger.hasHandlers():
24
- console_handler = logging.StreamHandler()
25
- console_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
26
- console_handler.setFormatter(console_formatter)
27
- console_handler.setLevel(logging.DEBUG if config.debug_mode else logging.INFO)
28
- file_handler = logging.handlers.RotatingFileHandler(os.path.join(configs["logs_path"], "app.log"), maxBytes=5*1024*1024, backupCount=3, encoding='utf-8')
29
- file_formatter = logging.Formatter(fmt="\n%(asctime)s.%(msecs)03d | %(levelname)s | %(module)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
30
- file_handler.setFormatter(file_formatter)
31
- file_handler.setLevel(logging.DEBUG)
32
- logger.addHandler(console_handler)
33
- logger.addHandler(file_handler)
34
- logger.setLevel(logging.DEBUG)
35
-
36
- if config.device in ["cpu", "mps", "ocl:0"] and configs.get("fp16", False):
37
- logger.warning(translations["fp16_not_support"])
38
- configs["fp16"] = config.is_half = False
39
-
40
- with open(configs_json, "w") as f:
41
- json.dump(configs, f, indent=4)
42
-
43
- models = {}
44
- model_options = {}
45
-
46
- method_f0 = ["mangio-crepe-full", "crepe-full", "fcpe", "rmvpe", "harvest", "pyin", "hybrid"]
47
- method_f0_full = ["pm-ac", "pm-cc", "pm-shs", "dio", "mangio-crepe-tiny", "mangio-crepe-small", "mangio-crepe-medium", "mangio-crepe-large", "mangio-crepe-full", "crepe-tiny", "crepe-small", "crepe-medium", "crepe-large", "crepe-full", "fcpe", "fcpe-legacy", "fcpe-previous", "rmvpe", "rmvpe-clipping", "rmvpe-medfilt", "rmvpe-clipping-medfilt", "harvest", "yin", "pyin", "swipe", "piptrack", "penn", "mangio-penn", "djcm", "djcm-clipping", "djcm-medfilt", "djcm-clipping-medfilt", "swift", "pesto", "hybrid"]
48
- hybrid_f0_method = ["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"]
49
-
50
- embedders_mode = ["fairseq", "onnx", "transformers", "spin", "whisper"]
51
- embedders_model = ["contentvec_base", "hubert_base", "vietnamese_hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base", "portuguese_hubert_base", "custom"]
52
- spin_model = ["spin-v1", "spin-v2"]
53
- whisper_model = ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"]
54
-
55
- paths_for_files = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(configs["audios_path"]) for f in files if os.path.splitext(f)[1].lower() in (".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3")])
56
- reference_list = sorted([name for name in os.listdir(configs["reference_path"]) if os.path.exists(os.path.join(configs["reference_path"], name)) and os.path.isdir(os.path.join(configs["reference_path"], name))])
57
- model_name = sorted(list(model for model in os.listdir(configs["weights_path"]) if model.endswith((".pth", ".onnx")) and not model.startswith("G_") and not model.startswith("D_")))
58
- index_path = sorted([os.path.join(root, name) for root, _, files in os.walk(configs["logs_path"], topdown=False) for name in files if name.endswith(".index") and "trained" not in name])
59
-
60
- pretrainedD = [model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "D" in model]
61
- pretrainedG = [model for model in os.listdir(configs["pretrained_custom_path"]) if model.endswith(".pth") and "G" in model]
62
-
63
- presets_file = sorted(list(f for f in os.listdir(configs["presets_path"]) if f.endswith(".conversion.json")))
64
- audio_effect_presets_file = sorted(list(f for f in os.listdir(configs["presets_path"]) if f.endswith(".effect.json")))
65
- f0_file = sorted([os.path.abspath(os.path.join(root, f)) for root, _, files in os.walk(configs["f0_path"]) for f in files if f.endswith(".txt")])
66
-
67
- file_types = [".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"]
68
- export_format_choices = ["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"]
69
-
70
- language = configs.get("language", "vi-VN")
71
- theme = configs.get("theme", "NoCrypt/miku")
72
-
73
- edgetts = configs.get("edge_tts", ["vi-VN-HoaiMyNeural", "vi-VN-NamMinhNeural"])
74
- google_tts_voice = configs.get("google_tts_voice", ["vi", "en"])
75
-
76
- vr_models = configs.get("vr_models", "")
77
- demucs_models = configs.get("demucs_models", "")
78
- mdx_models = configs.get("mdx_models", "")
79
- karaoke_models = configs.get("karaoke_models", "")
80
- reverb_models = configs.get("reverb_models", "")
81
- denoise_models = configs.get("denoise_models", "")
82
- uvr_model = list(demucs_models.keys()) + list(vr_models.keys()) + list(mdx_models.keys())
83
-
84
- font = configs.get("font", "https://fonts.googleapis.com/css2?family=Courgette&display=swap")
85
- sample_rate_choice = [8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000, 96000]
86
- csv_path = configs["csv_path"]
87
-
88
- if "--allow_all_disk" in sys.argv and sys.platform == "win32":
89
- try:
90
- import win32api
91
- except:
92
- os.system(f"{python} -m pip install pywin32")
93
- import win32api
94
-
95
- allow_disk = win32api.GetLogicalDriveStrings().split('\x00')[:-1]
96
- else: allow_disk = []
97
-
98
- try:
99
- if os.path.exists(csv_path): reader = list(csv.DictReader(open(csv_path, newline='', encoding='utf-8')))
100
- else:
101
- reader = list(csv.DictReader([line.decode('utf-8') for line in urllib.request.urlopen(codecs.decode("uggcf://qbpf.tbbtyr.pbz/fcernqfurrgf/q/1gNHnDeRULtEfz1Yieaw14USUQjWJy0Oq9k0DrCrjApb/rkcbeg?sbezng=pfi&tvq=1977693859", "rot13")).readlines()]))
102
- writer = csv.DictWriter(open(csv_path, mode='w', newline='', encoding='utf-8'), fieldnames=reader[0].keys())
103
- writer.writeheader()
104
- writer.writerows(reader)
105
-
106
- for row in reader:
107
- filename = row['Filename']
108
- url = None
109
-
110
- for value in row.values():
111
- if isinstance(value, str) and "huggingface" in value:
112
- url = value
113
- break
114
-
115
- if url: models[filename] = url
116
- except:
117
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/configs/config.json DELETED
@@ -1,622 +0,0 @@
1
- {
2
- "language": "vi-VN",
3
- "support_language": [
4
- "en-US",
5
- "vi-VN"
6
- ],
7
- "theme": "NoCrypt/miku",
8
- "themes": [
9
- "NoCrypt/miku",
10
- "gstaff/xkcd",
11
- "JohnSmith9982/small_and_pretty",
12
- "ParityError/Interstellar",
13
- "earneleh/paris",
14
- "shivi/calm_seafoam",
15
- "Hev832/Applio",
16
- "YTheme/Minecraft",
17
- "gstaff/sketch",
18
- "SebastianBravo/simci_css",
19
- "allenai/gradio-theme",
20
- "Nymbo/Nymbo_Theme_5",
21
- "lone17/kotaemon",
22
- "Zarkel/IBM_Carbon_Theme",
23
- "SherlockRamos/Feliz",
24
- "freddyaboulton/dracula_revamped",
25
- "freddyaboulton/bad-theme-space",
26
- "gradio/dracula_revamped",
27
- "abidlabs/dracula_revamped",
28
- "gradio/dracula_test",
29
- "gradio/seafoam",
30
- "gradio/glass",
31
- "gradio/monochrome",
32
- "gradio/soft",
33
- "gradio/default",
34
- "gradio/base",
35
- "abidlabs/pakistan",
36
- "dawood/microsoft_windows",
37
- "ysharma/steampunk",
38
- "ysharma/huggingface",
39
- "abidlabs/Lime",
40
- "freddyaboulton/this-theme-does-not-exist-2",
41
- "aliabid94/new-theme",
42
- "aliabid94/test2",
43
- "aliabid94/test3",
44
- "aliabid94/test4",
45
- "abidlabs/banana",
46
- "freddyaboulton/test-blue",
47
- "gstaff/whiteboard",
48
- "ysharma/llamas",
49
- "abidlabs/font-test",
50
- "YenLai/Superhuman",
51
- "bethecloud/storj_theme",
52
- "sudeepshouche/minimalist",
53
- "knotdgaf/gradiotest",
54
- "ParityError/Anime",
55
- "Ajaxon6255/Emerald_Isle",
56
- "ParityError/LimeFace",
57
- "finlaymacklon/smooth_slate",
58
- "finlaymacklon/boxy_violet",
59
- "derekzen/stardust",
60
- "EveryPizza/Cartoony-Gradio-Theme",
61
- "Ifeanyi/Cyanister",
62
- "Tshackelton/IBMPlex-DenseReadable",
63
- "snehilsanyal/scikit-learn",
64
- "Himhimhim/xkcd",
65
- "nota-ai/theme",
66
- "rawrsor1/Everforest",
67
- "rottenlittlecreature/Moon_Goblin",
68
- "abidlabs/test-yellow",
69
- "abidlabs/test-yellow3",
70
- "idspicQstitho/dracula_revamped",
71
- "kfahn/AnimalPose",
72
- "HaleyCH/HaleyCH_Theme",
73
- "simulKitke/dracula_test",
74
- "braintacles/CrimsonNight",
75
- "wentaohe/whiteboardv2",
76
- "reilnuud/polite",
77
- "remilia/Ghostly",
78
- "Franklisi/darkmode",
79
- "coding-alt/soft",
80
- "xiaobaiyuan/theme_land",
81
- "step-3-profit/Midnight-Deep",
82
- "xiaobaiyuan/theme_demo",
83
- "Taithrah/Minimal",
84
- "Insuz/SimpleIndigo",
85
- "zkunn/Alipay_Gradio_theme",
86
- "Insuz/Mocha",
87
- "xiaobaiyuan/theme_brief",
88
- "Ama434/434-base-Barlow",
89
- "Ama434/def_barlow",
90
- "Ama434/neutral-barlow",
91
- "dawood/dracula_test",
92
- "nuttea/Softblue",
93
- "BlueDancer/Alien_Diffusion",
94
- "naughtondale/monochrome",
95
- "Dagfinn1962/standard",
96
- "default"
97
- ],
98
- "mdx_models": {
99
- "Main_340": "UVR-MDX-NET_Main_340.onnx",
100
- "Main_390": "UVR-MDX-NET_Main_390.onnx",
101
- "Main_406": "UVR-MDX-NET_Main_406.onnx",
102
- "Main_427": "UVR-MDX-NET_Main_427.onnx",
103
- "Main_438": "UVR-MDX-NET_Main_438.onnx",
104
- "Inst_full_292": "UVR-MDX-NET-Inst_full_292.onnx",
105
- "Inst_HQ_1": "UVR-MDX-NET-Inst_HQ_1.onnx",
106
- "Inst_HQ_2": "UVR-MDX-NET-Inst_HQ_2.onnx",
107
- "Inst_HQ_3": "UVR-MDX-NET-Inst_HQ_3.onnx",
108
- "Inst_HQ_4": "UVR-MDX-NET-Inst_HQ_4.onnx",
109
- "Inst_HQ_5": "UVR-MDX-NET-Inst_HQ_5.onnx",
110
- "Kim_Vocal_1": "Kim_Vocal_1.onnx",
111
- "Kim_Vocal_2": "Kim_Vocal_2.onnx",
112
- "Kim_Inst": "Kim_Inst.onnx",
113
- "Inst_187_beta": "UVR-MDX-NET_Inst_187_beta.onnx",
114
- "Inst_82_beta": "UVR-MDX-NET_Inst_82_beta.onnx",
115
- "Inst_90_beta": "UVR-MDX-NET_Inst_90_beta.onnx",
116
- "Voc_FT": "UVR-MDX-NET-Voc_FT.onnx",
117
- "Crowd_HQ": "UVR-MDX-NET_Crowd_HQ_1.onnx",
118
- "MDXNET_9482": "UVR_MDXNET_9482.onnx",
119
- "Inst_1": "UVR-MDX-NET-Inst_1.onnx",
120
- "Inst_2": "UVR-MDX-NET-Inst_2.onnx",
121
- "Inst_3": "UVR-MDX-NET-Inst_3.onnx",
122
- "MDXNET_1_9703": "UVR_MDXNET_1_9703.onnx",
123
- "MDXNET_2_9682": "UVR_MDXNET_2_9682.onnx",
124
- "MDXNET_3_9662": "UVR_MDXNET_3_9662.onnx",
125
- "Inst_Main": "UVR-MDX-NET-Inst_Main.onnx",
126
- "MDXNET_Main": "UVR_MDXNET_Main.onnx"
127
- },
128
- "demucs_models": {
129
- "HT-Tuned": "htdemucs_ft.yaml",
130
- "HT-Normal": "htdemucs.yaml",
131
- "HD_MMI": "hdemucs_mmi.yaml",
132
- "HT_6S": "htdemucs_6s.yaml"
133
- },
134
- "vr_models": {
135
- "HP-1": "1_HP-UVR.pth",
136
- "HP-2": "2_HP-UVR.pth",
137
- "HP-Vocal-1": "3_HP-Vocal-UVR.pth",
138
- "HP-Vocal-2": "4_HP-Vocal-UVR.pth",
139
- "HP2-1": "7_HP2-UVR.pth",
140
- "HP2-2": "8_HP2-UVR.pth",
141
- "HP2-3": "9_HP2-UVR.pth",
142
- "SP-2B-1": "10_SP-UVR-2B-32000-1.pth",
143
- "SP-2B-2": "11_SP-UVR-2B-32000-2.pth",
144
- "SP-3B-1": "12_SP-UVR-3B-44100.pth",
145
- "SP-4B-1": "13_SP-UVR-4B-44100-1.pth",
146
- "SP-4B-2": "14_SP-UVR-4B-44100-2.pth",
147
- "SP-MID-1": "15_SP-UVR-MID-44100-1.pth",
148
- "SP-MID-2": "16_SP-UVR-MID-44100-2.pth"
149
- },
150
- "karaoke_models": {
151
- "MDX-Version-1": "UVR_MDXNET_KARA.onnx",
152
- "MDX-Version-2": "UVR_MDXNET_KARA_2.onnx",
153
- "VR-Version-1": "5_HP-Karaoke-UVR.pth",
154
- "VR-Version-2": "6_HP-Karaoke-UVR.pth"
155
- },
156
- "reverb_models": {
157
- "MDX-Reverb": "Reverb_HQ_By_FoxJoy.onnx",
158
- "VR-Reverb": "UVR-DeEcho-DeReverb.pth",
159
- "Echo-Aggressive": "UVR-De-Echo-Aggressive.pth",
160
- "Echo-Normal": "UVR-De-Echo-Normal.pth"
161
- },
162
- "denoise_models": {
163
- "Lite": "UVR-DeNoise-Lite.pth",
164
- "Normal": "UVR-DeNoise.pth"
165
- },
166
- "edge_tts": [
167
- "af-ZA-AdriNeural",
168
- "af-ZA-WillemNeural",
169
- "sq-AL-AnilaNeural",
170
- "sq-AL-IlirNeural",
171
- "am-ET-AmehaNeural",
172
- "am-ET-MekdesNeural",
173
- "ar-DZ-AminaNeural",
174
- "ar-DZ-IsmaelNeural",
175
- "ar-BH-AliNeural",
176
- "ar-BH-LailaNeural",
177
- "ar-EG-SalmaNeural",
178
- "ar-EG-ShakirNeural",
179
- "ar-IQ-BasselNeural",
180
- "ar-IQ-RanaNeural",
181
- "ar-JO-SanaNeural",
182
- "ar-JO-TaimNeural",
183
- "ar-KW-FahedNeural",
184
- "ar-KW-NouraNeural",
185
- "ar-LB-LaylaNeural",
186
- "ar-LB-RamiNeural",
187
- "ar-LY-ImanNeural",
188
- "ar-LY-OmarNeural",
189
- "ar-MA-JamalNeural",
190
- "ar-MA-MounaNeural",
191
- "ar-OM-AbdullahNeural",
192
- "ar-OM-AyshaNeural",
193
- "ar-QA-AmalNeural",
194
- "ar-QA-MoazNeural",
195
- "ar-SA-HamedNeural",
196
- "ar-SA-ZariyahNeural",
197
- "ar-SY-AmanyNeural",
198
- "ar-SY-LaithNeural",
199
- "ar-TN-HediNeural",
200
- "ar-TN-ReemNeural",
201
- "ar-AE-FatimaNeural",
202
- "ar-AE-HamdanNeural",
203
- "ar-YE-MaryamNeural",
204
- "ar-YE-SalehNeural",
205
- "az-AZ-BabekNeural",
206
- "az-AZ-BanuNeural",
207
- "bn-BD-NabanitaNeural",
208
- "bn-BD-PradeepNeural",
209
- "bn-IN-BashkarNeural",
210
- "bn-IN-TanishaaNeural",
211
- "bs-BA-GoranNeural",
212
- "bs-BA-VesnaNeural",
213
- "bg-BG-BorislavNeural",
214
- "bg-BG-KalinaNeural",
215
- "my-MM-NilarNeural",
216
- "my-MM-ThihaNeural",
217
- "ca-ES-EnricNeural",
218
- "ca-ES-JoanaNeural",
219
- "zh-HK-HiuGaaiNeural",
220
- "zh-HK-HiuMaanNeural",
221
- "zh-HK-WanLungNeural",
222
- "zh-CN-XiaoxiaoNeural",
223
- "zh-CN-XiaoyiNeural",
224
- "zh-CN-YunjianNeural",
225
- "zh-CN-YunxiNeural",
226
- "zh-CN-YunxiaNeural",
227
- "zh-CN-YunyangNeural",
228
- "zh-CN-liaoning-XiaobeiNeural",
229
- "zh-TW-HsiaoChenNeural",
230
- "zh-TW-YunJheNeural",
231
- "zh-TW-HsiaoYuNeural",
232
- "zh-CN-shaanxi-XiaoniNeural",
233
- "hr-HR-GabrijelaNeural",
234
- "hr-HR-SreckoNeural",
235
- "cs-CZ-AntoninNeural",
236
- "cs-CZ-VlastaNeural",
237
- "da-DK-ChristelNeural",
238
- "da-DK-JeppeNeural",
239
- "nl-BE-ArnaudNeural",
240
- "nl-BE-DenaNeural",
241
- "nl-NL-ColetteNeural",
242
- "nl-NL-FennaNeural",
243
- "nl-NL-MaartenNeural",
244
- "en-AU-NatashaNeural",
245
- "en-AU-WilliamNeural",
246
- "en-CA-ClaraNeural",
247
- "en-CA-LiamNeural",
248
- "en-HK-SamNeural",
249
- "en-HK-YanNeural",
250
- "en-IN-NeerjaExpressiveNeural",
251
- "en-IN-NeerjaNeural",
252
- "en-IN-PrabhatNeural",
253
- "en-IE-ConnorNeural",
254
- "en-IE-EmilyNeural",
255
- "en-KE-AsiliaNeural",
256
- "en-KE-ChilembaNeural",
257
- "en-NZ-MitchellNeural",
258
- "en-NZ-MollyNeural",
259
- "en-NG-AbeoNeural",
260
- "en-NG-EzinneNeural",
261
- "en-PH-JamesNeural",
262
- "en-PH-RosaNeural",
263
- "en-SG-LunaNeural",
264
- "en-SG-WayneNeural",
265
- "en-ZA-LeahNeural",
266
- "en-ZA-LukeNeural",
267
- "en-TZ-ElimuNeural",
268
- "en-TZ-ImaniNeural",
269
- "en-GB-LibbyNeural",
270
- "en-GB-MaisieNeural",
271
- "en-GB-RyanNeural",
272
- "en-GB-SoniaNeural",
273
- "en-GB-ThomasNeural",
274
- "en-US-AvaMultilingualNeural",
275
- "en-US-AndrewMultilingualNeural",
276
- "en-US-EmmaMultilingualNeural",
277
- "en-US-BrianMultilingualNeural",
278
- "en-US-AvaNeural",
279
- "en-US-AndrewNeural",
280
- "en-US-EmmaNeural",
281
- "en-US-BrianNeural",
282
- "en-US-AnaNeural",
283
- "en-US-AriaNeural",
284
- "en-US-ChristopherNeural",
285
- "en-US-EricNeural",
286
- "en-US-GuyNeural",
287
- "en-US-JennyNeural",
288
- "en-US-MichelleNeural",
289
- "en-US-RogerNeural",
290
- "en-US-SteffanNeural",
291
- "et-EE-AnuNeural",
292
- "et-EE-KertNeural",
293
- "fil-PH-AngeloNeural",
294
- "fil-PH-BlessicaNeural",
295
- "fi-FI-HarriNeural",
296
- "fi-FI-NooraNeural",
297
- "fr-BE-CharlineNeural",
298
- "fr-BE-GerardNeural",
299
- "fr-CA-ThierryNeural",
300
- "fr-CA-AntoineNeural",
301
- "fr-CA-JeanNeural",
302
- "fr-CA-SylvieNeural",
303
- "fr-FR-VivienneMultilingualNeural",
304
- "fr-FR-RemyMultilingualNeural",
305
- "fr-FR-DeniseNeural",
306
- "fr-FR-EloiseNeural",
307
- "fr-FR-HenriNeural",
308
- "fr-CH-ArianeNeural",
309
- "fr-CH-FabriceNeural",
310
- "gl-ES-RoiNeural",
311
- "gl-ES-SabelaNeural",
312
- "ka-GE-EkaNeural",
313
- "ka-GE-GiorgiNeural",
314
- "de-AT-IngridNeural",
315
- "de-AT-JonasNeural",
316
- "de-DE-SeraphinaMultilingualNeural",
317
- "de-DE-FlorianMultilingualNeural",
318
- "de-DE-AmalaNeural",
319
- "de-DE-ConradNeural",
320
- "de-DE-KatjaNeural",
321
- "de-DE-KillianNeural",
322
- "de-CH-JanNeural",
323
- "de-CH-LeniNeural",
324
- "el-GR-AthinaNeural",
325
- "el-GR-NestorasNeural",
326
- "gu-IN-DhwaniNeural",
327
- "gu-IN-NiranjanNeural",
328
- "he-IL-AvriNeural",
329
- "he-IL-HilaNeural",
330
- "hi-IN-MadhurNeural",
331
- "hi-IN-SwaraNeural",
332
- "hu-HU-NoemiNeural",
333
- "hu-HU-TamasNeural",
334
- "is-IS-GudrunNeural",
335
- "is-IS-GunnarNeural",
336
- "id-ID-ArdiNeural",
337
- "id-ID-GadisNeural",
338
- "ga-IE-ColmNeural",
339
- "ga-IE-OrlaNeural",
340
- "it-IT-GiuseppeNeural",
341
- "it-IT-DiegoNeural",
342
- "it-IT-ElsaNeural",
343
- "it-IT-IsabellaNeural",
344
- "ja-JP-KeitaNeural",
345
- "ja-JP-NanamiNeural",
346
- "jv-ID-DimasNeural",
347
- "jv-ID-SitiNeural",
348
- "kn-IN-GaganNeural",
349
- "kn-IN-SapnaNeural",
350
- "kk-KZ-AigulNeural",
351
- "kk-KZ-DauletNeural",
352
- "km-KH-PisethNeural",
353
- "km-KH-SreymomNeural",
354
- "ko-KR-HyunsuNeural",
355
- "ko-KR-InJoonNeural",
356
- "ko-KR-SunHiNeural",
357
- "lo-LA-ChanthavongNeural",
358
- "lo-LA-KeomanyNeural",
359
- "lv-LV-EveritaNeural",
360
- "lv-LV-NilsNeural",
361
- "lt-LT-LeonasNeural",
362
- "lt-LT-OnaNeural",
363
- "mk-MK-AleksandarNeural",
364
- "mk-MK-MarijaNeural",
365
- "ms-MY-OsmanNeural",
366
- "ms-MY-YasminNeural",
367
- "ml-IN-MidhunNeural",
368
- "ml-IN-SobhanaNeural",
369
- "mt-MT-GraceNeural",
370
- "mt-MT-JosephNeural",
371
- "mr-IN-AarohiNeural",
372
- "mr-IN-ManoharNeural",
373
- "mn-MN-BataaNeural",
374
- "mn-MN-YesuiNeural",
375
- "ne-NP-HemkalaNeural",
376
- "ne-NP-SagarNeural",
377
- "nb-NO-FinnNeural",
378
- "nb-NO-PernilleNeural",
379
- "ps-AF-GulNawazNeural",
380
- "ps-AF-LatifaNeural",
381
- "fa-IR-DilaraNeural",
382
- "fa-IR-FaridNeural",
383
- "pl-PL-MarekNeural",
384
- "pl-PL-ZofiaNeural",
385
- "pt-BR-ThalitaNeural",
386
- "pt-BR-AntonioNeural",
387
- "pt-BR-FranciscaNeural",
388
- "pt-PT-DuarteNeural",
389
- "pt-PT-RaquelNeural",
390
- "ro-RO-AlinaNeural",
391
- "ro-RO-EmilNeural",
392
- "ru-RU-DmitryNeural",
393
- "ru-RU-SvetlanaNeural",
394
- "sr-RS-NicholasNeural",
395
- "sr-RS-SophieNeural",
396
- "si-LK-SameeraNeural",
397
- "si-LK-ThiliniNeural",
398
- "sk-SK-LukasNeural",
399
- "sk-SK-ViktoriaNeural",
400
- "sl-SI-PetraNeural",
401
- "sl-SI-RokNeural",
402
- "so-SO-MuuseNeural",
403
- "so-SO-UbaxNeural",
404
- "es-AR-ElenaNeural",
405
- "es-AR-TomasNeural",
406
- "es-BO-MarceloNeural",
407
- "es-BO-SofiaNeural",
408
- "es-CL-CatalinaNeural",
409
- "es-CL-LorenzoNeural",
410
- "es-ES-XimenaNeural",
411
- "es-CO-GonzaloNeural",
412
- "es-CO-SalomeNeural",
413
- "es-CR-JuanNeural",
414
- "es-CR-MariaNeural",
415
- "es-CU-BelkysNeural",
416
- "es-CU-ManuelNeural",
417
- "es-DO-EmilioNeural",
418
- "es-DO-RamonaNeural",
419
- "es-EC-AndreaNeural",
420
- "es-EC-LuisNeural",
421
- "es-SV-LorenaNeural",
422
- "es-SV-RodrigoNeural",
423
- "es-GQ-JavierNeural",
424
- "es-GQ-TeresaNeural",
425
- "es-GT-AndresNeural",
426
- "es-GT-MartaNeural",
427
- "es-HN-CarlosNeural",
428
- "es-HN-KarlaNeural",
429
- "es-MX-DaliaNeural",
430
- "es-MX-JorgeNeural",
431
- "es-NI-FedericoNeural",
432
- "es-NI-YolandaNeural",
433
- "es-PA-MargaritaNeural",
434
- "es-PA-RobertoNeural",
435
- "es-PY-MarioNeural",
436
- "es-PY-TaniaNeural",
437
- "es-PE-AlexNeural",
438
- "es-PE-CamilaNeural",
439
- "es-PR-KarinaNeural",
440
- "es-PR-VictorNeural",
441
- "es-ES-AlvaroNeural",
442
- "es-ES-ElviraNeural",
443
- "es-US-AlonsoNeural",
444
- "es-US-PalomaNeural",
445
- "es-UY-MateoNeural",
446
- "es-UY-ValentinaNeural",
447
- "es-VE-PaolaNeural",
448
- "es-VE-SebastianNeural",
449
- "su-ID-JajangNeural",
450
- "su-ID-TutiNeural",
451
- "sw-KE-RafikiNeural",
452
- "sw-KE-ZuriNeural",
453
- "sw-TZ-DaudiNeural",
454
- "sw-TZ-RehemaNeural",
455
- "sv-SE-MattiasNeural",
456
- "sv-SE-SofieNeural",
457
- "ta-IN-PallaviNeural",
458
- "ta-IN-ValluvarNeural",
459
- "ta-MY-KaniNeural",
460
- "ta-MY-SuryaNeural",
461
- "ta-SG-AnbuNeural",
462
- "ta-SG-VenbaNeural",
463
- "ta-LK-KumarNeural",
464
- "ta-LK-SaranyaNeural",
465
- "te-IN-MohanNeural",
466
- "te-IN-ShrutiNeural",
467
- "th-TH-NiwatNeural",
468
- "th-TH-PremwadeeNeural",
469
- "tr-TR-AhmetNeural",
470
- "tr-TR-EmelNeural",
471
- "uk-UA-OstapNeural",
472
- "uk-UA-PolinaNeural",
473
- "ur-IN-GulNeural",
474
- "ur-IN-SalmanNeural",
475
- "ur-PK-AsadNeural",
476
- "ur-PK-UzmaNeural",
477
- "uz-UZ-MadinaNeural",
478
- "uz-UZ-SardorNeural",
479
- "vi-VN-HoaiMyNeural",
480
- "vi-VN-NamMinhNeural",
481
- "cy-GB-AledNeural",
482
- "cy-GB-NiaNeural",
483
- "zu-ZA-ThandoNeural",
484
- "zu-ZA-ThembaNeural"
485
- ],
486
- "google_tts_voice": [
487
- "af",
488
- "am",
489
- "ar",
490
- "bg",
491
- "bn",
492
- "bs",
493
- "ca",
494
- "cs",
495
- "cy",
496
- "da",
497
- "de",
498
- "el",
499
- "en",
500
- "es",
501
- "et",
502
- "eu",
503
- "fi",
504
- "fr",
505
- "fr-CA",
506
- "gl",
507
- "gu",
508
- "ha",
509
- "hi",
510
- "hr",
511
- "hu",
512
- "id",
513
- "is",
514
- "it",
515
- "iw",
516
- "ja",
517
- "jw",
518
- "km",
519
- "kn",
520
- "ko",
521
- "la",
522
- "lt",
523
- "lv",
524
- "ml",
525
- "mr",
526
- "ms",
527
- "my",
528
- "ne",
529
- "nl",
530
- "no",
531
- "pa",
532
- "pl",
533
- "pt",
534
- "pt-PT",
535
- "ro",
536
- "ru",
537
- "si",
538
- "sk",
539
- "sq",
540
- "sr",
541
- "su",
542
- "sv",
543
- "sw",
544
- "ta",
545
- "te",
546
- "th",
547
- "tl",
548
- "tr",
549
- "uk",
550
- "ur",
551
- "vi",
552
- "yue",
553
- "zh-CN",
554
- "zh-TW",
555
- "zh"
556
- ],
557
- "fp16": false,
558
- "editing_tab": true,
559
- "inference_tab": true,
560
- "create_and_training_tab": true,
561
- "extra_tab": true,
562
- "separator_tab": true,
563
- "convert_tab": true,
564
- "convert_with_whisper": true,
565
- "tts_tab": true,
566
- "effects_tab": true,
567
- "quirk": true,
568
- "create_dataset_tab": true,
569
- "training_tab": true,
570
- "fushion_tab": true,
571
- "read_tab": true,
572
- "onnx_tab": true,
573
- "downloads_tab": true,
574
- "f0_extractor_tab": true,
575
- "settings_tab": true,
576
- "create_srt_tab": true,
577
- "realtime_tab": true,
578
- "realtime_client_tab": true,
579
- "create_reference_tab": true,
580
- "font": "https://fonts.googleapis.com/css2?family=Roboto&display=swap",
581
- "app_port": 7860,
582
- "tensorboard_port": 6870,
583
- "num_of_restart": 5,
584
- "server_name": "0.0.0.0",
585
- "app_show_error": true,
586
- "delete_exists_file": false,
587
- "audio_effects_path": "main/inference/audio_effects.py",
588
- "convert_path": "main/inference/conversion/convert.py",
589
- "separate_path": "main/inference/separate_music.py",
590
- "create_dataset_path": "main/inference/create_dataset.py",
591
- "preprocess_path": "main/inference/preprocess/preprocess.py",
592
- "extract_path": "main/inference/extracting/extract.py",
593
- "create_index_path": "main/inference/create_index.py",
594
- "train_path": "main/inference/training/train.py",
595
- "create_reference_path": "main/inference/create_reference.py",
596
- "ico_path": "assets/ico.png",
597
- "csv_path": "assets/spreadsheet.csv",
598
- "weights_path": "assets/weights",
599
- "logs_path": "assets/logs",
600
- "binary_path": "assets/binary",
601
- "f0_path": "assets/f0",
602
- "language_path": "assets/languages",
603
- "presets_path": "assets/presets",
604
- "embedders_path": "assets/models/embedders",
605
- "predictors_path": "assets/models/predictors",
606
- "pretrained_custom_path": "assets/models/pretrained_custom",
607
- "pretrained_v1_path": "assets/models/pretrained_v1",
608
- "pretrained_v2_path": "assets/models/pretrained_v2",
609
- "speaker_diarization_path": "assets/models/speaker_diarization",
610
- "uvr5_path": "assets/models/uvr5",
611
- "audios_path": "audios",
612
- "reference_path": "assets/logs/reference",
613
- "demucs_segments_enable": true,
614
- "demucs_cpu_mode": false,
615
- "limit_f0": 8,
616
- "debug_mode": false,
617
- "pretrain_verify_shape": true,
618
- "pretrain_strict": true,
619
- "cpu_mode": false,
620
- "brain": false,
621
- "discord_presence": true
622
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/configs/config.py DELETED
@@ -1,131 +0,0 @@
1
- import os
2
- import sys
3
- import json
4
- import torch
5
- import onnxruntime
6
-
7
- sys.path.append(os.getcwd())
8
-
9
- from main.library.backends import directml, opencl, zluda
10
-
11
- version_config_paths = [os.path.join(version, size) for version in ["v1", "v2"] for size in ["32000.json", "40000.json", "48000.json"]]
12
-
13
- def singleton(cls):
14
- instances = {}
15
-
16
- def get_instance(*args, **kwargs):
17
- if cls not in instances: instances[cls] = cls(*args, **kwargs)
18
- return instances[cls]
19
-
20
- return get_instance
21
-
22
- @singleton
23
- class Config:
24
- def __init__(self):
25
- self.configs_path = os.path.join("main", "configs", "config.json")
26
- self.configs = json.load(open(self.configs_path, "r"))
27
-
28
- self.cpu_mode = self.configs.get("cpu_mode", False)
29
- self.brain = self.configs.get("brain", False)
30
- self.debug_mode = self.configs.get("debug_mode", False)
31
-
32
- self.json_config = self.load_config_json()
33
- self.translations = self.multi_language()
34
-
35
- self.gpu_mem = None
36
- self.per_preprocess = 3.7
37
- self.device = self.get_default_device()
38
- self.providers = self.get_providers()
39
- self.is_half = self.is_fp16()
40
- self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
41
-
42
- def multi_language(self):
43
- try:
44
- lang = self.configs.get("language", "vi-VN")
45
- if len([l for l in os.listdir(self.configs["language_path"]) if l.endswith(".json")]) < 1: raise FileNotFoundError("Không tìm thấy bất cứ gói ngôn ngữ nào(No package languages found)")
46
-
47
- if not lang: lang = "vi-VN"
48
- if lang not in self.configs["support_language"]: raise ValueError("Ngôn ngữ không được hỗ trợ(Language not supported)")
49
-
50
- lang_path = os.path.join(self.configs["language_path"], f"{lang}.json")
51
- if not os.path.exists(lang_path): lang_path = os.path.join(self.configs["language_path"], "vi-VN.json")
52
-
53
- with open(lang_path, encoding="utf-8") as f:
54
- translations = json.load(f)
55
- except json.JSONDecodeError:
56
- print(self.translations["empty_json"].format(file=lang))
57
- pass
58
-
59
- return translations
60
-
61
- def is_fp16(self):
62
- fp16 = self.configs.get("fp16", False)
63
-
64
- if self.device in ["cpu", "mps"] and fp16:
65
- self.configs["fp16"] = False
66
- fp16 = False
67
-
68
- with open(self.configs_path, "w") as f:
69
- json.dump(self.configs, f, indent=4)
70
-
71
- if not fp16: self.per_preprocess = 3.0
72
- return fp16
73
-
74
- def load_config_json(self):
75
- configs = {}
76
-
77
- for config_file in version_config_paths:
78
- try:
79
- with open(os.path.join("main", "configs", config_file), "r") as f:
80
- configs[config_file] = json.load(f)
81
- except json.JSONDecodeError:
82
- print(self.translations["empty_json"].format(file=config_file))
83
- pass
84
-
85
- return configs
86
-
87
- def device_config(self):
88
- if self.gpu_mem is not None and self.gpu_mem <= 4:
89
- self.per_preprocess = 3.0
90
- return 1, 5, 30, 32
91
-
92
- return (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
93
-
94
- def get_default_device(self):
95
- if not self.cpu_mode:
96
- if torch.cuda.is_available():
97
- device = "cuda:0"
98
- self.gpu_mem = torch.cuda.get_device_properties(int(device.split(":")[-1])).total_memory // (1024**3)
99
- elif directml.is_available():
100
- device = "privateuseone:0"
101
- elif opencl.is_available():
102
- device = "ocl:0"
103
- elif torch.backends.mps.is_available():
104
- device = "mps"
105
- else:
106
- device = "cpu"
107
- else:
108
- torch.cuda.is_available = lambda : False
109
- directml.is_available = lambda : False
110
- opencl.is_available = lambda : False
111
- torch.backends.mps.is_available = lambda : False
112
-
113
- device = "cpu"
114
-
115
- return device
116
-
117
- def get_providers(self):
118
- ort_providers = onnxruntime.get_available_providers()
119
-
120
- if "CUDAExecutionProvider" in ort_providers and self.device.startswith("cuda"):
121
- providers = ["CUDAExecutionProvider"]
122
- elif "ROCMExecutionProvider" in ort_providers and self.device.startswith("cuda"):
123
- providers = ["ROCMExecutionProvider"]
124
- elif "DmlExecutionProvider" in ort_providers and self.device.startswith(("ocl", "privateuseone")):
125
- providers = ["DmlExecutionProvider"]
126
- elif "CoreMLExecutionProvider" in ort_providers and self.device.startswith("mps"):
127
- providers = ["CoreMLExecutionProvider"]
128
- else:
129
- providers = ["CPUExecutionProvider"]
130
-
131
- return providers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/configs/rpc.py DELETED
@@ -1,78 +0,0 @@
1
- import os
2
- import sys
3
- import json
4
- import time
5
- import struct
6
- import codecs
7
-
8
- sys.path.append(os.getcwd())
9
-
10
- from main.app.variables import translations
11
-
12
- CLIENT_ID = "1392816674159202396"
13
-
14
- def create_payload(opcode, payload):
15
- data = json.dumps(payload).encode("utf-8")
16
-
17
- return struct.pack(
18
- "<I",
19
- opcode
20
- ) + struct.pack(
21
- "<I",
22
- len(data)
23
- ) + data
24
-
25
- def connect_discord_ipc():
26
- try:
27
- return open(
28
- r"\\?\pipe\discord-ipc-0",
29
- "r+b",
30
- buffering=0
31
- )
32
- except Exception:
33
- return None
34
-
35
- def send_discord_rpc(pipe):
36
- pipe.write(
37
- create_payload(
38
- 0, {
39
- "v": 1,
40
- "client_id": CLIENT_ID
41
- }
42
- )
43
- )
44
-
45
- pipe.read(8)
46
- pipe.read(
47
- struct.unpack(
48
- "<I",
49
- pipe.read(4)
50
- )[0]
51
- )
52
-
53
- pipe.write(
54
- create_payload(
55
- 1, {
56
- "cmd": "SET_ACTIVITY",
57
- "args": {
58
- "pid": os.getpid(),
59
- "activity": {
60
- "buttons": [{
61
- "label": "Github",
62
- "url": codecs.decode("uggcf://tvguho.pbz/CunzUhlauNau16/Ivrganzrfr-EIP", "rot13")
63
- }],
64
- "details": translations["details"],
65
- "timestamps": {
66
- "start": int(
67
- time.time()
68
- )
69
- },
70
- "state": translations["use"]
71
- }
72
- },
73
- "nonce": str(
74
- time.time()
75
- )
76
- }
77
- )
78
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/configs/v1/32000.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 0.0001,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-09,
9
- "batch_size": 4,
10
- "lr_decay": 0.999875,
11
- "segment_size": 12800,
12
- "init_lr_ratio": 1,
13
- "warmup_epochs": 0,
14
- "c_mel": 45,
15
- "c_kl": 1.0
16
- },
17
- "data": {
18
- "max_wav_value": 32768.0,
19
- "sample_rate": 32000,
20
- "filter_length": 1024,
21
- "hop_length": 320,
22
- "win_length": 1024,
23
- "n_mel_channels": 80,
24
- "mel_fmin": 0.0,
25
- "mel_fmax": null
26
- },
27
- "model": {
28
- "inter_channels": 192,
29
- "hidden_channels": 192,
30
- "filter_channels": 768,
31
- "text_enc_hidden_dim": 256,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3, 7, 11],
38
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
39
- "upsample_rates": [10, 4, 2, 2, 2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [16, 16, 4, 4, 4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/configs/v1/40000.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 0.0001,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-09,
9
- "batch_size": 4,
10
- "lr_decay": 0.999875,
11
- "segment_size": 12800,
12
- "init_lr_ratio": 1,
13
- "warmup_epochs": 0,
14
- "c_mel": 45,
15
- "c_kl": 1.0
16
- },
17
- "data": {
18
- "max_wav_value": 32768.0,
19
- "sample_rate": 40000,
20
- "filter_length": 2048,
21
- "hop_length": 400,
22
- "win_length": 2048,
23
- "n_mel_channels": 125,
24
- "mel_fmin": 0.0,
25
- "mel_fmax": null
26
- },
27
- "model": {
28
- "inter_channels": 192,
29
- "hidden_channels": 192,
30
- "filter_channels": 768,
31
- "text_enc_hidden_dim": 256,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3, 7, 11],
38
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
39
- "upsample_rates": [10, 10, 2, 2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [16, 16, 4, 4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/configs/v1/48000.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 0.0001,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-09,
9
- "batch_size": 4,
10
- "lr_decay": 0.999875,
11
- "segment_size": 11520,
12
- "init_lr_ratio": 1,
13
- "warmup_epochs": 0,
14
- "c_mel": 45,
15
- "c_kl": 1.0
16
- },
17
- "data": {
18
- "max_wav_value": 32768.0,
19
- "sample_rate": 48000,
20
- "filter_length": 2048,
21
- "hop_length": 480,
22
- "win_length": 2048,
23
- "n_mel_channels": 128,
24
- "mel_fmin": 0.0,
25
- "mel_fmax": null
26
- },
27
- "model": {
28
- "inter_channels": 192,
29
- "hidden_channels": 192,
30
- "filter_channels": 768,
31
- "text_enc_hidden_dim": 256,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3, 7, 11],
38
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
39
- "upsample_rates": [10, 6, 2, 2, 2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [16, 16, 4, 4, 4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/configs/v2/32000.json DELETED
@@ -1,42 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "learning_rate": 0.0001,
6
- "betas": [0.8, 0.99],
7
- "eps": 1e-09,
8
- "lr_decay": 0.999875,
9
- "segment_size": 12800,
10
- "c_mel": 45,
11
- "c_kl": 1.0
12
- },
13
- "data": {
14
- "max_wav_value": 32768.0,
15
- "sample_rate": 32000,
16
- "filter_length": 1024,
17
- "hop_length": 320,
18
- "win_length": 1024,
19
- "n_mel_channels": 80,
20
- "mel_fmin": 0.0,
21
- "mel_fmax": null
22
- },
23
- "model": {
24
- "inter_channels": 192,
25
- "hidden_channels": 192,
26
- "filter_channels": 768,
27
- "text_enc_hidden_dim": 768,
28
- "n_heads": 2,
29
- "n_layers": 6,
30
- "kernel_size": 3,
31
- "p_dropout": 0,
32
- "resblock": "1",
33
- "resblock_kernel_sizes": [3, 7, 11],
34
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
35
- "upsample_rates": [10, 8, 2, 2],
36
- "upsample_initial_channel": 512,
37
- "upsample_kernel_sizes": [20, 16, 4, 4],
38
- "use_spectral_norm": false,
39
- "gin_channels": 256,
40
- "spk_embed_dim": 109
41
- }
42
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main/configs/v2/40000.json DELETED
@@ -1,42 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "learning_rate": 0.0001,
6
- "betas": [0.8, 0.99],
7
- "eps": 1e-09,
8
- "lr_decay": 0.999875,
9
- "segment_size": 12800,
10
- "c_mel": 45,
11
- "c_kl": 1.0
12
- },
13
- "data": {
14
- "max_wav_value": 32768.0,
15
- "sample_rate": 40000,
16
- "filter_length": 2048,
17
- "hop_length": 400,
18
- "win_length": 2048,
19
- "n_mel_channels": 125,
20
- "mel_fmin": 0.0,
21
- "mel_fmax": null
22
- },
23
- "model": {
24
- "inter_channels": 192,
25
- "hidden_channels": 192,
26
- "filter_channels": 768,
27
- "text_enc_hidden_dim": 768,
28
- "n_heads": 2,
29
- "n_layers": 6,
30
- "kernel_size": 3,
31
- "p_dropout": 0,
32
- "resblock": "1",
33
- "resblock_kernel_sizes": [3, 7, 11],
34
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
35
- "upsample_rates": [10, 10, 2, 2],
36
- "upsample_initial_channel": 512,
37
- "upsample_kernel_sizes": [16, 16, 4, 4],
38
- "use_spectral_norm": false,
39
- "gin_channels": 256,
40
- "spk_embed_dim": 109
41
- }
42
- }