Commit
·
7f167fb
1
Parent(s):
dc868ec
Add sox and libsox-dev to requirements.txt
Browse files- .gitmodules +3 -0
- config.py +7 -4
- packages.txt +2 -0
- requirements.txt +25 -3
- tabs/audios/events.py +168 -22
- tabs/audios/load_models.py +25 -4
- tabs/audios/modules/CosyVoice +1 -0
- tabs/audios/ui.py +15 -19
.gitmodules
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[submodule "tabs/audios/modules/CosyVoice"]
|
| 2 |
+
path = tabs/audios/modules/CosyVoice
|
| 3 |
+
url = https://github.com/FunAudioLLM/CosyVoice.git
|
config.py
CHANGED
|
@@ -4,8 +4,10 @@ import json
|
|
| 4 |
import torch
|
| 5 |
|
| 6 |
|
| 7 |
-
# Setup
|
| 8 |
-
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
css = """
|
|
@@ -32,9 +34,10 @@ body {
|
|
| 32 |
class Config:
|
| 33 |
# General
|
| 34 |
SECRET_KEY = os.environ.get('SECRET_KEY', '12345678')
|
|
|
|
|
|
|
| 35 |
|
| 36 |
# Images
|
| 37 |
-
# IMAGE_MODELS = ["black-forest-labs/FLUX.1-dev", "stabilityai/stable-diffusion-xl-base-1.0"]
|
| 38 |
IMAGES_MODELS = [{"repo_id": "black-forest-labs/FLUX.1-dev", "loader": "flux", "compute_type": torch.bfloat16,}, {"repo_id": "stabilityai/stable-diffusion-xl-base-1.0", "loader": "sdxl", "compute_type": torch.float16,}]
|
| 39 |
with open('data/loras/sdxl.json') as f:
|
| 40 |
IMAGES_LORAS_SDXL = json.load(f)
|
|
@@ -80,4 +83,4 @@ class Config:
|
|
| 80 |
|
| 81 |
|
| 82 |
# Audios
|
| 83 |
-
AUDIOS_MODELS = [
|
|
|
|
| 4 |
import torch
|
| 5 |
|
| 6 |
|
| 7 |
+
# Setup Repo
|
| 8 |
+
|
| 9 |
+
# Audios
|
| 10 |
+
# Expose the Matcha-TTS sources bundled with the CosyVoice submodule
# (tabs/audios/modules/CosyVoice/third_party/Matcha-TTS) through PYTHONPATH.
# config.py lives at the repository root, so the path must descend through tabs/audios.
# NOTE(review): PYTHONPATH is only read at interpreter start-up, so this affects child
# processes only; the running process needs the directory on sys.path as well.
_matcha_tts_dir = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'tabs', 'audios', 'modules', 'CosyVoice', 'third_party', 'Matcha-TTS',
)
# Skip an unset/empty PYTHONPATH so no empty entry (i.e. the current directory) is added.
os.environ['PYTHONPATH'] = os.pathsep.join(
    entry for entry in (_matcha_tts_dir, os.environ.get('PYTHONPATH', '')) if entry
)
|
| 11 |
|
| 12 |
|
| 13 |
css = """
|
|
|
|
| 34 |
class Config:
|
| 35 |
# General
|
| 36 |
SECRET_KEY = os.environ.get('SECRET_KEY', '12345678')
|
| 37 |
+
MODEL_DOWNLOAD_DIR = os.environ.get('HF_HOME', os.environ.get('HF_HUB_CACHE', '/.cache'))
|
| 38 |
+
os.makedirs(MODEL_DOWNLOAD_DIR, exist_ok=True)
|
| 39 |
|
| 40 |
# Images
|
|
|
|
| 41 |
IMAGES_MODELS = [{"repo_id": "black-forest-labs/FLUX.1-dev", "loader": "flux", "compute_type": torch.bfloat16,}, {"repo_id": "stabilityai/stable-diffusion-xl-base-1.0", "loader": "sdxl", "compute_type": torch.float16,}]
|
| 42 |
with open('data/loras/sdxl.json') as f:
|
| 43 |
IMAGES_LORAS_SDXL = json.load(f)
|
|
|
|
| 83 |
|
| 84 |
|
| 85 |
# Audios
|
| 86 |
+
AUDIOS_MODELS = []
|
packages.txt
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
ffmpeg
|
| 2 |
libgl1-mesa-glx
|
|
|
|
|
|
|
|
|
| 1 |
ffmpeg
|
| 2 |
libgl1-mesa-glx
|
| 3 |
+
sox
|
| 4 |
+
libsox-dev
|
requirements.txt
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
|
| 2 |
spaces
|
| 3 |
gradio
|
| 4 |
torch
|
|
@@ -15,7 +14,6 @@ mediapipe
|
|
| 15 |
controlnet_aux
|
| 16 |
insightface
|
| 17 |
omegaconf
|
| 18 |
-
git+https://github.com/TencentARC/PhotoMaker.git
|
| 19 |
torchao
|
| 20 |
git+https://github.com/xhinker/sd_embed.git
|
| 21 |
clip_interrogator
|
|
@@ -24,4 +22,28 @@ git+https://github.com/TencentARC/GFPGAN.git
|
|
| 24 |
git+https://github.com/xinntao/Real-ESRGAN.git
|
| 25 |
aura_sr
|
| 26 |
deepfilternet
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
spaces
|
| 2 |
gradio
|
| 3 |
torch
|
|
|
|
| 14 |
controlnet_aux
|
| 15 |
insightface
|
| 16 |
omegaconf
|
|
|
|
| 17 |
torchao
|
| 18 |
git+https://github.com/xhinker/sd_embed.git
|
| 19 |
clip_interrogator
|
|
|
|
| 22 |
git+https://github.com/xinntao/Real-ESRGAN.git
|
| 23 |
aura_sr
|
| 24 |
deepfilternet
|
| 25 |
+
conformer
|
| 26 |
+
deepspeed
|
| 27 |
+
gdown
|
| 28 |
+
grpcio
|
| 29 |
+
grpcio-tools
|
| 30 |
+
hydra-core
|
| 31 |
+
HyperPyYAML
|
| 32 |
+
inflect
|
| 33 |
+
librosa
|
| 34 |
+
lightning
|
| 35 |
+
matplotlib
|
| 36 |
+
modelscope
|
| 37 |
+
networkx
|
| 38 |
+
onnx
|
| 39 |
+
openai-whisper
|
| 40 |
+
protobuf
|
| 41 |
+
pydantic
|
| 42 |
+
rich
|
| 43 |
+
soundfile
|
| 44 |
+
tensorboard
|
| 45 |
+
WeTextProcessing
|
| 46 |
+
wget
|
| 47 |
+
fastapi-cli
|
| 48 |
+
spacy
|
| 49 |
+
spacy_langdetect
|
tabs/audios/events.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import gc
|
| 3 |
import tempfile
|
|
@@ -5,11 +6,13 @@ from uuid import uuid4
|
|
| 5 |
|
| 6 |
import spaces
|
| 7 |
import gradio as gr
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
from df.enhance import enhance, load_audio, save_audio
|
| 10 |
|
| 11 |
from config import Config
|
| 12 |
from .load_models import *
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
# Helper functions
|
|
@@ -17,6 +20,103 @@ def create_temp_file():
|
|
| 17 |
return tempfile.NamedTemporaryFile(delete=False)
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
@spaces.GPU(duration=10)
|
| 21 |
def clear_audio(audio: np.ndarray):
|
| 22 |
# Save the audio file
|
|
@@ -36,30 +136,76 @@ def clear_audio(audio: np.ndarray):
|
|
| 36 |
|
| 37 |
|
| 38 |
@spaces.GPU(duration=20)
|
| 39 |
-
def gen_audio(
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
|
| 52 |
# Generate the audio
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
return gr.update( # output_audio
|
| 64 |
-
value=
|
| 65 |
)
|
|
|
|
| 1 |
+
import re
|
| 2 |
import os
|
| 3 |
import gc
|
| 4 |
import tempfile
|
|
|
|
| 6 |
|
| 7 |
import spaces
|
| 8 |
import gradio as gr
|
| 9 |
+
import torchaudio
|
| 10 |
import numpy as np
|
| 11 |
from df.enhance import enhance, load_audio, save_audio
|
| 12 |
|
| 13 |
from config import Config
|
| 14 |
from .load_models import *
|
| 15 |
+
from .modules.CosyVoice.cosyvoice.utils.file_utils import load_wav
|
| 16 |
|
| 17 |
|
| 18 |
# Helper functions
|
|
|
|
| 20 |
return tempfile.NamedTemporaryFile(delete=False)
|
| 21 |
|
| 22 |
|
| 23 |
+
|
| 24 |
+
def assign_language_tags(text):
    """Prefix every run of Chinese/English/Japanese/Korean characters with a language tag.

    Each maximal run of characters from one script is prefixed with ``<|zh|>``,
    ``<|en|>``, ``<|jp|>`` or ``<|ko|>``. The first Chinese run is tagged
    ``<|zh|>``; every later Chinese run is tagged ``<|yue|>`` (Cantonese).
    Characters outside these scripts (spaces, digits, punctuation) are copied
    through unchanged.

    Example:
        input:  你好 Hello こんにちは 你好 안녕하세요
        output: <|zh|>你好 <|en|>Hello <|jp|>こんにちは <|yue|>你好 <|ko|>안녕하세요
    """
    # Character classes per language; they are mutually disjoint, so a single
    # alternation finds the same runs as scanning with each pattern separately.
    patterns = {
        'zh': r'[\u4e00-\u9fff]+',  # Chinese characters
        'en': r'[a-zA-Z]+',  # English letters
        'jp': r'[\u3040-\u30ff\u31f0-\u31ff]+',  # Japanese characters
        'ko': r'[\uac00-\ud7a3]+',  # Korean characters
    }
    script_runs = re.compile(
        '|'.join(f'(?P<{lang}>{pattern})' for lang, pattern in patterns.items())
    )

    pieces = []
    cursor = 0
    seen_chinese = False
    for run in script_runs.finditer(text):
        # Untagged text between the previous run and this one.
        pieces.append(text[cursor:run.start()])

        tag = run.lastgroup
        if tag == 'zh':
            if seen_chinese:
                tag = 'yue'
            seen_chinese = True

        pieces.append(f'<|{tag}|>{run.group()}')
        cursor = run.end()

    # Trailing text after the last tagged run.
    pieces.append(text[cursor:])
    return ''.join(pieces)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def update_mode(mode, sft_speaker, speaker_audio, voice_instructions):
    """Show only the inputs that the selected generation mode uses.

    Args:
        mode: One of 'SFT', 'VC', 'VC-CrossLingual' or 'Instruct'.
        sft_speaker: Current speaker selection (unused; received from the event inputs).
        speaker_audio: Current uploaded audio (unused; received from the event inputs).
        voice_instructions: Current instructions text (unused; received from the event inputs).

    Returns:
        A 3-tuple of ``gr.update`` objects for
        (sft_speaker, speaker_audio, voice_instructions), each setting ``visible``.

    Raises:
        gr.Error: If ``mode`` is not one of the supported modes.
    """
    # mode -> visibility of (sft_speaker, speaker_audio, voice_instructions)
    visibility = {
        # SFT must explicitly re-show the speaker selector; otherwise it stays
        # hidden after switching back from a voice-clone mode.
        'SFT': (True, False, False),
        'VC': (False, True, True),
        'VC-CrossLingual': (False, True, False),
        'Instruct': (True, False, True),
    }

    if mode not in visibility:
        raise gr.Error('Invalid mode')

    return tuple(gr.update(visible=shown) for shown in visibility[mode])
|
| 118 |
+
|
| 119 |
+
|
| 120 |
@spaces.GPU(duration=10)
|
| 121 |
def clear_audio(audio: np.ndarray):
|
| 122 |
# Save the audio file
|
|
|
|
| 136 |
|
| 137 |
|
| 138 |
@spaces.GPU(duration=20)
|
| 139 |
+
def gen_audio(text, mode, sft_speaker = None, speaker_audio = None, voice_instructions = None):
    """Synthesize speech for ``text`` with the CosyVoice model that matches ``mode``.

    Args:
        text: Text to speak. Language tags are added with ``assign_language_tags``.
        mode: 'SFT', 'VC', 'VC-CrossLingual' or 'Instruct'.
        sft_speaker: Speaker id for the 'SFT' and 'Instruct' modes.
        speaker_audio: Reference voice for the 'VC' modes. Assumed to be the
            ``(sample_rate, samples)`` pair produced by ``gr.Audio(type='numpy')``
            -- TODO confirm against the UI component.
        voice_instructions: Instruction text for 'Instruct'; for 'VC' it is passed
            as the prompt text of the reference audio.

    Returns:
        A ``gr.update`` whose value is the path of the generated WAV file.

    Raises:
        gr.Error: If the mode is unknown or an input required by the mode is missing.
    """
    # Local import: only torchaudio/numpy are imported explicitly at file level.
    import torch

    # The voice-clone modes need the uploaded reference audio as a 16 kHz prompt.
    prompt_speech_16k = None
    if mode in ('VC', 'VC-CrossLingual'):
        if speaker_audio is None:
            raise gr.Error('Please upload an audio')
        prompt_speech_16k = _speaker_audio_to_prompt(speaker_audio)

    # Assign language tags
    text = assign_language_tags(text)

    # Generate the audio.
    # Model choice follows the CosyVoice README: the SFT checkpoint serves
    # inference_sft, the base checkpoint serves zero-shot and cross-lingual cloning.
    if mode == 'SFT':
        if not sft_speaker:
            raise gr.Error('Please select a speaker')
        chunks = cv_sft.inference_sft(
            tts_text=text,
            spk_id=sft_speaker,
        )
    elif mode == 'VC':
        chunks = cv_base.inference_zero_shot(
            tts_text=text,
            # NOTE(review): zero-shot expects the transcript of the reference audio here;
            # an empty string is passed when the user left the box blank -- confirm.
            prompt_text=voice_instructions or '',
            prompt_speech_16k=prompt_speech_16k,
        )
    elif mode == 'VC-CrossLingual':
        chunks = cv_base.inference_cross_lingual(
            tts_text=text,
            prompt_speech_16k=prompt_speech_16k,
        )
    elif mode == 'Instruct':
        if not sft_speaker:
            raise gr.Error('Please select a speaker')
        if not voice_instructions:
            raise gr.Error('Please enter voice instructions')
        chunks = cv_instruct.inference_instruct(
            tts_text=text,
            spk_id=sft_speaker,
            instruct_text=voice_instructions,
        )
    else:
        raise gr.Error('Invalid mode')

    # The inference calls yield the speech in pieces; join them so the output
    # holds the whole utterance instead of only the last piece.
    speech = [chunk['tts_speech'] for chunk in chunks]
    if not speech:
        raise gr.Error('No audio was generated')

    # A ".wav" suffix lets torchaudio infer the container format from the path.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as out_file:
        out_path = out_file.name
    torchaudio.save(
        out_path,
        torch.cat(speech, dim=1),  # assumes each piece is (channels, samples) -- TODO confirm
        22050,
    )

    return gr.update( # output_audio
        value=out_path,
    )


def _speaker_audio_to_prompt(speaker_audio):
    """Convert an uploaded ``(sample_rate, samples)`` pair into a 16 kHz prompt via ``load_wav``."""
    import torch

    sample_rate, samples = speaker_audio
    samples = np.asarray(samples)
    waveform = torch.from_numpy(samples.astype(np.float32))
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM -> float in [-1, 1].
        waveform = waveform / float(np.iinfo(samples.dtype).max)
    # torchaudio expects (channels, samples); assumes input is (samples,) or (samples, channels).
    waveform = waveform.unsqueeze(0) if waveform.dim() == 1 else waveform.t()

    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as prompt_file:
        prompt_path = prompt_file.name
    try:
        torchaudio.save(prompt_path, waveform, sample_rate)
        # NOTE(review): load_wav is expected to resample to 16 kHz; verify it accepts
        # recordings whose sample rate is below 16 kHz.
        return load_wav(prompt_path, 16000)
    finally:
        os.remove(prompt_path)
|
tabs/audios/load_models.py
CHANGED
|
@@ -1,17 +1,38 @@
|
|
|
|
|
|
|
|
| 1 |
import torch
|
| 2 |
from df.enhance import init_df
|
| 3 |
-
from
|
| 4 |
|
| 5 |
from config import Config
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
def init_sys():
|
| 9 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 10 |
|
|
|
|
| 11 |
df_model, df_state, _ = init_df()
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
return device, df_model, df_state,
|
| 16 |
|
| 17 |
-
device, df_model, df_state,
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
import torch
|
| 4 |
from df.enhance import init_df
|
| 5 |
+
from modelscope import snapshot_download
|
| 6 |
|
| 7 |
from config import Config
|
| 8 |
+
from .modules.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice
|
| 9 |
|
| 10 |
|
| 11 |
def init_sys():
    """Load every model used by the audio tab.

    Downloads the CosyVoice checkpoints into ``Config.MODEL_DOWNLOAD_DIR/audios``
    and loads them from that same location.

    Returns:
        Tuple ``(device, df_model, df_state, cv_base, cv_sft, sft_speakers, cv_instruct)``.
    """
    import sys

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load DeepFilterNet2
    df_model, df_state, _ = init_df()

    # Download CosyVoice models
    audios_dir = os.path.join(Config.MODEL_DOWNLOAD_DIR, 'audios')
    model_dirs = {}
    for name in ('CosyVoice-300M', 'CosyVoice-300M-SFT', 'CosyVoice-300M-Instruct', 'CosyVoice-ttsfrd'):
        model_dirs[name] = os.path.join(audios_dir, name)
        snapshot_download(f'iic/{name}', local_dir=model_dirs[name])

    # Add `tabs/audios/modules/CosyVoice/third_party/Matcha-TTS` to your `PYTHONPATH`
    matcha_dir = f'{os.path.dirname(__file__)}/modules/CosyVoice/third_party/Matcha-TTS'
    os.environ['PYTHONPATH'] = f'{matcha_dir}:{os.environ.get("PYTHONPATH", "")}'
    # PYTHONPATH is only read at interpreter start-up; the running process needs
    # the directory on sys.path for the import to resolve.
    if matcha_dir not in sys.path:
        sys.path.insert(0, matcha_dir)

    # Load the checkpoints from the directories they were downloaded to
    # (the previous 'pretrained_models/...' paths were never populated).

    # Load CosyVoice TTS
    cv_base = CosyVoice(model_dirs['CosyVoice-300M'])

    # Load CosyVoice SFT
    cv_sft = CosyVoice(model_dirs['CosyVoice-300M-SFT'])
    sft_speakers = cv_sft.list_avaliable_spks()  # spelling as in the original call; presumably the upstream API name

    # Load CosyVoice Instruct
    cv_instruct = CosyVoice(model_dirs['CosyVoice-300M-Instruct'])

    return device, df_model, df_state, cv_base, cv_sft, sft_speakers, cv_instruct
|
| 37 |
|
| 38 |
+
device, df_model, df_state, cv_base, cv_sft, sft_speakers, cv_instruct = init_sys()
|
tabs/audios/modules/CosyVoice
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit c901a12789e0a9d8cec54c3caf1bc304533bdf82
|
tabs/audios/ui.py
CHANGED
|
@@ -10,30 +10,19 @@ def audio_tab():
|
|
| 10 |
with gr.Group():
|
| 11 |
with gr.Group():
|
| 12 |
text = gr.Textbox(lines=5, label="Enter text")
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
)
|
| 18 |
-
|
| 19 |
-
with gr.Accordion('Voice Clone', open=True):
|
| 20 |
-
speaker_audio = gr.Audio(label="Upload Audio", type='numpy')
|
| 21 |
clear_speaker_audio = gr.Button(label="Clear Audio")
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
with gr.Column():
|
| 24 |
output_audio = gr.Audio(label="Output Audio", interactive=False, show_download_button=True)
|
| 25 |
clear_output_audio = gr.Button(label="Clear Audio")
|
| 26 |
generate_audio = gr.Button(label="Generate Audio")
|
| 27 |
-
|
| 28 |
-
with gr.Accordion('Advance Settings', open=True):
|
| 29 |
-
settings = [
|
| 30 |
-
('Alpha', 'tts_alpha', 'float', 0.0, 1.0, 0.3, 0.1,),
|
| 31 |
-
('Beta', 'tts_beta', 'float', 0.0, 1.0, 0.7, 0.1,),
|
| 32 |
-
('Diffusion Steps', 'tts_diffusion_steps', 'int', 1, 100, 10, 1,),
|
| 33 |
-
('Embedding Scale', 'tts_embedding_scale', 'int', 0, 10, 1, 1,),
|
| 34 |
-
]
|
| 35 |
-
for label, key, type_, min_, max_, value, step in settings:
|
| 36 |
-
globals()[key] = gr.Slider(label=label, minimum=min_, maximum=max_, value=value, step=step)
|
| 37 |
|
| 38 |
|
| 39 |
# Events
|
|
@@ -41,9 +30,16 @@ def audio_tab():
|
|
| 41 |
clear_speaker_audio.click(clear_audio, speaker_audio, speaker_audio)
|
| 42 |
clear_output_audio.click(clear_audio, output_audio, output_audio)
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# Generate Audio
|
| 45 |
generate_audio.click(
|
| 46 |
gen_audio,
|
| 47 |
-
[text,
|
| 48 |
[output_audio]
|
| 49 |
)
|
|
|
|
| 10 |
with gr.Group():
|
| 11 |
with gr.Group():
|
| 12 |
text = gr.Textbox(lines=5, label="Enter text")
|
| 13 |
+
mode = gr.Radio(["SFT", "VC", "VC-CrossLingual", "Instruct"], label="Mode", value="SFT",) # automate with speech recognition pipeline
|
| 14 |
+
sft_speaker = gr.Radio(sft_speakers, label="Select speaker")
|
| 15 |
+
with gr.Accordion('Voice Clone', open=False):
|
| 16 |
+
speaker_audio = gr.Audio(label="Upload Audio", type='numpy', visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
clear_speaker_audio = gr.Button(label="Clear Audio")
|
| 18 |
+
|
| 19 |
+
with gr.Accordion('Instruct', open=False):
|
| 20 |
+
voice_instructions = gr.Textbox(lines=5, label="Enter voice instructions", visible=False)
|
| 21 |
|
| 22 |
with gr.Column():
|
| 23 |
output_audio = gr.Audio(label="Output Audio", interactive=False, show_download_button=True)
|
| 24 |
clear_output_audio = gr.Button(label="Clear Audio")
|
| 25 |
generate_audio = gr.Button(label="Generate Audio")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
# Events
|
|
|
|
| 30 |
clear_speaker_audio.click(clear_audio, speaker_audio, speaker_audio)
|
| 31 |
clear_output_audio.click(clear_audio, output_audio, output_audio)
|
| 32 |
|
| 33 |
+
# Mode
|
| 34 |
+
mode.change(
|
| 35 |
+
update_mode,
|
| 36 |
+
[mode, sft_speaker, speaker_audio, voice_instructions],
|
| 37 |
+
[sft_speaker, speaker_audio, voice_instructions]
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
# Generate Audio
|
| 41 |
generate_audio.click(
|
| 42 |
gen_audio,
|
| 43 |
+
[text, mode, sft_speaker, speaker_audio, voice_instructions],
|
| 44 |
[output_audio]
|
| 45 |
)
|