import base64
import json
import os
import random
import re
import tempfile
import uuid
import zipfile
from os.path import abspath

import gradio as gr

import xtts


# The warm-up request at startup only runs when DO_CHECK is "1" (the default).
DO_CHECK = os.getenv('DO_CHECK', '1')
# Root folder for the cloned-speaker JSON files and the generated audios.
OUTPUT = "./demo_outputs"
# Maps a cloned speaker name to the embeddings loaded from / saved to its JSON file.
cloned_speakers = {}

print("Preparing file structure...")
_CLONED_SPEAKERS_DIR = os.path.join(OUTPUT, "cloned_speakers")
if os.path.isdir(_CLONED_SPEAKERS_DIR):
    print("Loading existing cloned speakers...")
    for file in os.listdir(_CLONED_SPEAKERS_DIR):
        if file.endswith(".json"):
            with open(os.path.join(_CLONED_SPEAKERS_DIR, file), "r") as fp:
                # The file name without ".json" is the speaker name.
                cloned_speakers[file[:-5]] = json.load(fp)
    print("Available cloned speakers:", ", ".join(cloned_speakers.keys()))

# makedirs(exist_ok=True) also repairs a partially created tree (for example
# OUTPUT present but one of its sub-folders missing), which the previous
# "create everything only when OUTPUT is absent" logic left broken.
os.makedirs(_CLONED_SPEAKERS_DIR, exist_ok=True)
os.makedirs(os.path.join(OUTPUT, "generated_audios"), exist_ok=True)

AUDIOS_DIR = os.path.join("demo_outputs", "generated_audios")
ZIP_DIR = os.path.join("zip_outputs")

print("Checking zip at", ZIP_DIR)
os.makedirs(ZIP_DIR, exist_ok=True)


# Fetch the language list and the studio speaker table from the XTTS server.
# Both are needed to build the UI, so any failure here aborts start-up.
try:
    print("Getting metadata from server ...")
    LANUGAGES = xtts.get_languages()
    print("Available languages:", ", ".join(LANUGAGES))
    STUDIO_SPEAKERS = xtts.get_speakers()
    print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys()))
except Exception as err:
    # "except Exception" lets Ctrl+C / SystemExit through, and "from err"
    # keeps the real cause visible in the traceback.
    raise Exception("Please make sure the server is running first.") from err


def ExtractVars(input_string):
    """Split a text part into its ``!key = value`` directives and its spoken text.

    Args:
        input_string: One text part, possibly spanning several lines.

    Returns:
        A ``(variables, text)`` tuple. ``variables`` starts as
        ``{'prefix': None, 'name': '', 'speaker': None, 'num': None}`` and
        receives every directive found (unknown keys are stored as well).
        ``text`` is the remaining non-blank lines joined with ``'\\n'``.
    """
    result_dict = {
        'prefix': None,
        'name': '',
        'speaker': None,
        'num': None,
    }
    filtered_lines = []

    for line in input_string.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue

        if not stripped.startswith('!'):
            # Regular text: kept exactly as written (not stripped).
            filtered_lines.append(line)
            continue

        # Split on the FIRST '=' only, so a value may itself contain '='.
        key, separator, value = stripped[1:].partition('=')
        if not separator:
            # A '!' line without '=' is not a directive; it is dropped.
            continue
        result_dict[key.strip()] = value.strip()

    return result_dict, '\n'.join(filtered_lines)


def ParsePronucs(PronuncStr):
    """Parse the pronunciation-fix text into a list of replacement rules.

    Each non-blank line has the form ``WORD = SPOKEN TEXT`` optionally
    followed by ``| opt1,opt2``. Lines without ``=`` or with an empty word
    are ignored instead of raising.

    Args:
        PronuncStr: Raw multi-line content of the pronunciation field
            (``None`` or an empty string yields no rules).

    Returns:
        A list of ``{'word': str, 'text': str, 'opts': list[str]}`` dicts,
        in the order the lines appear.
    """
    PronuncWords = []
    if not PronuncStr:
        return PronuncWords

    for line in PronuncStr.split('\n'):
        word, separator, rest = line.strip().partition('=')
        word = word.strip()
        if not separator or not word:
            # Blank line, line without '=', or nothing to match against.
            continue

        text, _, raw_opts = rest.partition('|')
        # Strip each option so "a, cs" still yields 'cs'; drop empty items.
        opts = [opt.strip() for opt in raw_opts.split(',') if opt.strip()]

        PronuncWords.append({'word': word, 'text': text.strip(), 'opts': opts})

    return PronuncWords


def FindSpeakerByName(name, speakerType):
    """Look up a speaker by full name or by first name.

    Args:
        name: Full speaker name, or only its first space-separated word.
        speakerType: ``"Studio"`` searches STUDIO_SPEAKERS; any other value
            searches cloned_speakers.

    Returns:
        ``(speaker_name, embeddings)`` for the match, or ``(None, None)``
        when nothing matches, so callers can always unpack the result.
    """
    srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers

    # An exact match always wins, even if an earlier entry shares the first name.
    if name in srcItems:
        return name, srcItems[name]

    for key, value in srcItems.items():
        if key.split(" ")[0] == name:
            return key, value

    return None, None


def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
    """Create a cloned speaker from a reference audio and persist it.

    The embeddings returned by ``xtts.predict_speaker`` are written to
    ``<OUTPUT>/cloned_speakers/<name>.json`` and stored in ``cloned_speakers``.

    Args:
        upload_file: Path of the uploaded reference audio.
        clone_speaker_name: Name for the new speaker; also used as file name.
        cloned_speaker_names: List of known cloned speaker names; the new
            name is appended in place when not already present.

    Returns:
        ``(upload_file, clone_speaker_name, cloned_speaker_names, dropdown)``
        where ``dropdown`` is a ``gr.Dropdown`` listing the updated names.

    Raises:
        gr.Error: If no audio was uploaded or the name is empty or contains
            a path separator.
    """
    if not upload_file:
        raise gr.Error("Please upload a reference audio first.")

    # The name comes straight from the UI and becomes part of a file path,
    # so anything that could leave the cloned_speakers folder is rejected.
    if not clone_speaker_name or any(sep in clone_speaker_name for sep in ("/", "\\")):
        raise gr.Error("Invalid speaker name: it must be non-empty and must not contain '/' or '\\'.")

    embeddings = xtts.predict_speaker(upload_file)

    speaker_file = os.path.join(OUTPUT, "cloned_speakers", clone_speaker_name + ".json")
    with open(speaker_file, "w") as fp:
        json.dump(embeddings, fp)

    cloned_speakers[clone_speaker_name] = embeddings
    # Re-cloning an existing name overwrites it; do not list it twice.
    if clone_speaker_name not in cloned_speaker_names:
        cloned_speaker_names.append(clone_speaker_name)

    return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)


def _apply_pronunciations(text, pronuncs):
    """Return *text* with every mapped word replaced by its spoken form."""
    for entry in pronuncs:
        # Matching is case-insensitive unless the rule carries the 'cs' option.
        flags = 0 if 'cs' in entry['opts'] else re.IGNORECASE
        replacement = entry['text']
        text = re.sub(
            # Escape the word so characters such as '.' or '+' match literally.
            rf"\b{re.escape(entry['word'])}\b",
            # A callable keeps backslashes in the replacement from being
            # interpreted as group references.
            lambda _match, replacement=replacement: replacement,
            text,
            flags=flags,
        )
    return text


def tts(text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature,
        speed, top_p, top_k, AllFileList, progress=gr.Progress()):
    """Generate one audio file per ``---``-separated part of *text*.

    Each part may carry ``!key = value`` directives (see ExtractVars):
    ``prefix`` changes the file-name prefix from that part onwards, ``name``
    adds a suffix, ``num`` overrides the number used in the file name and
    ``speaker`` overrides the speaker for that part only. Parts that contain
    no text after removing directives produce no audio.

    Args:
        text: Full input text.
        pronunc: Pronunciation-fix rules, parsed with ParsePronucs.
        speaker_type: ``"Studio"`` or anything else for cloned speakers.
        speaker_name_studio: Default speaker when speaker_type is "Studio".
        speaker_name_custom: Default speaker otherwise.
        lang, temperature, speed, top_p, top_k: Passed to ``xtts.TTSInputs``.
        AllFileList: List replaced in place with the generated file paths.
        progress: Gradio progress tracker used to iterate over the parts.

    Returns:
        A ``gr.Dropdown`` whose choices are the generated file paths, with
        the first one selected (or nothing selected when none was generated).

    Raises:
        ValueError: If the speaker requested for a part cannot be found.
    """
    parts = text.split("---")
    print("Total parts:", len(parts))

    # Random prefix so files from different runs do not overwrite each other.
    CurrentPrefix = uuid.uuid4().hex[:8]
    Pronuncs = ParsePronucs(pronunc)
    default_speaker = speaker_name_studio if speaker_type == 'Studio' else speaker_name_custom

    AudioList = []
    for audioNum, part in enumerate(progress.tqdm(parts, desc="Gerando fala..."), start=1):
        textVars, cleanLine = ExtractVars(part)

        # A prefix directive sticks for all following parts.
        if textVars['prefix']:
            CurrentPrefix = textVars['prefix']

        if not cleanLine:
            # Nothing to speak (empty or directive-only part).
            continue

        audioName = textVars['name']
        if audioName:
            audioName = '_' + audioName

        num = textVars['num'] or audioNum

        # basename() drops any directory component supplied through the
        # directives, so the file always lands inside AUDIOS_DIR.
        path = os.path.basename(CurrentPrefix + "_n_" + str(num) + audioName + ".wav")

        print("Generating audio for line", num, 'sequence', audioNum)

        speaker = textVars['speaker'] or default_speaker
        found = FindSpeakerByName(speaker, speaker_type)
        if not found or not found[0]:
            raise ValueError("InvalidSpeaker: " + str(speaker))
        speakerName, embeddings = found

        FixedText = _apply_pronunciations(cleanLine, Pronuncs)

        ipts = xtts.TTSInputs(
            speaker_embedding=embeddings["speaker_embedding"],
            gpt_cond_latent=embeddings["gpt_cond_latent"],
            text=FixedText,
            language=lang,
            temperature=temperature,
            speed=speed,
            top_k=top_k,
            top_p=top_p
        )

        generated_audio = xtts.predict_speech(ipts)

        print("Audio generated.. Saving to", path)
        generated_audio_path = os.path.join(AUDIOS_DIR, path)
        with open(generated_audio_path, "wb") as fp:
            # The audio is written after base64-decoding the server response.
            fp.write(base64.b64decode(generated_audio))
        AudioList.append(generated_audio_path)

    AllFileList.clear()
    AllFileList.extend(AudioList)

    return gr.Dropdown(
        label="Generated Audios",
        choices=list(AudioList),
        value=AudioList[0] if AudioList else None
    )


def get_file_content(f):
    """Return the first item of *f*, or ``None`` when *f* is empty or ``None``."""
    if not f:
        return None
    return f[0]


def UpdateFileList(DirListState):
    """Replace the contents of *DirListState*, in place, with the entry names in AUDIOS_DIR."""
    # Empty the list first, then refill it; the same list object is kept.
    del DirListState[:]
    current_entries = os.listdir(AUDIOS_DIR)
    DirListState.extend(current_entries)


def audio_list_update(d):
    """Return the absolute path of the selected audio, or ``None`` without a selection.

    Args:
        d: Value of the "Generated Audios" dropdown (a file path), or
            ``None``/empty when nothing is selected.
    """
    if not d:
        # abspath(None) would raise; returning None passes no file on.
        return None
    return abspath(d)


def ZipAndDownload(files):
    """Zip the given files into ZIP_DIR and return an HTML link to the archive.

    Files are stored under their base name. Paths that no longer exist and
    repeated base names are skipped, so one stale entry cannot abort the
    whole archive or create duplicate members.

    Args:
        files: Iterable of file paths (``None`` is treated as empty).

    Returns:
        An ``<a>`` element, as a string, whose href is ``/file=`` followed by
        the absolute path of the zip file.
    """
    # Random archive name so successive downloads do not overwrite each other.
    zipFile = abspath(os.path.join(ZIP_DIR, uuid.uuid4().hex[:8] + ".zip"))

    archived_names = set()
    with zipfile.ZipFile(zipFile, 'w') as zipMe:
        for file in files or []:
            arcname = os.path.basename(file)
            if arcname in archived_names or not os.path.isfile(file):
                continue
            archived_names.add(arcname)
            print("Zipping", file)
            zipMe.write(abspath(file), arcname, compress_type=zipfile.ZIP_DEFLATED)

    print("Pronto", zipFile)

    return '<a href="/file='+zipFile+'">If donwload dont starts, click here</a>'


# JavaScript handed to gr.Blocks(js=...). It watches the element with id
# "DonwloadLink" and navigates to the href of any <a> node added to it,
# which is how the link returned by ZipAndDownload starts the download.
js = """
function DetectDownloadLink(){
    console.log('Configuring AutoDonwloadObservr...');
    let hiddenLink = document.getElementById("DonwloadLink");
    if (!hiddenLink) {
        // Nothing to observe; MutationObserver.observe(null) would throw.
        return;
    }

    let onChange = function(mutations){
        for (const mutation of mutations) {
            if (mutation.type !== 'childList')
                continue;

            for (const addedNode of mutation.addedNodes) {
                if (addedNode.nodeName === 'A') {
                    location.href = addedNode.href;
                }
            }
        }
    }

    let config = { attributes: true, childList: true, subtree: true, attributeFilter: ["href"] }
    let obs = new MutationObserver(onChange);
    obs.observe(hiddenLink, config);
}
"""


# Builds the whole Gradio interface: a TTS tab, a speaker-cloning tab and a
# help tab, then wires the buttons to clone_speaker and tts.
# NOTE(review): the nesting of rows/columns below was reconstructed from the
# order of the widgets — confirm it against the intended layout.
with gr.Blocks(js=js) as demo:
    defaultSpeaker = "Dionisio Schuyler"
    defaultLanguage = "pt"
    cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
    # Paths of the audios produced by the last tts run; input of "Download All Files".
    AllFileList = gr.State([])

    gr.Markdown("By using any functionality of this space, you agree to the terms of this license: https://coqui.ai/cpml")

    with gr.Tab("TTS"):
        with gr.Column() as row4:
            with gr.Row() as col4:
                speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio")
                speaker_name_studio = gr.Dropdown(
                    label="Studio speaker",
                    choices=list(STUDIO_SPEAKERS),
                    value=defaultSpeaker if defaultSpeaker in STUDIO_SPEAKERS else None,
                )
                speaker_name_custom = gr.Dropdown(
                    label="Cloned speaker",
                    choices=cloned_speaker_names.value,
                    value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None,
                )
            with gr.Accordion("Advanced options", open=False):
                with gr.Row() as rowAdvanced:
                    temperature = gr.Slider(0.00, 1.00, 0.5, step=0.05, label="Temperature", info="Choose between 0 and 1")
                    top_p = gr.Slider(0.00, 1.00, 0.8, step=0.05, label="TOP P", info="Choose between 0 and 1")
                    # precision=0 keeps top_k an integer even if a fraction is typed.
                    top_k = gr.Number(label="TOP K", value=50, precision=0)
                    speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
            with gr.Column() as col2:
                with gr.Row():
                    text = gr.Textbox(label="Text", info="Generate multiple audios separating lines with ---", lines=4, value="Customizado por IA Talking, o maior blog de Inteligência Artificial do Brasil!")
                    pronunc = gr.Textbox(label="Pronunciation Fix", info="Fix words pronuncation using WORD = SPEAK", lines=4)
                with gr.Row():
                    # Only preselect the default language when the server offers it,
                    # mirroring how the default studio speaker is handled.
                    lang = gr.Dropdown(
                        label="Language",
                        choices=LANUGAGES,
                        value=defaultLanguage if defaultLanguage in LANUGAGES else None,
                    )
                    tts_button = gr.Button(value="TTS")
            with gr.Column() as col3:
                AudioList = gr.Dropdown(
                    label="Generated Audios",
                    choices=[],
                    interactive=True
                )

                generated_audio = gr.Audio(label="Audio Play", autoplay=True)
                AudioList.change(fn=audio_list_update, inputs=[AudioList], outputs=[generated_audio])

                # Created unrendered so the click handler can reference it,
                # then rendered below the download button.
                dummyHtml = gr.HTML(elem_id="DonwloadLink", render=False)
                downloadAll = gr.DownloadButton("Download All Files")
                downloadAll.click(ZipAndDownload, inputs=[AllFileList], outputs=[dummyHtml])
                dummyHtml.render()

    with gr.Tab("Clone a new speaker"):
        with gr.Column() as col1:
            upload_file = gr.Audio(label="Upload reference audio", type="filepath")
            clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
            clone_button = gr.Button(value="Clone speaker")

    with gr.Tab("Help"):
        # The text is kept flush-left so its markdown does not depend on any
        # dedenting of the string.
        gr.Markdown("""
Welcome to the XTTS WebUI version customized by the IA Talking blog (https://iatalk.ing).

The main goal of this space is to share more scenarios on how XTTS can be used, as well as serve as a study resource to learn more about the TTS process and AI.

In this version, we have some customizations that are quite useful.

# Multiple audios
You can generate multiple audios at once by separating the text with three dashes. For example:

```
Text 1
---
Text 2, line 1
Text 2, line 2
```

In the above example, 2 audio files will be generated! This is very useful when you want to generate a lot of audio but don't want to generate it all at once due to the context lost in XTTS.
You can also specify variables that modify certain aspects.

For example, `!speaker = Dionisio` forces the speaker to be Dionisio only for that specific audio.

List of variables:
- `speaker` = name of the speaker
- `num` = file number (by default, it's the sequential number)
- `prefix` = file name prefix

# Pronunciation adjustment

If you have a text that you cannot or do not want to change the content of, you can use the Pronunciation field to map words with different pronunciations.

Simply separate them by each line. Example:

```
API = A,P,I
SomeFunctionCode = Function Code
```

This is useful for mapping foreign words, abbreviations, acronyms, code, etc.
""")

    clone_button.click(
        fn=clone_speaker,
        inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
        outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom],
    )

    tts_button.click(
        fn=tts,
        inputs=[
            text, pronunc, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature,
            speed, top_p, top_k, AllFileList,
        ],
        outputs=[AudioList],
    )


# Optional start-up check: send one short request to the server using a
# randomly chosen studio speaker. Skipped unless DO_CHECK is "1".
if __name__ == "__main__" and DO_CHECK == "1":
    print("Warming up server... Checking server healthy...")

    if not STUDIO_SPEAKERS:
        # random.choice() raises IndexError on an empty sequence.
        print("No studio speakers available; skipping warm-up request.")
    else:
        speakerName, embs = random.choice(list(STUDIO_SPEAKERS.items()))

        print("Testing with", speakerName)

        ipts = xtts.TTSInputs(
            speaker_embedding=embs["speaker_embedding"],
            gpt_cond_latent=embs["gpt_cond_latent"],
            text="This is a warmup request.",
            language="en",
            temperature=0.5,
            speed=1.0,
            top_k=50,
            top_p=0.8
        )

        # Only success matters here; the returned audio is discarded.
        xtts.predict_speech(ipts)

        print(" TEST OK")


# Script entry point: start the Gradio server for the interface built above.
if __name__ == "__main__":
    print("STARTING...")
    launch_options = {
        "share": False,
        "debug": False,
        # Listens on every interface, port 7860.
        "server_port": 7860,
        "server_name": "0.0.0.0",
        # ZIP_DIR is the folder the generated zip links point into.
        "allowed_paths": [ZIP_DIR],
        "ssr_mode": False,
    }
    demo.launch(**launch_options)