|
|
import os |
|
|
import subprocess |
|
|
import sys |
|
|
|
|
|
|
|
|
def setup_environment():
    """Fetch Tortoise TTS (if needed) and install it plus Gradio.

    Clones the ``tortoise-tts`` repository into the current directory when no
    such path exists yet, changes the process working directory into it, and
    then runs the install commands with the current interpreter.

    Side effects: the working directory of the process is ``tortoise-tts``
    after this returns.

    Raises:
        subprocess.CalledProcessError: if any command exits with a non-zero
            status (every command is run with ``check=True``).
    """
    checkout_dir = "tortoise-tts"

    if not os.path.exists(checkout_dir):
        subprocess.run(
            ["git", "clone", "https://github.com/neonbjb/tortoise-tts.git"],
            check=True,
        )

    # The remaining commands use paths relative to the repository root.
    os.chdir(checkout_dir)

    interpreter = sys.executable
    install_commands = (
        [interpreter, "-m", "pip", "install", "-r", "requirements.txt"],
        [interpreter, "setup.py", "install"],
        [interpreter, "-m", "pip", "install", "gradio"],
    )
    for command in install_commands:
        subprocess.run(command, check=True)
|
|
|
|
|
def _compose_prompt(text, emotion, prompt):
    """Return *text* prefixed with a bracketed emotion or custom prompt cue.

    A preset emotion takes priority; the free-form *prompt* is used only when
    the emotion is "None/Custom" and the prompt is non-blank. Otherwise *text*
    is returned unchanged.
    """
    if emotion != "None/Custom":
        return f"[I am really {emotion.lower()},] {text}"
    if prompt and prompt.strip():
        return f"[{prompt},] {text}"
    return text


def _named_voices(voice, voice_b, voice_c):
    """Return the voice names to load by name.

    The primary voice is skipped when it is "custom_voice" (that one comes from
    a recording, not from a name); secondary voices are skipped when
    "disabled".
    """
    voices = [] if voice == "custom_voice" else [voice]
    voices.extend(v for v in (voice_b, voice_c) if v != "disabled")
    return voices


def main():
    """Install dependencies, build the Gradio voice-cloning UI and launch it.

    Raises:
        ImportError: if the Tortoise TTS package cannot be imported after
            ``setup_environment()`` has run.
    """
    setup_environment()

    # The packages below may have been installed moments ago by
    # setup_environment(), inside this very process. Drop stale import caches,
    # and let the checkout in the current working directory act as a fallback
    # location for the ``tortoise`` package (appended, so an installed copy
    # still takes precedence).
    import importlib

    importlib.invalidate_caches()
    repo_root = os.getcwd()
    if repo_root not in sys.path:
        sys.path.append(repo_root)

    import gradio as gr
    import torchaudio

    try:
        from tortoise.api import TextToSpeech
        # load_voice / load_voices are module-level helpers, not methods of
        # TextToSpeech.
        from tortoise.utils.audio import load_voice, load_voices
    except ImportError as e:
        raise ImportError("Tortoise TTS not found. Make sure it is correctly installed.") from e

    tts = TextToSpeech()

    VOICE_OPTIONS = [
        "random",
        "custom_voice",
        "disabled",
    ]

    # List the bundled voices once and share the result between all dropdowns.
    voice_choices = sorted(os.listdir(os.path.join("tortoise", "voices"))) + VOICE_OPTIONS

    # NOTE: the parameter order must match the order of ``inputs`` passed to
    # gr.Interface below, because Gradio hands the values over positionally.
    def inference(text, emotion, prompt, preset, voice, mic_audio, voice_b, voice_c, seed):
        """Synthesize three candidate clips for *text*.

        Returns a 4-tuple for the four audio outputs: a sample of the first
        reference clip (or ``None`` when no reference clip is available),
        followed by the three generated candidates, each as a
        ``(sample_rate, samples)`` pair.

        Raises:
            gr.Error: if "custom_voice" is selected without a recording.
        """
        voices = _named_voices(voice, voice_b, voice_c)
        text = _compose_prompt(text, emotion, prompt)

        custom_clip = None
        if voice == "custom_voice":
            if mic_audio is None:
                raise gr.Error("Please provide audio from mic when choosing custom voice")
            # NOTE(review): the recording is used at its native sample rate;
            # the sample output below labels clips as 22050 Hz - confirm
            # whether resampling is needed.
            custom_clip = torchaudio.load(mic_audio)[0]

        if len(voices) <= 1:
            if voice == "custom_voice":
                voice_samples, conditioning_latents = [custom_clip], None
            else:
                voice_samples, conditioning_latents = load_voice(voice)
        else:
            voice_samples, conditioning_latents = load_voices(voices)
            if voice == "custom_voice":
                # The loader may return None instead of a list of clips, so
                # build a fresh list rather than appending in place.
                voice_samples = list(voice_samples or []) + [custom_clip]

        # voice_samples may be None or empty (e.g. for the "random" voice).
        sample_voice = voice_samples[0] if voice_samples else None

        gen, _ = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
            use_deterministic_seed=seed,
            return_deterministic_state=True,
            k=3,
        )

        sample_audio = None
        if sample_voice is not None:
            sample_audio = (22050, sample_voice.squeeze().cpu().numpy())

        return (
            sample_audio,
            (24000, gen[0].squeeze().cpu().numpy()),
            (24000, gen[1].squeeze().cpu().numpy()),
            (24000, gen[2].squeeze().cpu().numpy()),
        )

    interface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Textbox(lines=4, label="Text:"),
            gr.Radio(
                ["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
                value="None/Custom",
                label="Select emotion:",
            ),
            gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:"),
            gr.Radio(
                ["ultra_fast", "fast", "standard", "high_quality"],
                value="fast",
                label="Preset mode:",
            ),
            gr.Dropdown(
                choices=voice_choices,
                value="angie",
                label="Select voice:",
            ),
            gr.Audio(label="Record voice (when selected custom_voice):", type="filepath"),
            gr.Dropdown(
                choices=voice_choices,
                value="disabled",
                label="(Optional) Select second voice:",
            ),
            gr.Dropdown(
                choices=voice_choices,
                value="disabled",
                label="(Optional) Select third voice:",
            ),
            gr.Number(value=0, precision=0, label="Seed (for reproducibility):"),
        ],
        outputs=[
            gr.Audio(label="Sample of selected voice (first):"),
            gr.Audio(label="Output [Candidate 1]:"),
            gr.Audio(label="Output [Candidate 2]:"),
            gr.Audio(label="Output [Candidate 3]:"),
        ],
        title="RJ VOICE CLONING",
        description="<h1 style='text-align: center; color: orange; font-weight: bold;'>RJ VOICE CLONING</h1>",
        css=".gradio-container { background-color: black; color: orange; }",
    )

    interface.launch(share=True)
|
|
|
|
|
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|