# vc / app.py
# midhyaraj's picture
# Update app.py
# 8eab11b verified
import os
import subprocess
import sys
# Function to setup the environment
def setup_environment():
    """Fetch Tortoise-TTS, install it and Gradio, and enter the checkout.

    Side effects, in order:
        * clones the Tortoise-TTS repository into ``./tortoise-tts`` unless
          that path already exists;
        * changes the process working directory to ``tortoise-tts`` and
          leaves it there;
        * runs three install commands with the current interpreter
          (``sys.executable``): the repository requirements, the package's
          ``setup.py install``, and Gradio.

    Raises:
        subprocess.CalledProcessError: if ``git`` or any install command
            exits with a non-zero status (every call uses ``check=True``).
    """
    repo_dir = "tortoise-tts"

    # Only clone on the first run; an existing path is reused untouched.
    if not os.path.exists(repo_dir):
        subprocess.run(
            ["git", "clone", "https://github.com/neonbjb/tortoise-tts.git"],
            check=True,
        )

    # The install commands below use paths relative to the checkout.
    os.chdir(repo_dir)

    python = sys.executable
    # NOTE(review): invoking `setup.py install` directly is deprecated by
    # setuptools; `pip install .` is the usual replacement -- confirm before
    # switching, it is kept here to preserve behaviour.
    install_steps = (
        [python, "-m", "pip", "install", "-r", "requirements.txt"],
        [python, "setup.py", "install"],
        [python, "-m", "pip", "install", "gradio"],
    )
    for command in install_steps:
        subprocess.run(command, check=True)
def main():
    """Install dependencies, build the Gradio voice-cloning UI and launch it.

    ``setup_environment()`` runs first; the third-party packages are imported
    only afterwards because that call is what installs them. The list of
    built-in voices is read from the relative path ``tortoise/voices``, so it
    is resolved against the current working directory at that point.

    Raises:
        ImportError: if the ``tortoise`` package cannot be imported after
            setup.
    """
    # Call the setup function to ensure everything is installed
    setup_environment()

    # Import Gradio and other required libraries after setting up the environment
    import gradio as gr
    import torchaudio

    # Ensure the tortoise package is correctly imported
    try:
        from tortoise.api import TextToSpeech
        # The voice loader is a module-level helper, not a TextToSpeech method.
        from tortoise.utils.audio import load_voices
    except ImportError as e:
        raise ImportError("Tortoise TTS not found. Make sure it is correctly installed.") from e

    # Initialize the TextToSpeech instance
    tts = TextToSpeech()

    VOICE_OPTIONS = [
        "random",  # special option for random voice
        "custom_voice",  # special option for custom voice
        "disabled",  # special option for disabled voice
    ]

    # Rate the reference sample is reported at, and the rate the custom clip
    # is resampled to. NOTE(review): assumes Tortoise wants 22.05 kHz
    # reference clips, as its README example does -- confirm.
    REFERENCE_RATE = 22050
    # Rate the generated candidates are reported at.
    OUTPUT_RATE = 24000

    def _to_gradio_audio(rate, waveform):
        """Convert a tensor into the ``(rate, ndarray)`` pair gr.Audio accepts."""
        return rate, waveform.squeeze().cpu().numpy()

    def inference(text, emotion, prompt, preset, voice, mic_audio, voice_b, voice_c, seed):
        """Generate three speech candidates for ``text``.

        The parameter order mirrors the ``inputs`` list of the interface
        below, because Gradio passes component values positionally.

        Args:
            text: Text to speak.
            emotion: One of the emotion radio values; "None/Custom" disables
                the automatic emotion prefix.
            prompt: Free-form prefix used only when emotion is "None/Custom".
            preset: Quality preset forwarded to ``tts_with_preset``.
            voice: Primary voice name, or "custom_voice" to use ``mic_audio``.
            mic_audio: File path of the recorded/uploaded clip, or None.
            voice_b: Optional second voice; "disabled" skips it.
            voice_c: Optional third voice; "disabled" skips it.
            seed: Value forwarded as ``use_deterministic_seed``.

        Returns:
            A 4-tuple: the first reference sample (or None when there is
            none) followed by the three generated candidates.

        Raises:
            gr.Error: if "custom_voice" is selected without any audio.
        """
        use_custom = voice == "custom_voice"

        # Named voices to load from disk; the custom clip is added separately.
        voices = [] if use_custom else [voice]
        voices.extend(extra for extra in (voice_b, voice_c) if extra != "disabled")

        if emotion != "None/Custom":
            text = f"[I am really {emotion.lower()},] {text}"
        elif prompt.strip():
            text = f"[{prompt},] {text}"

        custom_clip = None
        if use_custom:
            if mic_audio is None:
                raise gr.Error("Please provide audio from mic when choosing custom voice")
            custom_clip, clip_rate = torchaudio.load(mic_audio)
            # Downmix so squeeze() later yields a 1-D signal.
            # NOTE(review): assumes Tortoise expects mono clips -- confirm.
            if custom_clip.shape[0] > 1:
                custom_clip = custom_clip.mean(dim=0, keepdim=True)
            # The recording's own rate was previously discarded while the
            # clip was treated as REFERENCE_RATE audio.
            if clip_rate != REFERENCE_RATE:
                custom_clip = torchaudio.functional.resample(
                    custom_clip, clip_rate, REFERENCE_RATE
                )

        if voices:
            voice_samples, conditioning_latents = load_voices(voices)
        else:
            voice_samples, conditioning_latents = None, None

        if custom_clip is not None:
            # Combine the custom clip with whatever named voices were loaded
            # (previously a single extra voice was silently dropped). The
            # loader's clip list may be None, so start a fresh list then.
            voice_samples = [*(voice_samples or []), custom_clip]

        sample_voice = voice_samples[0] if voice_samples else None

        gen, _ = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
            use_deterministic_seed=seed,
            return_deterministic_state=True,
            k=3,
        )

        # There may be no reference clip at all; return None for that output
        # instead of calling .squeeze() on None.
        sample_audio = None
        if sample_voice is not None:
            sample_audio = _to_gradio_audio(REFERENCE_RATE, sample_voice)

        return (
            sample_audio,
            _to_gradio_audio(OUTPUT_RATE, gen[0]),
            _to_gradio_audio(OUTPUT_RATE, gen[1]),
            _to_gradio_audio(OUTPUT_RATE, gen[2]),
        )

    # Read the voice directory once and share the list between all dropdowns.
    voice_choices = os.listdir(os.path.join("tortoise", "voices")) + VOICE_OPTIONS

    # Create the Gradio interface
    interface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Textbox(lines=4, label="Text:"),
            gr.Radio(
                ["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
                value="None/Custom",
                label="Select emotion:",
            ),
            gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:"),
            gr.Radio(
                ["ultra_fast", "fast", "standard", "high_quality"],
                value="fast",
                label="Preset mode:",
            ),
            gr.Dropdown(
                choices=voice_choices,
                value="angie",  # Default voice
                label="Select voice:",
            ),
            gr.Audio(label="Record voice (when selected custom_voice):", type="filepath"),
            gr.Dropdown(
                choices=voice_choices,
                value="disabled",
                label="(Optional) Select second voice:",
            ),
            gr.Dropdown(
                choices=voice_choices,
                value="disabled",
                label="(Optional) Select third voice:",
            ),
            gr.Number(value=0, precision=0, label="Seed (for reproducibility):"),
        ],
        outputs=[
            gr.Audio(label="Sample of selected voice (first):"),
            gr.Audio(label="Output [Candidate 1]:"),
            gr.Audio(label="Output [Candidate 2]:"),
            gr.Audio(label="Output [Candidate 3]:"),
        ],
        title="RJ VOICE CLONING",
        description="<h1 style='text-align: center; color: orange; font-weight: bold;'>RJ VOICE CLONING</h1>",
        css=".gradio-container { background-color: black; color: orange; }",
    )

    # Launch the interface
    interface.launch(share=True)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()