Spaces:

midhyaraj
/

vc

Runtime error

File size: 5,321 Bytes

321e586
ba4c6a9
e16556e
321e586
e16556e
ba4c6a9
e16556e
 
 
321e586
ba4c6a9
 
321e586
ba4c6a9
e16556e
ba4c6a9
 
8eab11b
ba4c6a9
 
e16556e
321e586
c3d1a4f
 
 
0343a55
c3d1a4f
 
 
 
 
0343a55
c3d1a4f
 
 
 
 
0343a55
c3d1a4f
 
0343a55
c3d1a4f
 
 
 
 
0343a55
c3d1a4f
8eab11b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321e586
 
 
 
ba4c6a9
8eab11b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba4c6a9
 
8eab11b
 
 
 
ba4c6a9
321e586
ba4c6a9
321e586

import os
import subprocess
import sys

# Function to setup the environment
def setup_environment():
    """Clone Tortoise-TTS if needed and install it, its requirements and Gradio.

    Side effects:
        * Clones the repository into ``./tortoise-tts`` when that path does not exist.
        * Changes the process working directory to ``tortoise-tts`` (callers rely on
          this to resolve paths relative to the repository).
        * Installs packages into the environment of the running interpreter.

    Raises:
        subprocess.CalledProcessError: If ``git`` or any ``pip`` command exits non-zero.
    """
    # Local import: keeps this function self-contained without touching file-level imports.
    import importlib

    repo_dir = "tortoise-tts"

    # Clone the Tortoise-TTS repository if it doesn't exist
    if not os.path.exists(repo_dir):
        subprocess.run(["git", "clone", "https://github.com/neonbjb/tortoise-tts.git"], check=True)

    # Change directory to the cloned repository
    os.chdir(repo_dir)

    # Always invoke pip through the running interpreter so packages land in its environment.
    pip_install = [sys.executable, "-m", "pip", "install"]

    # Install requirements from requirements.txt
    subprocess.run(pip_install + ["-r", "requirements.txt"], check=True)

    # Install the package itself. `python setup.py install` is deprecated by
    # setuptools; `pip install .` is the supported way to install from a source tree.
    subprocess.run(pip_install + ["."], check=True)

    # Install Gradio
    subprocess.run(pip_install + ["gradio"], check=True)

    # Packages were installed while this process was already running; reset the
    # import finders' caches so subsequent imports can see them.
    importlib.invalidate_caches()

def main():
    """Prepare the environment, then build and launch the Gradio voice-cloning UI.

    Calls ``setup_environment()``, imports the packages it installs, creates a
    Tortoise ``TextToSpeech`` instance and serves a ``gr.Interface`` whose callback
    renders the entered text as three candidate audio clips.

    Raises:
        ImportError: If the ``tortoise`` package cannot be imported after setup.
    """
    # Call the setup function to ensure everything is installed
    setup_environment()

    # Imported here rather than at module top because these packages may only
    # exist after setup_environment() has run.
    import gradio as gr

    try:
        from tortoise.api import TextToSpeech
        # The voice/audio loaders are functions of tortoise.utils.audio,
        # not methods of TextToSpeech.
        from tortoise.utils.audio import load_audio, load_voice, load_voices
    except ImportError as e:
        raise ImportError("Tortoise TTS not found. Make sure it is correctly installed.") from e

    # Initialize the TextToSpeech instance
    tts = TextToSpeech()

    NUM_CANDIDATES = 3          # number of renderings requested per call (k)
    VOICE_SAMPLE_RATE = 22050   # rate reported for the reference voice clip
    OUTPUT_SAMPLE_RATE = 24000  # rate reported for the generated clips

    VOICE_OPTIONS = [
        "random",  # special option for random voice
        "custom_voice",  # special option for custom voice
        "disabled",  # special option for disabled voice
    ]

    # Relative path: resolved against the working directory, which
    # setup_environment() leaves inside the cloned repository.
    voices_dir = os.path.join("tortoise", "voices")
    voice_names = sorted(
        name for name in os.listdir(voices_dir)
        if os.path.isdir(os.path.join(voices_dir, name))
    )
    voice_choices = voice_names + VOICE_OPTIONS
    default_voice = "angie" if "angie" in voice_names else "random"

    def inference(text, emotion, prompt, preset, voice, mic_audio, voice_b, voice_c, seed):
        """Gradio callback: synthesize ``text`` and return four audio outputs.

        The parameter order must mirror the ``inputs`` list of the interface
        below, because Gradio passes component values positionally.

        Returns:
            A 4-tuple: the first reference voice clip (or ``None`` when no clip
            was loaded) followed by ``NUM_CANDIDATES`` generated clips, each as
            ``(sample_rate, numpy_array)``.

        Raises:
            gr.Error: If no voice is selected, or a custom voice is selected
                without a recording.
        """
        selected = (voice, voice_b, voice_c)
        # "custom_voice" and "disabled" are UI markers, not loadable voice names.
        named_voices = [v for v in selected if v not in ("custom_voice", "disabled")]
        use_custom = "custom_voice" in selected

        if use_custom and mic_audio is None:
            raise gr.Error("Please provide audio from mic when choosing custom voice")
        if not named_voices and not use_custom:
            raise gr.Error("Please select at least one voice")

        if emotion != "None/Custom":
            text = f"[I am really {emotion.lower()},] {text}"
        elif prompt and prompt.strip():
            text = f"[{prompt},] {text}"

        if len(named_voices) == 1:
            voice_samples, conditioning_latents = load_voice(named_voices[0])
        elif named_voices:
            voice_samples, conditioning_latents = load_voices(named_voices)
        else:
            voice_samples, conditioning_latents = None, None

        if use_custom:
            # Resample the recording so it matches the rate reported for the sample output.
            custom_clip = load_audio(mic_audio, VOICE_SAMPLE_RATE)
            # The loaders may return None instead of a list of clips (e.g. for "random").
            voice_samples = list(voice_samples or []) + [custom_clip]

        gen, _ = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
            use_deterministic_seed=seed,
            return_deterministic_state=True,
            k=NUM_CANDIDATES,
        )

        # There is no reference clip to play back when no samples were loaded.
        sample_voice = voice_samples[0] if voice_samples else None
        sample_output = (
            None if sample_voice is None
            else (VOICE_SAMPLE_RATE, sample_voice.squeeze().cpu().numpy())
        )

        return (
            sample_output,
            *[
                (OUTPUT_SAMPLE_RATE, clip.squeeze().cpu().numpy())
                for clip in gen[:NUM_CANDIDATES]
            ],
        )

    # Create the Gradio interface
    interface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Textbox(lines=4, label="Text:"),
            gr.Radio(["None/Custom", "Happy", "Sad", "Angry", "Disgusted", "Arrogant"],
                     value="None/Custom", label="Select emotion:"),
            gr.Textbox(lines=1, label="Enter prompt if [Custom] emotion:"),
            gr.Radio(["ultra_fast", "fast", "standard", "high_quality"],
                     value="fast", label="Preset mode:"),
            gr.Dropdown(
                choices=voice_choices,
                value=default_voice,  # Default voice
                label="Select voice:"
            ),
            gr.Audio(label="Record voice (when selected custom_voice):", type="filepath"),
            gr.Dropdown(
                choices=voice_choices,
                value="disabled",
                label="(Optional) Select second voice:"
            ),
            gr.Dropdown(
                choices=voice_choices,
                value="disabled",
                label="(Optional) Select third voice:"
            ),
            gr.Number(value=0, precision=0, label="Seed (for reproducibility):"),
        ],
        outputs=[
            gr.Audio(label="Sample of selected voice (first):"),
            *[
                gr.Audio(label=f"Output [Candidate {index}]:")
                for index in range(1, NUM_CANDIDATES + 1)
            ],
        ],
        title="RJ VOICE CLONING",
        description="<h1 style='text-align: center; color: orange; font-weight: bold;'>RJ VOICE CLONING</h1>",
        css=".gradio-container { background-color: black; color: orange; }"
    )

    # Launch the interface
    interface.launch(share=True)

# Run the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()