Voxtral-TTS

Running

File size: 10,635 Bytes

import gradio as gr
import os
from tts import synthesize_and_save_audio
import time

def generate_tts(input_text, reference_audio_path, output_path="cloned.wav", max_attempts=3):
    """Synthesize ``input_text`` to a wav file, retrying on transient failures.

    Args:
        input_text: Text to synthesize.
        reference_audio_path: Value forwarded as ``voice_id`` to
            ``synthesize_and_save_audio``. Callers in this file pass either a
            predefined voice id or the path of a reference audio file.
        output_path: Where the synthesized audio is written.
        max_attempts: How many times the synthesis call is tried before
            giving up (the loop previously hard-coded 3).

    Returns:
        ``output_path`` when ``synthesize_and_save_audio`` returns ``0``,
        otherwise ``None``.
        NOTE(review): presumably ``0`` is the success code of
        ``tts.synthesize_and_save_audio`` -- confirm against that module.

    Raises:
        gr.Error: If every attempt raised an exception.
    """
    error_message = "An error occurred. Make sure you have selected an available voice, or that your reference audio is not longer than 30s (we recommend between 5s and 30s) and a valid mp3/wav file."
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            result = synthesize_and_save_audio(
                input_text=input_text,
                voice_id=reference_audio_path,
                model="voxtral-mini-tts-2603",
                api_key=os.getenv("MISTRAL_API_KEY"),
                output_path=output_path,
            )
        except Exception as e:
            # Broad catch is deliberate: this is the boundary to the external
            # TTS call and any failure is surfaced to the user as one message.
            print(e)
            last_error = e
            # Linear backoff (1s, 2s, ...) between attempts only; there is no
            # point in sleeping once the final attempt has already failed.
            if attempt < max_attempts:
                time.sleep(attempt)
            continue
        return output_path if result == 0 else None
    # Reached only when every attempt raised; keep the cause for the logs.
    raise gr.Error(error_message) from last_error

def gradio_tts(input_text, audio_choice, uploaded_audio=None, profile: gr.OAuthProfile | None = None):
    """Gradio event handler: synthesize ``input_text`` with the chosen voice.

    Args:
        input_text: Text typed by the user.
        audio_choice: Either a display label found in ``voice_mapping`` or,
            when it is not a known label, a value used as-is (the
            Customization tab passes the reference audio file path here).
        uploaded_audio: Optional reference audio path; takes precedence over
            ``audio_choice`` when provided.
        profile: OAuth profile of the signed-in user, ``None`` when the user
            is not signed in.

    Returns:
        Path of the generated wav file, or ``None`` when ``generate_tts``
        reports a non-success result.

    Raises:
        gr.Error: If the user is not signed in, the text is empty, no voice
            or reference audio is available, or synthesis fails.
    """
    import tempfile  # local import so this block does not depend on file-level imports

    if profile is None:
        raise gr.Error('You must sign in to the Space to use this feature, please click on "Sign in with Hugging Face".')
    # Fail fast on inputs that cannot succeed instead of spending API calls
    # and retries on them.
    if not input_text or not input_text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    if uploaded_audio is not None:
        reference_audio = uploaded_audio
    else:
        reference_audio = voice_mapping.get(audio_choice, audio_choice)
    if not reference_audio:
        raise gr.Error("Please select a voice or provide a reference audio clip.")

    # Each request gets its own output file. A single shared "cloned.wav"
    # can be overwritten by another request running at the same time (the two
    # click listeners have independent concurrency limits).
    # NOTE(review): the file of a successful request is left on disk for
    # Gradio to serve; confirm temp-dir cleanup is acceptable for this Space.
    fd, output_path = tempfile.mkstemp(prefix="voxtral_", suffix=".wav")
    os.close(fd)

    succeeded = False
    try:
        generated_audio = generate_tts(input_text, reference_audio, output_path)
        succeeded = generated_audio is not None
        return generated_audio
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        print(f"Error: {e}")
        raise gr.Error("An error occurred. Make sure you have selected an available voice, or that your reference audio is not longer than 30s (we recommend between 5s and 30s) and a valid mp3/wav file.") from e
    finally:
        # Do not leave empty/partial files behind when nothing is returned.
        if not succeeded and os.path.exists(output_path):
            os.remove(output_path)

# Load the custom stylesheet that is handed to ``demo.launch(css=...)``.
# The encoding is explicit so decoding does not depend on the host locale.
with open("styles.css", "r", encoding="utf-8") as f:
    css = f.read()

# Catalogue of predefined voices, one row per speaker:
# (language tag shown in the UI, speaker name, voice-id prefix, emotions).
# The order of rows and emotions is the order of the dropdown entries.
_PRESET_VOICES = (
    ("EN", "Jane", "gb_jane", (
        "Sarcasm", "Confused", "Shameful", "Sad", "Neutral",
        "Jealousy", "Frustrated", "Curious", "Confident",
    )),
    ("EN", "Paul", "en_paul", (
        "Sad", "Neutral", "Happy", "Frustrated",
        "Excited", "Confident", "Cheerful", "Angry",
    )),
    ("EN", "Oliver", "gb_oliver", (
        "Neutral", "Sad", "Excited", "Curious",
        "Confident", "Cheerful", "Angry",
    )),
    ("FR", "Marie", "fr_marie", (
        "Sad", "Neutral", "Happy", "Excited", "Curious", "Angry",
    )),
)

# Display label ("EN - Jane, Sad") -> voice id ("gb_jane_sad").
voice_mapping = {
    f"{lang} - {speaker}, {emotion}": f"{prefix}_{emotion.lower()}"
    for lang, speaker, prefix, emotions in _PRESET_VOICES
    for emotion in emotions
}

# Display label -> bundled sample recording under ``examples/``.
# Each file name is the lower-cased, underscore-joined label plus "_sample.wav".
# NOTE(review): not referenced anywhere else in the visible part of this file.
fixed_voice_mapping = {
    label: f"examples/{label.lower().replace(' ', '_')}_sample.wav"
    for label in (
        "Cheerful Female",
        "Casual Male",
        "Neutral Female",
        "Neutral Male",
        "Casual Female",
    )
}

# (text, predefined voice label) pairs shown under the "Fixed Voices" tab.
_FIXED_VOICE_EXAMPLES = (
    ("Welcome to our AI demonstration. Let me show you how this works.",
     "EN - Jane, Neutral"),
    ("Regardez comme cette peinture est magnifique! Les couleurs sont si vives et harmonieuses.",
     "FR - Marie, Curious"),
    ("The results of the experiment were even better than we expected. This could change everything!",
     "EN - Oliver, Excited"),
    ("I’m not sure how to solve this problem, but I’ll keep trying until I figure it out.",
     "EN - Jane, Confused"),
    ("The weather today is absolutely perfect for a picnic in the park. Don’t you think?",
     "EN - Oliver, Cheerful"),
    ("I’m confident this project will be a success if we stay focused and work together.",
     "EN - Paul, Confident"),
    ("Bonjour! Je suis ravie de vous rencontrer aujourd’hui. Comment puis-je vous aider?",
     "FR - Marie, Happy"),
    ("I’ve always wondered how birds know exactly when to migrate south for the winter.",
     "EN - Jane, Curious"),
    ("This new software update is going to make our workflow so much faster and easier!",
     "EN - Oliver, Excited"),
    ("I’m really sorry to hear about what happened. Is there anything I can do to help?",
     "EN - Paul, Sad"),
    ("Oh no! I think I left my keys at the office. This is going to be a problem.",
     "EN - Jane, Frustrated"),
    ("Je ne peux pas croire qu’ils aient annulé le concert à la dernière minute! C’est inacceptable!",
     "FR - Marie, Angry"),
    ("La présentation était incroyable! J’ai appris tellement de choses nouvelles aujourd’hui.",
     "FR - Marie, Happy"),
    ("I’m really proud of what we’ve accomplished as a team. This milestone is just the beginning.",
     "EN - Paul, Confident"),
    ("I can’t believe how quickly time flies. It feels like just yesterday we started this project.",
     "EN - Jane, Neutral"),
)

# One ``[text, voice label]`` row per example, in the order listed above.
examples = [[text, voice] for text, voice in _FIXED_VOICE_EXAMPLES]

# (sample stem, text) pairs for the "Customization" tab; the stem selects the
# reference recording ``examples/<stem>_sample.wav``.
_CLONING_EXAMPLES = (
    ("cheerful_female",
     "I just tried the new chocolate cake at that bakery downtown - it was absolutely divine! The rich, velvety texture just melted in my mouth."),
    ("casual_male",
     "Hey, did you catch the game last night? That last-minute goal was insane! I couldn't believe my eyes when it happened."),
    ("neutral_female",
     "The new art exhibition at the museum is truly remarkable. The way they've curated modern and classical pieces together creates such an interesting dialogue."),
    ("neutral_male",
     "I've been reading about the latest advancements in renewable energy. The new solar panel efficiency records are quite impressive."),
    ("casual_female",
     "You won't believe what happened at the office today - it was the funniest thing I've seen in ages!"),
)

# One ``[text, reference audio path]`` row per example.
cln_examples = [
    [text, f"examples/{stem}_sample.wav"]
    for stem, text in _CLONING_EXAMPLES
]

# Build the two-tab UI: "Fixed Voices" (predefined voice dropdown) and
# "Customization" (user-supplied reference audio). Both tabs call gradio_tts.
demo = gr.Blocks()

with demo:
    # gradio_tts rejects calls whose OAuth profile is None, so the sign-in
    # button and the sign-in hint are placed at the very top of the page.
    gr.LoginButton()
    gr.Markdown("## Voxtral TTS Demo", elem_classes="markdown")
    gr.Markdown('### Please sign-in to this space by clicking on "Sign in with Hugging Face" above.', elem_classes="markdown")
    gr.Markdown("Voxtral TTS is a text-to-speech model that can synthesize realistic speech. This release includes an open-weight model with fixed voices, and our proprietary model with voice customization capabilities.\n\nTest the full extent of our Voxtral TTS model in this demo space, or visit our [AI Studio](https://console.mistral.ai/build/audio/text-to-speech) for a better experience. For our open-weights release, learn more about it [here](https://huggingface.co/mistralai/Voxtral-4B-TTS-2603).", elem_classes="markdown")

    with gr.Tabs():
        # --- Tab 1: synthesize with one of the labels in voice_mapping ---
        with gr.TabItem("Fixed Voices"):
            gr.Markdown("# Fixed Voices", elem_classes="markdown")
            gr.Markdown("Enter text to synthesize and select a predefined voice available through our AI Studio.", elem_classes="markdown")
            with gr.Row():
                # Left column: inputs and the submit button.
                with gr.Column(elem_classes="gradio-box"):
                    input_text_predefined = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox"
                    )
                    # Choices are the display labels; gradio_tts translates the
                    # selected label to a voice id through voice_mapping.
                    audio_choice = gr.Dropdown(
                        label="Select a predefined voice",
                        choices=list(voice_mapping.keys()),
                        value="EN - Jane, Curious",
                    )
                    submit_btn_predefined = gr.Button("Generate Audio", elem_classes="gradio-button")
                # Right column: player for the generated audio.
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_predefined = gr.Audio(label="Generated audio", elem_classes="gradio-audio", autoplay=True, buttons=[])

            # The two inputs fill gradio_tts's first two parameters
            # (input_text, audio_choice).
            # NOTE(review): `profile` is presumably injected by Gradio from the
            # gr.OAuthProfile annotation -- confirm against the Gradio OAuth docs.
            submit_btn_predefined.click(
                fn=gradio_tts,
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
                concurrency_limit=1,
            )
            # Example rows pre-fill the text box and dropdown; results are not
            # cached (cache_examples=False).
            gr.Examples(
                examples=examples,
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
                fn=gradio_tts,
                cache_examples=False,
            )
        # --- Tab 2: synthesize with a user-provided reference recording ---
        with gr.TabItem("Customization"):
            gr.Markdown("# Customization", elem_classes="markdown")
            gr.Markdown(
                "Enter text to synthesize and upload your own reference audio through our AI Studio. "
                "A set of 5 examples is provided using 5 audio samples.",
                elem_classes="markdown"
            )
            with gr.Row():
                # Left column: inputs and the submit button.
                with gr.Column(elem_classes="gradio-box"):
                    input_text_cloning = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox"
                    )
                    # type="filepath": the handler receives a path string, from
                    # either a file upload or a microphone recording.
                    uploaded_audio = gr.Audio(
                        label="Upload your reference audio (5s-30s)",
                        type="filepath",
                        sources=["upload", "microphone"],
                        elem_classes="gradio-audio",
                        buttons=[],
                    )
                    submit_btn_cloning = gr.Button("Generate Audio", elem_classes="gradio-button")
                # Right column: player for the generated audio.
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_cloning = gr.Audio(label="Generated audio", elem_classes="gradio-audio", autoplay=True, buttons=[])

            # Only two inputs are wired, so the audio path lands in gradio_tts's
            # second parameter (`audio_choice`), not in its `uploaded_audio`
            # keyword. This works because gradio_tts falls back to the raw value
            # when it is not a key of voice_mapping.
            submit_btn_cloning.click(
                fn=gradio_tts,
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
                concurrency_limit=1,
            )
            # Example rows pre-fill the text box and the reference audio with
            # the bundled samples; results are not cached.
            gr.Examples(
                examples=cln_examples,
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
                fn=gradio_tts,
                cache_examples=False,
            )

# Configure the request queue with max_size=10. This runs at import time as
# well, because it sits outside the __main__ guard below.
demo.queue(max_size=10)
if __name__ == "__main__":
    # Start the app without a public share link, applying the stylesheet text
    # read from styles.css.
    demo.launch(share=False, css=css)