Spaces:
Running
Running
| import gradio as gr | |
| import os | |
| from tts import synthesize_and_save_audio | |
| import time | |
def generate_tts(input_text, reference_audio_path, output_path="cloned.wav"):
    """Synthesize ``input_text`` into an audio file, retrying on failure.

    Calls ``synthesize_and_save_audio`` with the ``voxtral-mini-tts-2603``
    model and the API key read from the ``MISTRAL_API_KEY`` environment
    variable. Up to three attempts are made, pausing 1s and then 2s between
    consecutive attempts.

    Args:
        input_text: Text to synthesize.
        reference_audio_path: Forwarded as ``voice_id``. Callers in this file
            pass either a predefined voice identifier or the path of an
            uploaded reference clip.
        output_path: Destination forwarded to ``synthesize_and_save_audio``.

    Returns:
        ``output_path`` when the synthesis call returns ``0``, else ``None``.

    Raises:
        gr.Error: If every attempt raised an exception.
    """
    max_attempts = 3
    last_error = None

    for attempt in range(1, max_attempts + 1):
        try:
            result = synthesize_and_save_audio(
                input_text=input_text,
                voice_id=reference_audio_path,
                model="voxtral-mini-tts-2603",
                api_key=os.getenv("MISTRAL_API_KEY"),
                output_path=output_path,
            )
        except Exception as exc:
            print(exc)
            last_error = exc
            # Back off only when another attempt will follow; sleeping after
            # the final failure would just delay the error shown to the user.
            if attempt < max_attempts:
                time.sleep(attempt)
        else:
            return output_path if result == 0 else None

    # All attempts failed: surface one user-facing error, keeping the cause.
    raise gr.Error("An error occurred. Make sure you have selected an available voice, or that your reference audio is not longer than 30s (we recommend between 5s and 30s) and a valid mp3/wav file.") from last_error
def gradio_tts(input_text, audio_choice, uploaded_audio=None, profile: gr.OAuthProfile | None = None):
    """Gradio event handler: turn the submitted text into speech.

    Both tabs of the UI bind this handler with two inputs. The "Fixed Voices"
    tab passes the dropdown label in ``audio_choice``; the "Customization" tab
    passes the uploaded clip's file path in that same position. A label found
    in ``voice_mapping`` is translated to its voice identifier; any other
    value is forwarded unchanged.

    Args:
        input_text: Text to synthesize.
        audio_choice: Predefined voice label, or a reference-audio file path.
        uploaded_audio: Optional reference-audio path; takes precedence over
            ``audio_choice`` when given.
        profile: Signed-in user's profile. Expected to be supplied by Gradio's
            OAuth integration via the type annotation (see Gradio OAuth docs);
            ``None`` means the user is not signed in.

    Returns:
        Path of the generated audio file, or ``None`` when the synthesis call
        reported a non-zero result.

    Raises:
        gr.Error: If the user is not signed in, the inputs are incomplete, or
            synthesis fails.
    """
    import tempfile

    if profile is None:
        raise gr.Error('You must sign in to the Space to use this feature, please click on "Sign in with Hugging Face".')

    # Reject incomplete requests up front instead of spending API retries on
    # them and reporting only the generic failure message.
    if not input_text or not input_text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    if uploaded_audio is not None:
        reference_audio = uploaded_audio
    else:
        reference_audio = voice_mapping.get(audio_choice, audio_choice)
    if not reference_audio:
        raise gr.Error("Please select a voice or provide a reference audio clip.")

    # Use a unique output file per request: the two tabs have independent
    # concurrency limits, so a shared fixed filename could be overwritten by a
    # concurrent request before Gradio has picked the result up.
    fd, output_path = tempfile.mkstemp(prefix="voxtral_", suffix=".wav")
    os.close(fd)

    generated_audio = None
    try:
        generated_audio = generate_tts(input_text, reference_audio, output_path)
        return generated_audio
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        print(f"Error: {e}")
        raise gr.Error("An error occurred. Make sure you have selected an available voice, or that your reference audio is not longer than 30s (we recommend between 5s and 30s) and a valid mp3/wav file.") from e
    finally:
        # Nothing usable was produced: do not leave the placeholder file behind.
        if generated_audio is None:
            try:
                os.remove(output_path)
            except OSError:
                pass
# Load the custom stylesheet once at import time; it is handed to
# ``demo.launch`` at the bottom of the file. The encoding is pinned to UTF-8
# so the result does not depend on the host's locale settings.
with open("styles.css", "r", encoding="utf-8") as f:
    css = f.read()
# Predefined voices, described as (language tag, speaker, identifier prefix,
# emotions). Each emotion expands to one dropdown label of the form
# "<LANG> - <Speaker>, <Emotion>" mapped to "<prefix>_<emotion lowercased>".
# Order matters: it is the order in which the voices appear in the dropdown.
_VOICE_CATALOG = (
    (
        "EN",
        "Jane",
        "gb_jane",
        (
            "Sarcasm",
            "Confused",
            "Shameful",
            "Sad",
            "Neutral",
            "Jealousy",
            "Frustrated",
            "Curious",
            "Confident",
        ),
    ),
    (
        "EN",
        "Paul",
        "en_paul",
        (
            "Sad",
            "Neutral",
            "Happy",
            "Frustrated",
            "Excited",
            "Confident",
            "Cheerful",
            "Angry",
        ),
    ),
    (
        "EN",
        "Oliver",
        "gb_oliver",
        (
            "Neutral",
            "Sad",
            "Excited",
            "Curious",
            "Confident",
            "Cheerful",
            "Angry",
        ),
    ),
    (
        "FR",
        "Marie",
        "fr_marie",
        (
            "Sad",
            "Neutral",
            "Happy",
            "Excited",
            "Curious",
            "Angry",
        ),
    ),
)

# Display label -> voice identifier (the value ends up as ``voice_id`` in the
# synthesis call made by ``generate_tts``).
voice_mapping = {
    f"{language} - {speaker}, {emotion}": f"{prefix}_{emotion.lower()}"
    for language, speaker, prefix, emotions in _VOICE_CATALOG
    for emotion in emotions
}
# Label -> bundled sample clip under ``examples/``. Each path is derived from
# its label: lowercased, spaces turned into underscores, "_sample.wav" added.
# NOTE(review): not referenced anywhere else in the visible part of this file.
fixed_voice_mapping = {
    label: f"examples/{label.lower().replace(' ', '_')}_sample.wav"
    for label in (
        "Cheerful Female",
        "Casual Male",
        "Neutral Female",
        "Neutral Male",
        "Casual Female",
    )
}
# Example rows for the "Fixed Voices" tab. Each row is
# [text to synthesize, voice label]; the label is a key of ``voice_mapping``.
examples = [
    [
        "Welcome to our AI demonstration. Let me show you how this works.",
        "EN - Jane, Neutral",
    ],
    [
        "Regardez comme cette peinture est magnifique! Les couleurs sont si vives et harmonieuses.",
        "FR - Marie, Curious",
    ],
    [
        "The results of the experiment were even better than we expected. This could change everything!",
        "EN - Oliver, Excited",
    ],
    [
        "I’m not sure how to solve this problem, but I’ll keep trying until I figure it out.",
        "EN - Jane, Confused",
    ],
    [
        "The weather today is absolutely perfect for a picnic in the park. Don’t you think?",
        "EN - Oliver, Cheerful",
    ],
    [
        "I’m confident this project will be a success if we stay focused and work together.",
        "EN - Paul, Confident",
    ],
    [
        "Bonjour! Je suis ravie de vous rencontrer aujourd’hui. Comment puis-je vous aider?",
        "FR - Marie, Happy",
    ],
    [
        "I’ve always wondered how birds know exactly when to migrate south for the winter.",
        "EN - Jane, Curious",
    ],
    [
        "This new software update is going to make our workflow so much faster and easier!",
        "EN - Oliver, Excited",
    ],
    [
        "I’m really sorry to hear about what happened. Is there anything I can do to help?",
        "EN - Paul, Sad",
    ],
    [
        "Oh no! I think I left my keys at the office. This is going to be a problem.",
        "EN - Jane, Frustrated",
    ],
    [
        "Je ne peux pas croire qu’ils aient annulé le concert à la dernière minute! C’est inacceptable!",
        "FR - Marie, Angry",
    ],
    [
        "La présentation était incroyable! J’ai appris tellement de choses nouvelles aujourd’hui.",
        "FR - Marie, Happy",
    ],
    [
        "I’m really proud of what we’ve accomplished as a team. This milestone is just the beginning.",
        "EN - Paul, Confident",
    ],
    [
        "I can’t believe how quickly time flies. It feels like just yesterday we started this project.",
        "EN - Jane, Neutral",
    ],
]
# Example rows for the "Customization" tab. Each row is
# [text to synthesize, path of a bundled reference clip].
cln_examples = [
    [
        "I just tried the new chocolate cake at that bakery downtown - it was absolutely divine! The rich, velvety texture just melted in my mouth.",
        "examples/cheerful_female_sample.wav",
    ],
    [
        "Hey, did you catch the game last night? That last-minute goal was insane! I couldn't believe my eyes when it happened.",
        "examples/casual_male_sample.wav",
    ],
    [
        "The new art exhibition at the museum is truly remarkable. The way they've curated modern and classical pieces together creates such an interesting dialogue.",
        "examples/neutral_female_sample.wav",
    ],
    [
        "I've been reading about the latest advancements in renewable energy. The new solar panel efficiency records are quite impressive.",
        "examples/neutral_male_sample.wav",
    ],
    [
        "You won't believe what happened at the office today - it was the funniest thing I've seen in ages!",
        "examples/casual_female_sample.wav",
    ],
]
# Build the UI: a sign-in button and intro text, followed by two tabs that
# share the same ``gradio_tts`` handler.
with gr.Blocks() as demo:
    gr.LoginButton()
    gr.Markdown("## Voxtral TTS Demo", elem_classes="markdown")
    gr.Markdown(
        '### Please sign-in to this space by clicking on "Sign in with Hugging Face" above.',
        elem_classes="markdown",
    )
    gr.Markdown(
        "Voxtral TTS is a text-to-speech model that can synthesize realistic speech. "
        "This release includes an open-weight model with fixed voices, and our "
        "proprietary model with voice customization capabilities.\n\n"
        "Test the full extent of our Voxtral TTS model in this demo space, or visit our "
        "[AI Studio](https://console.mistral.ai/build/audio/text-to-speech) for a better experience. "
        "For our open-weights release, learn more about it "
        "[here](https://huggingface.co/mistralai/Voxtral-4B-TTS-2603).",
        elem_classes="markdown",
    )

    with gr.Tabs():
        # --- Tab 1: synthesis with one of the predefined voices -------------
        with gr.TabItem("Fixed Voices"):
            gr.Markdown("# Fixed Voices", elem_classes="markdown")
            gr.Markdown(
                "Enter text to synthesize and select a predefined voice available through our AI Studio.",
                elem_classes="markdown",
            )

            with gr.Row():
                with gr.Column(elem_classes="gradio-box"):
                    input_text_predefined = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox",
                    )
                    audio_choice = gr.Dropdown(
                        label="Select a predefined voice",
                        choices=list(voice_mapping),
                        value="EN - Jane, Curious",
                    )
                    submit_btn_predefined = gr.Button(
                        "Generate Audio",
                        elem_classes="gradio-button",
                    )
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_predefined = gr.Audio(
                        label="Generated audio",
                        elem_classes="gradio-audio",
                        autoplay=True,
                        buttons=[],
                    )

            # The dropdown label arrives as the handler's ``audio_choice``.
            submit_btn_predefined.click(
                fn=gradio_tts,
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
                concurrency_limit=1,
            )
            gr.Examples(
                examples=examples,
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
                fn=gradio_tts,
                cache_examples=False,
            )

        # --- Tab 2: synthesis from a user-provided reference clip -----------
        with gr.TabItem("Customization"):
            gr.Markdown("# Customization", elem_classes="markdown")
            gr.Markdown(
                "Enter text to synthesize and upload your own reference audio through our AI Studio. "
                "A set of 5 examples is provided using 5 audio samples.",
                elem_classes="markdown",
            )

            with gr.Row():
                with gr.Column(elem_classes="gradio-box"):
                    input_text_cloning = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox",
                    )
                    uploaded_audio = gr.Audio(
                        label="Upload your reference audio (5s-30s)",
                        type="filepath",
                        sources=["upload", "microphone"],
                        elem_classes="gradio-audio",
                        buttons=[],
                    )
                    submit_btn_cloning = gr.Button(
                        "Generate Audio",
                        elem_classes="gradio-button",
                    )
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_cloning = gr.Audio(
                        label="Generated audio",
                        elem_classes="gradio-audio",
                        autoplay=True,
                        buttons=[],
                    )

            # Only two inputs are bound, so the clip's file path is received
            # by the handler in its second positional parameter.
            submit_btn_cloning.click(
                fn=gradio_tts,
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
                concurrency_limit=1,
            )
            gr.Examples(
                examples=cln_examples,
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
                fn=gradio_tts,
                cache_examples=False,
            )
# Configure the request queue with a maximum size of 10.
demo.queue(max_size=10)

# Start the server only when this file is executed as a script; the custom
# stylesheet loaded earlier in the file is applied at launch.
if __name__ == "__main__":
    demo.launch(css=css, share=False)