# Spaces:
# Running
# Running
# File size: 10,635 Bytes
import os
import tempfile
import time

import gradio as gr

from tts import synthesize_and_save_audio
def generate_tts(input_text, reference_audio_path, output_path="cloned.wav"):
    """Synthesize speech for ``input_text`` and save it to ``output_path``.

    The request goes through ``synthesize_and_save_audio`` using the
    ``voxtral-mini-tts-2603`` model and the API key read from the
    ``MISTRAL_API_KEY`` environment variable. A call that raises is retried,
    for at most three attempts, sleeping 1s and then 2s between attempts.

    Args:
        input_text: Text to synthesize.
        reference_audio_path: Value forwarded as ``voice_id`` (the caller in
            this file passes either a predefined voice identifier or the path
            of a reference audio file).
        output_path: Destination file for the generated audio.

    Returns:
        ``output_path`` when ``synthesize_and_save_audio`` returns ``0``,
        otherwise ``None``.

    Raises:
        gr.Error: If all attempts raised an exception.
    """
    max_attempts = 3
    failure_message = "An error occurred. Make sure you have selected an available voice, or that your reference audio is not longer than 30s (we recommend between 5s and 30s) and a valid mp3/wav file."

    for attempt in range(1, max_attempts + 1):
        try:
            result = synthesize_and_save_audio(
                input_text=input_text,
                voice_id=reference_audio_path,
                model="voxtral-mini-tts-2603",
                api_key=os.getenv("MISTRAL_API_KEY"),
                output_path=output_path,
            )
        except Exception as e:
            print(e)
            # Linear backoff, but only when another attempt will follow;
            # sleeping after the final failure would just delay the error.
            if attempt < max_attempts:
                time.sleep(attempt)
        else:
            # A return code of 0 is treated as success by this code.
            return output_path if result == 0 else None

    # Raised exactly once, after every attempt has failed.
    raise gr.Error(failure_message)
def gradio_tts(input_text, audio_choice, uploaded_audio=None, profile: gr.OAuthProfile | None = None):
    """Gradio event handler: validate the request and return generated audio.

    Args:
        input_text: Text to synthesize.
        audio_choice: Either a label from ``voice_mapping`` (translated to its
            voice id) or any other value, which is forwarded unchanged. The
            "Customization" tab passes the reference audio file path here.
        uploaded_audio: Optional reference audio path; takes precedence over
            ``audio_choice`` when provided.
        profile: OAuth profile of the signed-in user, ``None`` when the user
            is not signed in (presumably injected by Gradio from the
            annotation - confirm against the Gradio OAuth docs).

    Returns:
        Path of the generated audio file, or ``None`` when ``generate_tts``
        reports a non-success result.

    Raises:
        gr.Error: If the user is not signed in, the text is empty, no voice or
            reference audio was provided, or synthesis failed.
    """
    if profile is None:
        raise gr.Error('You must sign in to the Space to use this feature, please click on "Sign in with Hugging Face".')

    # Reject empty input up front instead of sending it to the API and
    # surfacing a generic failure after several retries.
    if not input_text or not input_text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    if uploaded_audio is not None:
        reference_audio = uploaded_audio
    else:
        reference_audio = voice_mapping.get(audio_choice, audio_choice)
    if not reference_audio:
        raise gr.Error("Please select a voice or provide a reference audio before generating audio.")

    # Use a unique output file per request: the two "Generate Audio" handlers
    # have independent concurrency limits, so a shared "cloned.wav" could be
    # overwritten by another request before it is served.
    fd, output_path = tempfile.mkstemp(prefix="cloned_", suffix=".wav")
    os.close(fd)

    generated_audio = None
    try:
        generated_audio = generate_tts(input_text, reference_audio, output_path)
        return generated_audio
    except Exception as e:
        print(f"Error: {e}")
        raise gr.Error("An error occurred. Make sure you have selected an available voice, or that your reference audio is not longer than 30s (we recommend between 5s and 30s) and a valid mp3/wav file.") from e
    finally:
        # Nothing usable was produced: do not leave an empty temp file behind.
        if generated_audio is None:
            try:
                os.remove(output_path)
            except OSError:
                pass
# Load the custom stylesheet that is passed to demo.launch(). Decode it
# explicitly as UTF-8 so the result does not depend on the host locale.
with open("styles.css", "r", encoding="utf-8") as f:
    css = f.read()
# Predefined voices, one row per speaker:
# (label language, speaker name, voice-id prefix, emotions in display order).
_VOICE_CATALOG = (
    ("EN", "Jane", "gb_jane", (
        "Sarcasm", "Confused", "Shameful", "Sad", "Neutral",
        "Jealousy", "Frustrated", "Curious", "Confident",
    )),
    ("EN", "Paul", "en_paul", (
        "Sad", "Neutral", "Happy", "Frustrated", "Excited",
        "Confident", "Cheerful", "Angry",
    )),
    ("EN", "Oliver", "gb_oliver", (
        "Neutral", "Sad", "Excited", "Curious",
        "Confident", "Cheerful", "Angry",
    )),
    ("FR", "Marie", "fr_marie", (
        "Sad", "Neutral", "Happy", "Excited", "Curious", "Angry",
    )),
)

# Maps the label shown in the voice dropdown ("EN - Jane, Sad") to the voice
# id sent to the TTS backend ("gb_jane_sad"). Insertion order is the order in
# which the labels are offered to the user.
voice_mapping = {
    f"{language} - {speaker}, {emotion}": f"{prefix}_{emotion.lower()}"
    for language, speaker, prefix, emotions in _VOICE_CATALOG
    for emotion in emotions
}
# Display labels for the bundled reference recordings. Each label maps to
# "examples/<label lower-cased, spaces as underscores>_sample.wav".
_FIXED_VOICE_LABELS = (
    "Cheerful Female",
    "Casual Male",
    "Neutral Female",
    "Neutral Male",
    "Casual Female",
)

fixed_voice_mapping = {
    label: "examples/" + label.lower().replace(" ", "_") + "_sample.wav"
    for label in _FIXED_VOICE_LABELS
}
# Sample (text, voice label) pairs offered in the "Fixed Voices" tab.
_FIXED_VOICE_PROMPTS = (
    ("Welcome to our AI demonstration. Let me show you how this works.", "EN - Jane, Neutral"),
    ("Regardez comme cette peinture est magnifique! Les couleurs sont si vives et harmonieuses.", "FR - Marie, Curious"),
    ("The results of the experiment were even better than we expected. This could change everything!", "EN - Oliver, Excited"),
    ("I’m not sure how to solve this problem, but I’ll keep trying until I figure it out.", "EN - Jane, Confused"),
    ("The weather today is absolutely perfect for a picnic in the park. Don’t you think?", "EN - Oliver, Cheerful"),
    ("I’m confident this project will be a success if we stay focused and work together.", "EN - Paul, Confident"),
    ("Bonjour! Je suis ravie de vous rencontrer aujourd’hui. Comment puis-je vous aider?", "FR - Marie, Happy"),
    ("I’ve always wondered how birds know exactly when to migrate south for the winter.", "EN - Jane, Curious"),
    ("This new software update is going to make our workflow so much faster and easier!", "EN - Oliver, Excited"),
    ("I’m really sorry to hear about what happened. Is there anything I can do to help?", "EN - Paul, Sad"),
    ("Oh no! I think I left my keys at the office. This is going to be a problem.", "EN - Jane, Frustrated"),
    ("Je ne peux pas croire qu’ils aient annulé le concert à la dernière minute! C’est inacceptable!", "FR - Marie, Angry"),
    ("La présentation était incroyable! J’ai appris tellement de choses nouvelles aujourd’hui.", "FR - Marie, Happy"),
    ("I’m really proud of what we’ve accomplished as a team. This milestone is just the beginning.", "EN - Paul, Confident"),
    ("I can’t believe how quickly time flies. It feels like just yesterday we started this project.", "EN - Jane, Neutral"),
)

# gr.Examples receives one [text, voice label] row per sample.
examples = [[prompt, voice_label] for prompt, voice_label in _FIXED_VOICE_PROMPTS]
# Sample prompts for the "Customization" tab, keyed by the stem of the
# bundled reference recording ("examples/<stem>_sample.wav").
_CLONING_PROMPTS = (
    ("cheerful_female", "I just tried the new chocolate cake at that bakery downtown - it was absolutely divine! The rich, velvety texture just melted in my mouth."),
    ("casual_male", "Hey, did you catch the game last night? That last-minute goal was insane! I couldn't believe my eyes when it happened."),
    ("neutral_female", "The new art exhibition at the museum is truly remarkable. The way they've curated modern and classical pieces together creates such an interesting dialogue."),
    ("neutral_male", "I've been reading about the latest advancements in renewable energy. The new solar panel efficiency records are quite impressive."),
    ("casual_female", "You won't believe what happened at the office today - it was the funniest thing I've seen in ages!"),
)

# gr.Examples receives one [text, reference audio path] row per sample.
cln_examples = [
    [prompt, "examples/" + stem + "_sample.wav"]
    for stem, prompt in _CLONING_PROMPTS
]
# UI definition: a login button, an introduction, and two tabs that both call
# gradio_tts - one with a predefined voice label, one with a reference audio.
with gr.Blocks() as demo:
    # gradio_tts refuses to run without a signed-in profile, so the login
    # button and the sign-in hint come first.
    gr.LoginButton()
    gr.Markdown("## Voxtral TTS Demo", elem_classes="markdown")
    gr.Markdown(
        '### Please sign-in to this space by clicking on "Sign in with Hugging Face" above.',
        elem_classes="markdown",
    )
    gr.Markdown(
        "Voxtral TTS is a text-to-speech model that can synthesize realistic speech. "
        "This release includes an open-weight model with fixed voices, and our proprietary model with voice customization capabilities."
        "\n\n"
        "Test the full extent of our Voxtral TTS model in this demo space, or visit our "
        "[AI Studio](https://console.mistral.ai/build/audio/text-to-speech) for a better experience. "
        "For our open-weights release, learn more about it "
        "[here](https://huggingface.co/mistralai/Voxtral-4B-TTS-2603).",
        elem_classes="markdown",
    )

    with gr.Tabs():
        # --- Tab 1: synthesize with one of the predefined voices. ---
        with gr.TabItem("Fixed Voices"):
            gr.Markdown("# Fixed Voices", elem_classes="markdown")
            gr.Markdown(
                "Enter text to synthesize and select a predefined voice available through our AI Studio.",
                elem_classes="markdown",
            )
            with gr.Row():
                with gr.Column(elem_classes="gradio-box"):
                    input_text_predefined = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox",
                    )
                    audio_choice = gr.Dropdown(
                        label="Select a predefined voice",
                        choices=list(voice_mapping),
                        value="EN - Jane, Curious",
                    )
                    submit_btn_predefined = gr.Button(
                        "Generate Audio",
                        elem_classes="gradio-button",
                    )
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_predefined = gr.Audio(
                        label="Generated audio",
                        elem_classes="gradio-audio",
                        autoplay=True,
                        buttons=[],
                    )
            submit_btn_predefined.click(
                fn=gradio_tts,
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
                concurrency_limit=1,
            )
            gr.Examples(
                examples=examples,
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
                fn=gradio_tts,
                cache_examples=False,
            )

        # --- Tab 2: synthesize with a user-provided reference recording. ---
        with gr.TabItem("Customization"):
            gr.Markdown("# Customization", elem_classes="markdown")
            gr.Markdown(
                "Enter text to synthesize and upload your own reference audio through our AI Studio. "
                "A set of 5 examples is provided using 5 audio samples.",
                elem_classes="markdown",
            )
            with gr.Row():
                with gr.Column(elem_classes="gradio-box"):
                    input_text_cloning = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox",
                    )
                    uploaded_audio = gr.Audio(
                        label="Upload your reference audio (5s-30s)",
                        type="filepath",
                        sources=["upload", "microphone"],
                        elem_classes="gradio-audio",
                        buttons=[],
                    )
                    submit_btn_cloning = gr.Button(
                        "Generate Audio",
                        elem_classes="gradio-button",
                    )
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_cloning = gr.Audio(
                        label="Generated audio",
                        elem_classes="gradio-audio",
                        autoplay=True,
                        buttons=[],
                    )
            # The reference audio path is passed positionally, so it arrives
            # in gradio_tts as its second argument (audio_choice).
            submit_btn_cloning.click(
                fn=gradio_tts,
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
                concurrency_limit=1,
            )
            gr.Examples(
                examples=cln_examples,
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
                fn=gradio_tts,
                cache_examples=False,
            )

# Enable the request queue with room for at most 10 entries.
demo.queue(max_size=10)
# Start the Gradio app only when this file is executed as a script; the
# stylesheet loaded from styles.css is applied at launch.
if __name__ == "__main__":
    demo.launch(share=False, css=css)