File size: 10,635 Bytes
478642e
 
4c63b06
 
478642e
2c35bc9
7d0d905
4c63b06
 
 
 
 
 
 
 
 
 
 
 
f43c1ec
 
478642e
96acde1
 
941fc87
478642e
140ca3d
478642e
fef0116
478642e
 
 
2c35bc9
478642e
 
 
f43c1ec
478642e
 
 
 
fef0116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14462b4
 
 
 
 
 
 
 
478642e
8574c41
4c63b06
31e8463
 
14462b4
 
 
 
 
 
 
 
 
 
 
478642e
 
4ead1b7
 
 
 
 
 
 
 
fb6f673
0836426
 
7086bf8
478642e
5c923b4
fdcf153
14462b4
478642e
6a69b1e
 
d07a779
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b14f1c4
d07a779
 
96acde1
d07a779
 
d785628
fb6f673
d07a779
 
 
 
96acde1
 
d07a779
adb9bfc
 
6a69b1e
 
2fa0da5
6a69b1e
 
478642e
 
 
 
 
 
 
 
2876313
478642e
0b34b7e
bb167e3
b14f1c4
478642e
 
 
b14f1c4
478642e
 
96acde1
478642e
 
d785628
fb6f673
478642e
4ead1b7
478642e
 
96acde1
 
478642e
 
d785628
478642e
fb6f673
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import os
import tempfile
import time

import gradio as gr

from tts import synthesize_and_save_audio

def generate_tts(input_text, reference_audio_path, output_path="cloned.wav"):
    """Synthesize ``input_text`` into ``output_path``, retrying on failure.

    Calls ``synthesize_and_save_audio`` with the ``voxtral-mini-tts-2603``
    model and the API key read from the ``MISTRAL_API_KEY`` environment
    variable. A call that raises is retried, up to three attempts in total,
    sleeping 1s then 2s between attempts.

    Args:
        input_text: Text to synthesize.
        reference_audio_path: Forwarded as ``voice_id`` to the synthesis call.
        output_path: Destination path forwarded to the synthesis call.

    Returns:
        ``output_path`` when the synthesis call returns ``0``; ``None`` for
        any other return value.

    Raises:
        gr.Error: If every attempt raised an exception.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            result = synthesize_and_save_audio(
                input_text=input_text,
                voice_id=reference_audio_path,
                model="voxtral-mini-tts-2603",
                api_key=os.getenv("MISTRAL_API_KEY"),
                output_path=output_path,
            )
        except Exception as e:
            print(e)
            # Back off only when another attempt follows; sleeping before the
            # final error would just delay the message shown to the user.
            if attempt < max_attempts:
                time.sleep(attempt)
        else:
            return output_path if result == 0 else None
    raise gr.Error("An error occurred. Make sure you have selected an available voice, or that your reference audio is not longer than 30s (we recommend between 5s and 30s) and a valid mp3/wav file.")

def gradio_tts(input_text, audio_choice, uploaded_audio=None, profile: gr.OAuthProfile | None = None):
    """Gradio handler: synthesize ``input_text`` with the selected voice.

    Args:
        input_text: Text to synthesize.
        audio_choice: Either a key of ``voice_mapping`` (translated to its
            voice id) or any other value, which is passed through unchanged.
            The Customization tab supplies its uploaded audio file path in
            this position.
        uploaded_audio: Optional reference audio; when not ``None`` it takes
            precedence over ``audio_choice``.
        profile: OAuth profile of the signed-in user, ``None`` when the
            visitor is not signed in.

    Returns:
        Path of the generated audio file, or ``None`` when ``generate_tts``
        returns ``None``.

    Raises:
        gr.Error: If the user is not signed in, the text is empty, or
            synthesis fails.
    """
    if profile is None:
        raise gr.Error('You must sign in to the Space to use this feature, please click on "Sign in with Hugging Face".')
    if not input_text or not input_text.strip():
        # Fail fast with a precise message instead of spending an API call.
        raise gr.Error("Please enter some text to synthesize.")
    if uploaded_audio is not None:
        reference_audio = uploaded_audio
    else:
        reference_audio = voice_mapping.get(audio_choice, audio_choice)

    # Each request gets its own output file. A shared "cloned.wav" can be
    # overwritten by a request running concurrently on the other tab's
    # listener before this request's result has been picked up.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        generated_audio = generate_tts(input_text, reference_audio, output_path)
    except Exception as e:
        print(f"Error: {e}")
        _remove_quietly(output_path)
        raise gr.Error("An error occurred. Make sure you have selected an available voice, or that your reference audio is not longer than 30s (we recommend between 5s and 30s) and a valid mp3/wav file.") from e
    if generated_audio is None:
        # Nothing usable was produced; do not leave the placeholder behind.
        _remove_quietly(output_path)
    return generated_audio


def _remove_quietly(path):
    """Delete ``path``, ignoring the case where it cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        pass

# Load the custom stylesheet that is handed to ``demo.launch``. The encoding
# is explicit so decoding does not depend on the platform's default locale.
with open("styles.css", encoding="utf-8") as f:
    css = f.read()

# Predefined voices grouped by speaker:
# (label language tag, speaker name, voice-id prefix, emotions in display order).
_VOICE_GROUPS = (
    ("EN", "Jane", "gb_jane", (
        "Sarcasm", "Confused", "Shameful", "Sad", "Neutral",
        "Jealousy", "Frustrated", "Curious", "Confident",
    )),
    ("EN", "Paul", "en_paul", (
        "Sad", "Neutral", "Happy", "Frustrated",
        "Excited", "Confident", "Cheerful", "Angry",
    )),
    ("EN", "Oliver", "gb_oliver", (
        "Neutral", "Sad", "Excited", "Curious",
        "Confident", "Cheerful", "Angry",
    )),
    ("FR", "Marie", "fr_marie", (
        "Sad", "Neutral", "Happy", "Excited", "Curious", "Angry",
    )),
)

# Dropdown label -> voice id, e.g. "EN - Jane, Sad" -> "gb_jane_sad".
# Insertion order is the order shown in the voice dropdown.
voice_mapping = {
    f"{lang} - {speaker}, {emotion}": f"{prefix}_{emotion.lower()}"
    for lang, speaker, prefix, emotions in _VOICE_GROUPS
    for emotion in emotions
}

# Display label -> bundled reference sample under examples/. The file name is
# the lower-cased label with spaces replaced by underscores, plus "_sample.wav".
fixed_voice_mapping = {
    label: f"examples/{label.lower().replace(' ', '_')}_sample.wav"
    for label in (
        "Cheerful Female",
        "Casual Male",
        "Neutral Female",
        "Neutral Male",
        "Casual Female",
    )
}

# (text, predefined voice label) pairs offered on the "Fixed Voices" tab.
_FIXED_VOICE_PROMPTS = (
    ("Welcome to our AI demonstration. Let me show you how this works.", "EN - Jane, Neutral"),
    ("Regardez comme cette peinture est magnifique! Les couleurs sont si vives et harmonieuses.", "FR - Marie, Curious"),
    ("The results of the experiment were even better than we expected. This could change everything!", "EN - Oliver, Excited"),
    ("I’m not sure how to solve this problem, but I’ll keep trying until I figure it out.", "EN - Jane, Confused"),
    ("The weather today is absolutely perfect for a picnic in the park. Don’t you think?", "EN - Oliver, Cheerful"),
    ("I’m confident this project will be a success if we stay focused and work together.", "EN - Paul, Confident"),
    ("Bonjour! Je suis ravie de vous rencontrer aujourd’hui. Comment puis-je vous aider?", "FR - Marie, Happy"),
    ("I’ve always wondered how birds know exactly when to migrate south for the winter.", "EN - Jane, Curious"),
    ("This new software update is going to make our workflow so much faster and easier!", "EN - Oliver, Excited"),
    ("I’m really sorry to hear about what happened. Is there anything I can do to help?", "EN - Paul, Sad"),
    ("Oh no! I think I left my keys at the office. This is going to be a problem.", "EN - Jane, Frustrated"),
    ("Je ne peux pas croire qu’ils aient annulé le concert à la dernière minute! C’est inacceptable!", "FR - Marie, Angry"),
    ("La présentation était incroyable! J’ai appris tellement de choses nouvelles aujourd’hui.", "FR - Marie, Happy"),
    ("I’m really proud of what we’ve accomplished as a team. This milestone is just the beginning.", "EN - Paul, Confident"),
    ("I can’t believe how quickly time flies. It feels like just yesterday we started this project.", "EN - Jane, Neutral"),
)

# One [text, voice label] row per example, in the order listed above.
examples = [[prompt, voice_label] for prompt, voice_label in _FIXED_VOICE_PROMPTS]

# [text, reference audio path] rows offered on the "Customization" tab. Each
# row pairs a prompt with the bundled sample examples/<sample>_sample.wav.
cln_examples = [
    [prompt, f"examples/{sample}_sample.wav"]
    for sample, prompt in (
        ("cheerful_female", "I just tried the new chocolate cake at that bakery downtown - it was absolutely divine! The rich, velvety texture just melted in my mouth."),
        ("casual_male", "Hey, did you catch the game last night? That last-minute goal was insane! I couldn't believe my eyes when it happened."),
        ("neutral_female", "The new art exhibition at the museum is truly remarkable. The way they've curated modern and classical pieces together creates such an interesting dialogue."),
        ("neutral_male", "I've been reading about the latest advancements in renewable energy. The new solar panel efficiency records are quite impressive."),
        ("casual_female", "You won't believe what happened at the office today - it was the funniest thing I've seen in ages!"),
    )
]

# Top-level Blocks container; every component below is created inside it.
demo = gr.Blocks()

with demo:
    # Sign-in button: gradio_tts raises an error when its OAuth profile is None.
    gr.LoginButton()
    gr.Markdown("## Voxtral TTS Demo", elem_classes="markdown")
    gr.Markdown('### Please sign-in to this space by clicking on "Sign in with Hugging Face" above.', elem_classes="markdown")
    gr.Markdown("Voxtral TTS is a text-to-speech model that can synthesize realistic speech. This release includes an open-weight model with fixed voices, and our proprietary model with voice customization capabilities.\n\nTest the full extent of our Voxtral TTS model in this demo space, or visit our [AI Studio](https://console.mistral.ai/build/audio/text-to-speech) for a better experience. For our open-weights release, learn more about it [here](https://huggingface.co/mistralai/Voxtral-4B-TTS-2603).", elem_classes="markdown")

    with gr.Tabs():
        # Tab 1: text plus a voice picked from the voice_mapping labels.
        with gr.TabItem("Fixed Voices"):
            gr.Markdown("# Fixed Voices", elem_classes="markdown")
            gr.Markdown("Enter text to synthesize and select a predefined voice available through our AI Studio.", elem_classes="markdown")
            with gr.Row():
                # Left column: inputs and the submit button.
                with gr.Column(elem_classes="gradio-box"):
                    input_text_predefined = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox"
                    )
                    audio_choice = gr.Dropdown(
                        label="Select a predefined voice",
                        choices=list(voice_mapping.keys()),
                        value="EN - Jane, Curious",
                    )
                    submit_btn_predefined = gr.Button("Generate Audio", elem_classes="gradio-button")
                # Right column: player for the generated audio.
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_predefined = gr.Audio(label="Generated audio", elem_classes="gradio-audio", autoplay=True, buttons=[])

            # Inputs map positionally onto gradio_tts(input_text, audio_choice).
            submit_btn_predefined.click(
                fn=gradio_tts,
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
                concurrency_limit=1,
            )
            # Example rows fill the same two inputs; results are not cached.
            gr.Examples(
                examples=examples,
                inputs=[input_text_predefined, audio_choice],
                outputs=[output_audio_predefined],
                fn=gradio_tts,
                cache_examples=False,
            )
        # Tab 2: text plus a user-supplied reference audio clip.
        with gr.TabItem("Customization"):
            gr.Markdown("# Customization", elem_classes="markdown")
            gr.Markdown(
                "Enter text to synthesize and upload your own reference audio through our AI Studio. "
                "A set of 5 examples is provided using 5 audio samples.",
                elem_classes="markdown"
            )
            with gr.Row():
                # Left column: inputs and the submit button.
                with gr.Column(elem_classes="gradio-box"):
                    input_text_cloning = gr.Textbox(
                        label="Enter text to synthesize",
                        placeholder="Frontier AI in your hands.",
                        elem_classes="gradio-textbox"
                    )
                    # type="filepath": the handler receives a path string.
                    uploaded_audio = gr.Audio(
                        label="Upload your reference audio (5s-30s)",
                        type="filepath",
                        sources=["upload", "microphone"],
                        elem_classes="gradio-audio",
                        buttons=[],
                    )
                    submit_btn_cloning = gr.Button("Generate Audio", elem_classes="gradio-button")
                # Right column: player for the generated audio.
                with gr.Column(elem_classes="gradio-box"):
                    output_audio_cloning = gr.Audio(label="Generated audio", elem_classes="gradio-audio", autoplay=True, buttons=[])

            # NOTE: the audio component is the second input, so its file path
            # arrives in gradio_tts's `audio_choice` parameter, not in
            # `uploaded_audio`.
            submit_btn_cloning.click(
                fn=gradio_tts,
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
                concurrency_limit=1,
            )
            # Example rows fill the text box and the reference-audio input.
            gr.Examples(
                examples=cln_examples,
                inputs=[input_text_cloning, uploaded_audio],
                outputs=[output_audio_cloning],
                fn=gradio_tts,
                cache_examples=False,
            )

# Enable the request queue with max_size=10 at import time; launch only when
# this file is run as a script.
demo.queue(max_size=10)
if __name__ == "__main__":
    # The stylesheet text read from styles.css is supplied at launch.
    demo.launch(share=False, css=css)