import gradio as gr
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
import spaces
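
# Dependency note (assumptions, not pinned in this file): Voxtral support requires a
# recent transformers release, and its processor relies on the mistral-common package
# for audio handling. `spaces` only matters on Hugging Face ZeroGPU hardware, where
# the @spaces.GPU decorator requests a GPU for the decorated function; elsewhere it
# is effectively a no-op.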

#### Functions
@spaces.GPU  # assumed ZeroGPU setup: `spaces` is imported above, so this decorator appears intended
def process_transcript(language: str, audio_path: str) -> str:
    """Process the audio file and return its transcription.

    Args:
        language: The language of the audio.
        audio_path: The path to the audio file.

    Returns:
        The transcribed text of the audio.
    """
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."
    id_language = dict_languages[language]
    # Build Voxtral's dedicated transcription prompt for the selected language.
    inputs = processor.apply_transcription_request(language=id_language, audio=audio_path, model_id=model_name)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    # generate() returns prompt + completion; slice off the prompt tokens before decoding.
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]

###
@spaces.GPU  # ZeroGPU assumption, as above
def process_translate(language: str, audio_path: str) -> str:
    """Translate the audio file's content into the selected language.

    Args:
        language: The target language for the translation.
        audio_path: The path to the audio file.

    Returns:
        The translated text of the audio.
    """
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."
    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio",
                    "path": audio_path,
                },
                {"type": "text", "text": "Translate this into " + language},
            ],
        }
    ]
    inputs = processor.apply_chat_template(conversation)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]
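
# Note: there is no dedicated translation request builder; translation is phrased as
# an ordinary chat turn pairing the audio with a "Translate this into <language>"
# text instruction, so the model's instruction-following handles the rest.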
###
@spaces.GPU  # ZeroGPU assumption, as above
def process_chat(question: str, audio_path: str) -> str:
    """Answer a free-form question about the audio file.

    Args:
        question: The question to ask about the audio.
        audio_path: The path to the audio file.

    Returns:
        The model's answer.
    """
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."
    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio",
                    "path": audio_path,
                },
                {"type": "text", "text": question},
            ],
        }
    ]
    inputs = processor.apply_chat_template(conversation)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=500)
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]
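
# Note: the chat path caps generation at 500 new tokens, unlike transcription and
# translation which use MAX_TOKENS; presumably answers are expected to stay short.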
###
def disable_buttons():
    # Lock all three task buttons while a request is running.
    return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)

def enable_buttons():
    # Unlock the task buttons once the request has finished.
    return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)

###
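
# The click handlers at the bottom of the file chain three steps with .then():
# disable the buttons, run the (potentially long) model call, then re-enable them.
# This keeps users from stacking up concurrent generate() calls on one shared model.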

### Initializations
MAX_TOKENS = 32000  # matches Voxtral's 32k-token context window

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"*** Device: {device}")

model_name = 'mistralai/Voxtral-Mini-3B-2507'
processor = AutoProcessor.from_pretrained(model_name)
model = VoxtralForConditionalGeneration.from_pretrained(model_name,
                                                        torch_dtype=torch.bfloat16,
                                                        device_map=device)
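
# Rough memory note (assumption, not measured here): a ~3B-parameter model in
# bfloat16 needs about 2 bytes per parameter for weights alone, so plan for
# roughly 8-10 GB of GPU memory once activations and audio features are included.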

# Supported languages
dict_languages = {"English": "en",
                  "French": "fr",
                  "German": "de",
                  "Spanish": "es",
                  "Italian": "it",
                  "Portuguese": "pt",
                  "Dutch": "nl",
                  "Hindi": "hi"}

#### Gradio interface
with gr.Blocks(title="Voxtral") as voxtral:
    gr.Markdown("# **Voxtral Mini Evaluation**")
    gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
capabilities while retaining best-in-class text performance.
#### It excels at speech transcription, translation and audio understanding.""")
    with gr.Accordion("🔎 More on Voxtral", open=False):
        gr.Markdown("""## **Key Features:**
#### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
##### - **Long-form context**: With a 32k-token context length, Voxtral handles audio up to 30 minutes for transcription, or 40 minutes for understanding
##### - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
##### - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world's most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
##### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
    gr.Markdown("### **1. Upload an audio file, record via microphone, or select a demo file:**")
    gr.Markdown("### *(Voxtral handles audio up to 30 minutes for transcription)*")
    with gr.Row():
        sel_audio = gr.Audio(sources=["upload", "microphone"], type="filepath",
                             label="Select an audio file to process:")
        example = [["mapo_tofu.mp3"]]
        gr.Examples(
            examples=example,
            inputs=sel_audio,
            outputs=None,
            fn=None,
            cache_examples=False,
            run_on_click=False
        )
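        # Assumption: the demo clip mapo_tofu.mp3 ships alongside app.py in the
        # Space repository; gr.Examples here only fills the audio input and does
        # not auto-run any task (fn=None, run_on_click=False).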
    with gr.Row():
        gr.Markdown("### **2. Choose one of these tasks:**")
    with gr.Row():
        with gr.Column():
            with gr.Accordion("📝 Transcription", open=True):
                sel_language = gr.Dropdown(
                    choices=list(dict_languages.keys()),
                    value="English",
                    label="Select the language of the audio file:"
                )
                submit_transcript = gr.Button("Extract transcription", variant="primary")
                text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
        with gr.Column():
            with gr.Accordion("🔁 Translation", open=True):
                sel_translate_language = gr.Dropdown(
                    choices=list(dict_languages.keys()),
                    value="English",
                    label="Select the language for translation:"
                )
                submit_translate = gr.Button("Translate audio file", variant="primary")
                text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
        with gr.Column():
            with gr.Accordion("🤖 Ask the audio file", open=True):
                question_chat = gr.Textbox(label="Enter your question about the audio file:",
                                           placeholder="Enter your question about the audio file")
                submit_chat = gr.Button("Ask the audio file", variant="primary")
                example_chat = [["What is the subject of this audio file?"], ["Quels sont les ingrédients ?"]]
                gr.Examples(
                    examples=example_chat,
                    inputs=question_chat,
                    outputs=None,
                    fn=None,
                    cache_examples=False,
                    run_on_click=False
                )
                text_chat = gr.Textbox(label="💬 Model answer", lines=10)
    ### Processing
    # Transcription
    submit_transcript.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_transcript,
        inputs=[sel_language, sel_audio],
        outputs=text_transcript
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )
    # Translation
    submit_translate.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_translate,
        inputs=[sel_translate_language, sel_audio],
        outputs=text_translate
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )
    # Chat
    submit_chat.click(
        disable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
        trigger_mode="once",
    ).then(
        fn=process_chat,
        inputs=[question_chat, sel_audio],
        outputs=text_chat
    ).then(
        enable_buttons,
        outputs=[submit_transcript, submit_translate, submit_chat],
    )

### Launch the app
if __name__ == "__main__":
    voxtral.queue().launch()
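
# To try this locally (assuming the dependencies noted at the top are installed),
# save the file as app.py and run `python app.py`. queue() enables request queuing,
# so overlapping clicks wait their turn instead of hitting the model concurrently.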