# Hugging Face Space (ZeroGPU) — Voxtral audio transcription demo
import gradio as gr
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
import spaces  # NOTE(review): imported for ZeroGPU; no @spaces.GPU decorator is visible in this file — confirm it is needed

# Maximum number of new tokens to generate per transcription request.
MAX_TOKENS = 32000

# Select the compute device once at startup.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"*** Device: {device}")

# Load the Voxtral model and its processor from the Hugging Face Hub.
model_name = 'mistralai/Voxtral-Mini-3B-2507'
processor = AutoProcessor.from_pretrained(model_name)
model = VoxtralForConditionalGeneration.from_pretrained(model_name,
                                                        torch_dtype=torch.bfloat16,
                                                        device_map=device)

# Languages supported by the transcription UI, mapped to ISO 639-1 codes.
dict_languages = {"English": "en",
                  "French": "fr",
                  "German": "de",
                  "Spanish": "es",
                  "Italian": "it",
                  "Portuguese": "pt",
                  "Dutch": "nl",
                  "Hindi": "hi"}
def process_transcript(language, audio_path, model, processor):
    """Transcribe an audio file with the Voxtral model.

    Args:
        language: Display name of the audio language; must be a key of the
            module-level ``dict_languages`` mapping.
        audio_path: Filesystem path to the audio file provided by Gradio,
            or ``None`` when the user submitted no audio.
        model: The loaded ``VoxtralForConditionalGeneration`` model.
        processor: The matching ``AutoProcessor`` for the model.

    Returns:
        The decoded transcription string (only newly generated tokens).

    Raises:
        gr.Error: If ``audio_path`` is ``None`` (no audio supplied).
    """
    if audio_path is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    id_language = dict_languages[language]
    # NOTE(review): "apply_transcrition_request" (missing 's') is the actual
    # method name shipped in transformers for Voxtral; do not "fix" the
    # spelling without confirming the installed transformers version exposes
    # the corrected alias `apply_transcription_request`.
    inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
    # .to(dtype=...) only casts floating-point tensors; token id tensors keep their integer dtype.
    inputs = inputs.to(device, dtype=torch.bfloat16)

    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    # Strip the prompt tokens so only the generated transcription is decoded.
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]
# Define the Gradio interface: language selector + audio input on the left,
# transcription output on the right.
with gr.Blocks(title="Transcription") as transcript:
    gr.Markdown("# Audio Transcription")
    gr.Markdown("#### Choose the language of the audio and the model, then set an audio file to get its transcription.")
    gr.Markdown("#### **(Voxtral handles audios up to 30 minutes for transcription)**")
    with gr.Row():
        with gr.Column():
            sel_language = gr.Dropdown(
                choices=list(dict_languages.keys()),
                value="English",
                label="Select the language of the audio file:"
            )
            with gr.Column():
                sel_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload an audio file or record via microphone:")
            submit_transcript = gr.Button("Extract Transcription", variant="primary")
        with gr.Column():
            text_transcript = gr.Textbox(label="Generated Response", lines=10)

    # BUG FIX: the original lambda called process_transcript(model, processor, v1, v2),
    # but the function signature is (language, audio_path, model, processor) —
    # the arguments must be passed in that order.
    submit_transcript.click(
        fn=lambda v1, v2: process_transcript(v1, v2, model, processor),
        inputs=[sel_language, sel_audio],
        outputs=text_transcript
    )
# Launch the app when run as a script (share=True exposes a public Gradio link).
if __name__ == "__main__":
    transcript.launch(share=True)