"""Gradio app: audio transcription with Mistral's Voxtral-Mini-3B model.

Lets the user pick the audio language, upload or record audio, and get a
text transcription (Voxtral handles clips up to ~30 minutes).
"""
import gradio as gr
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
import spaces

# Upper bound on generated tokens — transcripts of long audio can be large.
MAX_TOKENS = 32000

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"*** Device: {device}")

model_name = 'mistralai/Voxtral-Mini-3B-2507'
processor = AutoProcessor.from_pretrained(model_name)
model = VoxtralForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map=device
)

# Supported languages: display name -> ISO 639-1 code passed to the processor.
dict_languages = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Hindi": "hi",
}


@spaces.GPU
def process_transcript(model, processor, language, audio_path):
    """Transcribe *audio_path* with the Voxtral model.

    Args:
        model: loaded VoxtralForConditionalGeneration instance.
        processor: matching AutoProcessor for the model.
        language: display name, must be a key of ``dict_languages``.
        audio_path: filepath to the audio clip, or None if nothing was provided.

    Returns:
        The decoded transcription string, or a user-facing message when no
        audio was supplied.
    """
    print("*** DANS PROCESS")
    if audio_path is None:
        print("***** audio_path VIDE")
        return "Please provide some input audio: either upload an audio file or use the microphone."

    id_language = dict_languages[language]
    # NOTE(review): `apply_transcrition_request` (sic) is the method name
    # actually shipped by transformers for VoxtralProcessor — keep the
    # misspelling; renaming it would break on current versions.
    inputs = processor.apply_transcrition_request(
        language=id_language, audio=audio_path, model_id=model_name
    )
    inputs = inputs.to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    # Slice off the prompt tokens so only the newly generated text is decoded.
    decoded_outputs = processor.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
    )
    return decoded_outputs[0]


# Define Gradio interface
with gr.Blocks(title="Transcription") as transcript:
    gr.Markdown("# Audio Transcription")
    gr.Markdown("#### Choose the language of the audio and the model, then set an audio file to get its transcription.")
    gr.Markdown("#### **(Voxtral handles audios up to 30 minutes for transcription)**")

    with gr.Row():
        with gr.Column():
            sel_language = gr.Dropdown(
                choices=list(dict_languages.keys()),
                value="English",
                label="Select the language of the audio file:"
            )
        with gr.Column():
            sel_audio = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Upload an audio file or record via microphone:"
            )

    with gr.Row():
        with gr.Column():
            submit_transcript = gr.Button("Extract Transcription", variant="primary")

    text_transcript = gr.Textbox(label="Generated Response", lines=10)

    # BUG FIX: the example fn takes (language, audio) but only the audio
    # component was wired as input, so cached-example execution failed with a
    # missing-argument error. Supply both components and both example values.
    example = [["English", "mapo_tofu.mp3"]]
    gr.Examples(
        examples=example,
        inputs=[sel_language, sel_audio],
        outputs=text_transcript,
        fn=lambda v1, v2: process_transcript(model, processor, v1, v2),
        cache_examples=True  # run the examples at app load time
    )

    submit_transcript.click(
        fn=lambda v1, v2: process_transcript(model, processor, v1, v2),
        inputs=[sel_language, sel_audio],
        outputs=text_transcript
    )

# Launch the app
if __name__ == "__main__":
    transcript.launch(share=True)