# app.py — Voxtral-Mini audio transcription demo (Gradio)
# (Hugging Face Space file; commit 3b10277, verified, ~2.97 kB)
import gradio as gr
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
import spaces
# Upper bound on tokens generated per transcription request.
MAX_TOKENS = 32000
# Prefer GPU when available; everything below is placed on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"*** Device: {device}")
# Voxtral-Mini 3B speech model; processor and weights are loaded once at
# module import (downloads from the Hub on first run).
model_name = 'mistralai/Voxtral-Mini-3B-2507'
processor = AutoProcessor.from_pretrained(model_name)
model = VoxtralForConditionalGeneration.from_pretrained(model_name,
                                                        torch_dtype=torch.bfloat16,
                                                        device_map=device)
# Languages offered in the UI, mapped to the ISO 639-1 codes that the
# Voxtral transcription request expects.
dict_languages = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Hindi": "hi",
}
@spaces.GPU
def process_transcript(language, audio_path, model, processor):
    """Transcribe an audio file with Voxtral and return the text.

    Args:
        language: Display name of the audio's language (a key of
            ``dict_languages``).
        audio_path: Filesystem path to the audio clip, or ``None`` when the
            user submitted without uploading/recording anything.
        model: Loaded ``VoxtralForConditionalGeneration`` instance.
        processor: The matching ``AutoProcessor`` instance.

    Returns:
        The decoded transcription string (first batch element).

    Raises:
        gr.Error: If no audio input was provided.
    """
    # Guard clause: Gradio passes None when neither upload nor mic was used.
    if audio_path is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    id_language = dict_languages[language]
    # NOTE(review): "apply_transcrition_request" is the (misspelled) method
    # name shipped by transformers for Voxtral — keep as-is until upgrading
    # to a release that exposes the corrected spelling.
    inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
    inputs = inputs.to(device, dtype=torch.bfloat16)
    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    # Slice off the prompt tokens so only newly generated text is decoded.
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return decoded_outputs[0]
# Define Gradio interface: language selector and audio input on the left,
# generated transcription on the right.
with gr.Blocks(title="Transcription") as transcript:
    gr.Markdown("# Audio Transcription")
    gr.Markdown("#### Choose the language of the audio and the model, then set an audio file to get its transcription.")
    gr.Markdown("#### **(Voxtral handles audios up to 30 minutes for transcription)**")
    with gr.Row():
        with gr.Column():
            sel_language = gr.Dropdown(
                choices=list(dict_languages.keys()),
                value="English",
                label="Select the language of the audio file:"
            )
        with gr.Column():
            sel_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload an audio file or record via microphone:")
            submit_transcript = gr.Button("Extract Transcription", variant="primary")
        with gr.Column():
            text_transcript = gr.Textbox(label="Generated Response", lines=10)
    # BUG FIX: the handler previously passed (model, processor, v1, v2),
    # which mismatches process_transcript(language, audio_path, model,
    # processor) and crashed on every click; pass the UI values first,
    # in the declared parameter order.
    submit_transcript.click(
        fn=lambda language, audio: process_transcript(language, audio, model, processor),
        inputs=[sel_language, sel_audio],
        outputs=text_transcript
    )
# Launch the app when run as a script (share=True publishes a temporary
# public Gradio link in addition to the local server).
if __name__ == "__main__":
    transcript.launch(share=True)