File size: 2,971 Bytes
469746c
 
 
2dca6c6
469746c
 
 
 
 
 
934d9ac
 
 
 
 
 
469746c
 
 
 
 
 
 
 
 
 
 
d72f988
469746c
e80735d
 
6ab2b5f
e80735d
6ab2b5f
e80735d
3b10277
e80735d
382aa9f
 
469746c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26b8f27
934d9ac
d72f988
382aa9f
934d9ac
 
469746c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import gradio as gr
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
import spaces

# Upper bound on newly generated tokens per transcription request.
MAX_TOKENS = 32000

# Prefer GPU when available; Voxtral inference on CPU is very slow.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"*** Device: {device}")

model_name = 'mistralai/Voxtral-Mini-3B-2507'

# Load processor and model once at import time (first run downloads the weights
# from the Hugging Face Hub; bfloat16 halves memory vs. float32).
processor = AutoProcessor.from_pretrained(model_name)
model = VoxtralForConditionalGeneration.from_pretrained(model_name,
                                                        torch_dtype=torch.bfloat16,
                                                        device_map=device)
# Supported languages: display name -> ISO 639-1 code expected by the processor.
dict_languages = {"English": "en",
                  "French": "fr",
                  "German": "de",
                  "Spanish": "es",
                  "Italian": "it",
                  "Portuguese": "pt",
                  "Dutch": "nl",
                  "Hindi": "hi"}

@spaces.GPU
def process_transcript(language, audio_path, model, processor):
    """Transcribe an audio file with the given Voxtral model.

    Parameters
    ----------
    language : str
        Display name of the audio language; must be a key of ``dict_languages``.
    audio_path : str or None
        Filesystem path to the audio clip (Gradio ``type="filepath"``), or
        ``None`` when the user submitted without providing audio.
    model : VoxtralForConditionalGeneration
        Loaded Voxtral model used for generation.
    processor : AutoProcessor
        Processor that builds the transcription prompt and decodes the output.

    Returns
    -------
    str
        The decoded transcription text.

    Raises
    ------
    gr.Error
        If no audio was provided, or *language* is not a supported language.
    """
    if audio_path is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    # Map the display name to its ISO code; surface a user-facing error instead
    # of a bare KeyError if the dropdown value is somehow out of range.
    try:
        id_language = dict_languages[language]
    except KeyError:
        raise gr.Error(f"Unsupported language: {language}")

    # NOTE(review): 'apply_transcrition_request' is the (misspelled) method name
    # shipped by transformers; newer releases expose the corrected
    # 'apply_transcription_request' — confirm against the installed version.
    inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
    inputs = inputs.to(device, dtype=torch.bfloat16)

    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    # Drop the prompt tokens, keeping only the newly generated ones.
    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

    return decoded_outputs[0]



# Define Gradio interface
with gr.Blocks(title="Transcription") as transcript:
    gr.Markdown("# Audio Transcription")
    gr.Markdown("#### Choose the language of the audio and the model, then set an audio file to get its transcription.")
    gr.Markdown("#### **(Voxtral handles audios up to 30 minutes for transcription)**")

    with gr.Row():
        with gr.Column():
            sel_language = gr.Dropdown(
                choices=list(dict_languages.keys()),
                value="English",
                label="Select the language of the audio file:"
            )

        with gr.Column():
            sel_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload an audio file or record via microphone:")

            submit_transcript = gr.Button("Extract Transcription", variant="primary")

        with gr.Column():
            text_transcript = gr.Textbox(label="Generated Response", lines=10)

    # BUG FIX: the original lambda called process_transcript(model, processor, v1, v2),
    # but the signature is (language, audio_path, model, processor) — the model was
    # passed where the language string belongs, crashing on every click.
    # Arguments are now forwarded in the correct positions.
    submit_transcript.click(
        fn=lambda lang, audio: process_transcript(lang, audio, model, processor),
        inputs=[sel_language, sel_audio],
        outputs=text_transcript
    )

# Launch the app
if __name__ == "__main__":
    transcript.launch(share=True)