File size: 3,631 Bytes
f0d5b79
 
 
 
 
 
 
 
 
aa14541
 
f0d5b79
aa14541
 
f0d5b79
ba6a1e9
 
 
 
 
 
 
 
 
 
 
 
 
 
f0d5b79
ba6a1e9
f0d5b79
 
 
 
 
 
 
aa14541
f0d5b79
 
 
aa14541
f0d5b79
 
 
ba6a1e9
346a58e
f0d5b79
 
 
 
 
 
 
ba6a1e9
 
f0d5b79
 
e76aabe
 
1e39ba8
f0d5b79
 
 
 
 
 
 
 
 
 
 
 
ba6a1e9
de4881c
f0d5b79
 
 
 
ba6a1e9
f0d5b79
 
 
 
 
 
 
 
 
 
 
 
1e39ba8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration

# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load both checkpoints once at import time: each gets its processor and
# its bfloat16 weights placed on the selected device.
voxtral_mini_processor = AutoProcessor.from_pretrained(
    "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
)
voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained(
    "MohamedRashad/Voxtral-Mini-3B-2507-transformers",
    torch_dtype=torch.bfloat16,
    device_map=device,
)

voxtral_small_processor = AutoProcessor.from_pretrained(
    "MohamedRashad/Voxtral-Small-24B-2507-transformers"
)
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained(
    "MohamedRashad/Voxtral-Small-24B-2507-transformers",
    torch_dtype=torch.bfloat16,
    device_map=device,
)

# Display name (what the user picks) -> language code handed to the processor.
# Insertion order is preserved, so the pairs are listed in display order.
LANGUAGES = dict(
    (
        ("English", "en"),
        ("French", "fr"),
        ("German", "de"),
        ("Spanish", "es"),
        ("Italian", "it"),
        ("Portuguese", "pt"),
        ("Dutch", "nl"),
        ("Russian", "ru"),
        ("Chinese", "zh"),
        ("Japanese", "ja"),
        ("Arabic", "ar"),
    )
)

@spaces.GPU()
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
    """Transcribe an audio file with the selected Voxtral model.

    Args:
        audio_path: Path to the audio file. A falsy value (nothing uploaded)
            short-circuits with a user-facing message.
        model_name: Either "Voxtral Mini (3B)" or "Voxtral Small (24B)".
        lang_name: A key of ``LANGUAGES`` (e.g. "English").
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded transcription string, or a short message describing
        which input was missing or invalid.
    """
    if not audio_path:
        return "Please upload an audio file."

    if model_name == "Voxtral Mini (3B)":
        model = voxtral_mini_model
        processor = voxtral_mini_processor
        repo_id = "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
    elif model_name == "Voxtral Small (24B)":
        model = voxtral_small_model
        processor = voxtral_small_processor
        repo_id = "MohamedRashad/Voxtral-Small-24B-2507-transformers"
    else:
        return "Invalid model selected."

    # Report an unknown/cleared language the same way as the other input
    # errors instead of letting a KeyError escape to the UI.
    language = LANGUAGES.get(lang_name)
    if language is None:
        return "Invalid language selected."

    # Prefer the correctly spelled processor method; fall back to the
    # misspelled name this code originally called, for installs where only
    # that one exists.
    build_request = getattr(processor, "apply_transcription_request", None)
    if build_request is None:
        build_request = processor.apply_transcrition_request

    inputs = build_request(language=language, audio=audio_path, model_id=repo_id)
    inputs = inputs.to(device, dtype=torch.bfloat16)

    # The UI slider may deliver a float; generate() expects an integer count.
    outputs = model.generate(**inputs, max_new_tokens=int(max_tokens))

    # Drop the prompt tokens so only the newly generated text is decoded.
    prompt_length = inputs.input_ids.shape[1]
    decoded_outputs = processor.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )

    return decoded_outputs[0]



# Build the Gradio UI: input controls in the first column, the
# transcription text box in the second.
with gr.Blocks(title="Voxtral Demo") as demo:
    gr.Markdown(value="# Voxtral Transcription Demo")
    gr.Markdown(value="Upload an audio file and get a transcription from Voxtral.")
    gr.Markdown(
        value=(
            "You can find the `transformers` version of Voxtral here: "
            "[3B](https://huggingface.co/MohamedRashad/Voxtral-Mini-3B-2507-transformers), "
            "[24B](https://huggingface.co/MohamedRashad/Voxtral-Small-24B-2507-transformers)"
        )
    )

    with gr.Row():
        # First column: everything the user provides.
        with gr.Column():
            audio_input = gr.Audio(label="Upload Audio", type="filepath")

            model_selector = gr.Dropdown(
                label="Select Model",
                choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
                value="Voxtral Mini (3B)",
            )

            language = gr.Dropdown(
                label="Language",
                choices=list(LANGUAGES),
                value="English",
            )

            max_tokens = gr.Slider(
                label="Max Output Tokens",
                minimum=50,
                maximum=1000,
                step=50,
                value=500,
            )
            submit_btn = gr.Button(value="Extract Transcription", variant="primary")

        # Second column: the model's output.
        with gr.Column():
            output_text = gr.Textbox(lines=10, label="Generated Response")

    # Clicking the button feeds the four controls to process_audio, in the
    # order of its parameters, and shows the result in the text box.
    submit_btn.click(
        process_audio,
        inputs=[audio_input, model_selector, language, max_tokens],
        outputs=output_text,
    )

# Start the server only when this file is run as a script (not on import).
if __name__ == "__main__":
    queued_demo = demo.queue()
    queued_demo.launch(share=False)