File size: 8,573 Bytes
db214be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa
import numpy as np
import os
import tempfile
from datetime import datetime

# Module-level cache for the processor and the model.
# Both start as None (nothing loaded yet); load_model() assigns them and
# transcribe_audio() reads them through a `global` declaration.
processor = None
model = None

def load_model():
    """Load the Voxtral processor and model once and cache them module-wide.

    Returns:
        A ``(processor, model)`` tuple. When both are already cached the
        cached pair is returned immediately. If loading fails for any reason
        the error is printed and ``(None, None)`` is returned instead of
        raising.
    """
    global processor, model

    # Loading is expensive, so reuse the cached pair whenever it is complete.
    if processor is not None and model is not None:
        return processor, model

    model_name = "mistralai/Voxtral-Small-24B-2507"

    try:
        print("Loading Voxtral model... This may take a few minutes.")
        # Load into locals first: if the model load fails after the processor
        # succeeded, the module globals must not be left half-populated.
        loaded_processor = AutoProcessor.from_pretrained(model_name)
        # NOTE(review): confirm that AutoModelForSpeechSeq2Seq can resolve this
        # checkpoint's config; Voxtral may need its dedicated model class.
        loaded_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
    except Exception as e:
        # Best-effort boundary: callers check for (None, None) rather than
        # handling an exception.
        print(f"Error loading model: {e}")
        return None, None

    # Publish both objects together, only after both loaded successfully.
    processor, model = loaded_processor, loaded_model
    print("Model loaded successfully!")
    return processor, model

def transcribe_audio(audio_file):
    """Transcribe an audio file with the cached model.

    Args:
        audio_file: Path to an audio file, or an object whose ``name``
            attribute holds that path. ``None`` means nothing was provided.

    Returns:
        A 3-tuple ``(transcription, file_info, download_text)``. On success
        ``download_text`` repeats the transcription so it can feed the
        download step. On any failure the first element is a human-readable
        message and the other two are empty strings; no exception propagates.
    """
    global processor, model

    if audio_file is None:
        return "Please upload an audio file.", "", ""

    target_rate = 16000  # sampling rate requested from librosa and given to the processor

    try:
        # Load model if not already loaded
        if processor is None or model is None:
            processor, model = load_model()

        if processor is None or model is None:
            return "Error: Model failed to load. Please try again.", "", ""

        # Accept either a plain path or an upload object exposing ``.name``.
        audio_path = audio_file if isinstance(audio_file, str) else audio_file.name
        audio, sample_rate = librosa.load(audio_path, sr=target_rate)

        # An empty recording has nothing to transcribe; report it clearly
        # instead of pushing a zero-length array through the model.
        if len(audio) == 0:
            return "Error processing audio: the file contains no audio samples.", "", ""

        duration = len(audio) / sample_rate

        inputs = processor(audio, sampling_rate=target_rate, return_tensors="pt")

        # Put every tensor on the model's own device (the model is loaded with
        # device_map="auto") and cast floating-point features to the model's
        # dtype; the model is loaded in float16 while the processor output is
        # not, and mixing the two raises a dtype mismatch.
        target_device = model.device
        target_dtype = model.dtype
        prepared = {}
        for key, value in inputs.items():
            if isinstance(value, torch.Tensor):
                value = value.to(target_device)
                if value.is_floating_point():
                    value = value.to(target_dtype)
            prepared[key] = value

        with torch.no_grad():
            predicted_ids = model.generate(**prepared, max_length=512)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Generate file info
        word_count = len(transcription.split())
        file_info = f"Duration: {duration:.2f} seconds | Words: {word_count} | Processed: {datetime.now().strftime('%H:%M:%S')}"

        return transcription, file_info, transcription  # Return transcription twice for download

    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing the handler.
        error_msg = f"Error processing audio: {e}"
        print(error_msg)
        return error_msg, "", ""

def clear_inputs():
    """Return reset values: ``None`` for the audio input, then three empty strings."""
    empty_text = ""
    return (None, empty_text, empty_text, empty_text)

# Custom CSS for better styling.
# Passed to gr.Blocks(css=css) in create_interface(). Of the classes below,
# only .main-header is attached to a component in the visible code (via
# elem_classes on the header Markdown); .info-box and .result-box are
# defined here but not referenced elsewhere in this file.
css = """
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.main-header {
    text-align: center;
    color: #2d5aa0;
    margin-bottom: 20px;
}

.info-box {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin: 10px 0;
}

.result-box {
    background-color: #f8f9fa;
    border: 1px solid #e9ecef;
    border-radius: 8px;
    padding: 15px;
    margin: 10px 0;
}
"""

# Create the Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for uploading audio and viewing transcriptions.

    Wires the transcribe button to ``transcribe_audio``, mirrors the resulting
    text into a hidden textbox whose change event writes a downloadable
    ``.txt`` file, and wires the clear button to ``clear_inputs``.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(css=css, title="Voxtral-Small-24B Speech Recognition") as demo:
        
        # Header
        gr.Markdown(
            """
            # 🎤 Voxtral-Small-24B Speech Recognition
            
            Upload an audio file to transcribe it using Mistral AI's Voxtral-Small-24B-2507 model.
            """,
            elem_classes=["main-header"]
        )
        
        # Model info
        with gr.Accordion("ℹ️ About this model", open=False):
            gr.Markdown(
                """
                **Voxtral-Small-24B-2507** is a speech-to-text model developed by Mistral AI.
                
                - **Model**: mistralai/Voxtral-Small-24B-2507
                - **Type**: Speech-to-Text Transformation
                - **Developer**: Mistral AI
                - **Use Case**: Audio transcription and speech recognition
                - **Supported Formats**: WAV, MP3, FLAC, M4A, OGG
                
                💡 **Tip**: For best results, use clear audio files with minimal background noise.
                """
            )
        
        with gr.Row():
            with gr.Column(scale=1):
                # Audio input
                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                
                # Control buttons
                with gr.Row():
                    transcribe_btn = gr.Button(
                        "🚀 Transcribe Audio", 
                        variant="primary",
                        size="lg"
                    )
                    clear_btn = gr.Button(
                        "🗑️ Clear", 
                        variant="secondary"
                    )
            
            with gr.Column(scale=1):
                # Results
                transcription_output = gr.Textbox(
                    label="📝 Transcription Result",
                    lines=8,
                    max_lines=15,
                    placeholder="Transcribed text will appear here...",
                    show_copy_button=True
                )
                
                # File info
                info_output = gr.Textbox(
                    label="📊 Audio Information",
                    lines=1,
                    placeholder="Audio details will appear here..."
                )
                
                # Download option (hidden until a transcription exists)
                download_file = gr.File(
                    label="💾 Download Transcription",
                    visible=False
                )
        
        # Hidden textbox for file content (for download)
        hidden_text = gr.Textbox(visible=False)
        
        # Event handlers
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[transcription_output, info_output, hidden_text],
            show_progress=True
        )
        
        # Update download file when transcription is complete
        def update_download(text_content):
            """Write the transcription to a temp ``.txt`` file and reveal it for download."""
            if not text_content or not text_content.strip():
                # Nothing to offer: hide the component and drop any stale file.
                return gr.File(value=None, visible=False)

            # delete=False keeps the file on disk after it is closed, because
            # its path is handed to the File component below. The encoding is
            # explicit so non-ASCII transcriptions do not depend on the
            # platform's default text encoding.
            with tempfile.NamedTemporaryFile(
                mode='w',
                delete=False,
                suffix='.txt',
                prefix='transcription_',
                encoding='utf-8',
            ) as temp_file:
                temp_file.write(text_content)
            return gr.File(value=temp_file.name, visible=True)
        
        hidden_text.change(
            fn=update_download,
            inputs=[hidden_text],
            outputs=[download_file]
        )
        
        clear_btn.click(
            fn=clear_inputs,
            outputs=[audio_input, transcription_output, info_output, hidden_text]
        )
        
        # Footer
        gr.Markdown(
            """
            ---
            
            ### 🛠️ Usage Instructions:
            1. **Upload**: Click on the audio input area to upload a file or use your microphone
            2. **Transcribe**: Click the "Transcribe Audio" button to process your audio
            3. **Results**: View your transcription in the text area on the right
            4. **Download**: Use the download button to save your transcription as a text file
            
            **Supported formats**: WAV, MP3, FLAC, M4A, OGG
            """
        )
    
    return demo

# Script entry point: warm the model cache, then build and serve the UI
if __name__ == "__main__":
    # Load the model up front so the first request does not pay for it
    print("Initializing Voxtral model...")
    load_model()

    # Server settings, collected in one place
    launch_options = dict(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
    )

    # Build the interface and start serving
    demo = create_interface()
    demo.launch(**launch_options)