import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa
import numpy as np
import os
import tempfile
from datetime import datetime
# Module-level cache for the processor/model pair; populated lazily by
# load_model() so the heavyweight checkpoint download/instantiation happens
# at most once per process and is shared across Gradio requests.
processor = None
model = None
def load_model():
    """Lazily initialise the global Voxtral processor/model pair.

    Returns the cached ``(processor, model)`` tuple when both are already
    loaded; otherwise downloads and instantiates them, caching the result
    in the module globals. On any failure, logs the error and returns
    ``(None, None)`` so callers can surface a friendly message.
    """
    global processor, model
    already_loaded = processor is not None and model is not None
    if already_loaded:
        return processor, model
    checkpoint = "mistralai/Voxtral-Small-24B-2507"
    try:
        print("Loading Voxtral model... This may take a few minutes.")
        processor = AutoProcessor.from_pretrained(checkpoint)
        # fp16 + device_map="auto" spreads the 24B checkpoint across the
        # available accelerators; low_cpu_mem_usage avoids a full-size
        # host-RAM copy during loading.
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            checkpoint,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None, None
    return processor, model
def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file with the Voxtral model.

    Parameters
    ----------
    audio_file : str | file-like | None
        Path to the audio file (Gradio ``type="filepath"`` delivers a str)
        or an uploaded file object exposing ``.name``.

    Returns
    -------
    tuple[str, str, str]
        ``(transcription, info_line, transcription)`` — the transcript is
        returned twice: once for the visible textbox and once for the
        hidden textbox that feeds the download handler. On error the first
        element carries the message and the other two are empty strings.
    """
    if audio_file is None:
        return "Please upload an audio file.", "", ""
    try:
        global processor, model
        # Lazy-load on first request in case startup pre-loading failed.
        if processor is None or model is None:
            processor, model = load_model()
            if processor is None or model is None:
                return "Error: Model failed to load. Please try again.", "", ""
        # Both branches did the same librosa.load; collapse to one call.
        # sr=16000 resamples to the mono 16 kHz rate the processor expects.
        path = audio_file if isinstance(audio_file, str) else audio_file.name
        audio, sample_rate = librosa.load(path, sr=16000)
        duration = len(audio) / sample_rate
        inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
        # Move tensors to the model's own device. A bare
        # torch.cuda.is_available() check (as before) mis-places inputs when
        # the model was loaded with device_map="auto" onto a device other
        # than cuda:0, or when CUDA exists but the model stayed on CPU.
        device = getattr(model, "device", None)
        if device is not None:
            inputs = {
                k: v.to(device) if isinstance(v, torch.Tensor) else v
                for k, v in inputs.items()
            }
        with torch.no_grad():
            # max_new_tokens bounds only the generated text; max_length is
            # deprecated in transformers generate() and also counts prompt
            # tokens, silently truncating long transcripts.
            predicted_ids = model.generate(**inputs, max_new_tokens=512)
        transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]
        word_count = len(transcription.split())
        file_info = (
            f"Duration: {duration:.2f} seconds | Words: {word_count} | "
            f"Processed: {datetime.now().strftime('%H:%M:%S')}"
        )
        return transcription, file_info, transcription
    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        return error_msg, "", ""
def clear_inputs():
    """Return blank values that reset the audio input and the three text
    outputs (transcription, info line, hidden download relay)."""
    blank = ""
    return None, blank, blank, blank
# Custom CSS for better styling
css = """
.gradio-container {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.main-header {
text-align: center;
color: #2d5aa0;
margin-bottom: 20px;
}
.info-box {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin: 10px 0;
}
.result-box {
background-color: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 8px;
padding: 15px;
margin: 10px 0;
}
"""
# Build the Gradio interface (pure UI construction; no model work here).
def create_interface():
    """Build and return the Gradio Blocks UI for the transcription app.

    Layout: a header, a collapsible model-info accordion, a two-column row
    (audio input and control buttons on the left; transcription, audio info
    and a download slot on the right), and a usage-instructions footer.
    Buttons are wired to transcribe_audio / clear_inputs; a hidden textbox
    relays each new transcript to the download-file updater.
    """
    with gr.Blocks(css=css, title="Voxtral-Small-24B Speech Recognition") as demo:
        # Header banner (styled by the .main-header CSS class).
        gr.Markdown(
            """
            # π€ Voxtral-Small-24B Speech Recognition
            Upload an audio file to transcribe it using Mistral AI's Voxtral-Small-24B-2507 model.
            """,
            elem_classes=["main-header"]
        )
        # Collapsed-by-default background information about the model.
        with gr.Accordion("βΉοΈ About this model", open=False):
            gr.Markdown(
                """
                **Voxtral-Small-24B-2507** is a speech-to-text model developed by Mistral AI.
                - **Model**: mistralai/Voxtral-Small-24B-2507
                - **Type**: Speech-to-Text Transformation
                - **Developer**: Mistral AI
                - **Use Case**: Audio transcription and speech recognition
                - **Supported Formats**: WAV, MP3, FLAC, M4A, OGG
                π‘ **Tip**: For best results, use clear audio files with minimal background noise.
                """
            )
        with gr.Row():
            with gr.Column(scale=1):
                # Left column: audio source (file upload or microphone).
                audio_input = gr.Audio(
                    label="π Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                # Control buttons under the audio widget.
                with gr.Row():
                    transcribe_btn = gr.Button(
                        "π Transcribe Audio",
                        variant="primary",
                        size="lg"
                    )
                    clear_btn = gr.Button(
                        "ποΈ Clear",
                        variant="secondary"
                    )
            with gr.Column(scale=1):
                # Right column: transcription result (copyable).
                transcription_output = gr.Textbox(
                    label="π Transcription Result",
                    lines=8,
                    max_lines=15,
                    placeholder="Transcribed text will appear here...",
                    show_copy_button=True
                )
                # Duration / word-count / timestamp summary line.
                info_output = gr.Textbox(
                    label="π Audio Information",
                    lines=1,
                    placeholder="Audio details will appear here..."
                )
                # Download slot; revealed only once a transcript exists.
                download_file = gr.File(
                    label="πΎ Download Transcription",
                    visible=False
                )
        # Hidden relay: transcribe_audio writes the transcript here, and
        # its change event (below) builds the downloadable .txt file.
        hidden_text = gr.Textbox(visible=False)
        # Main action: run the model on the uploaded audio.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[transcription_output, info_output, hidden_text],
            show_progress=True
        )
        # Refresh the download file whenever a new transcript lands.
        def update_download(text_content):
            """Write the transcript to a temp .txt file and reveal the
            download component; hide it when the transcript is blank.

            NOTE(review): the NamedTemporaryFile is created with
            delete=False and never removed, so files accumulate across
            requests — confirm cleanup is handled elsewhere or acceptable.
            """
            if text_content and text_content.strip():
                # Create a temporary file with the transcription
                temp_file = tempfile.NamedTemporaryFile(
                    mode='w',
                    delete=False,
                    suffix='.txt',
                    prefix='transcription_'
                )
                temp_file.write(text_content)
                temp_file.close()
                return gr.File(value=temp_file.name, visible=True)
            else:
                return gr.File(visible=False)
        hidden_text.change(
            fn=update_download,
            inputs=[hidden_text],
            outputs=[download_file]
        )
        # Reset everything, including the hidden relay (so the download
        # handler also hides the file component).
        clear_btn.click(
            fn=clear_inputs,
            outputs=[audio_input, transcription_output, info_output, hidden_text]
        )
        # Footer with usage instructions.
        gr.Markdown(
            """
            ---
            ### π οΈ Usage Instructions:
            1. **Upload**: Click on the audio input area to upload a file or use your microphone
            2. **Transcribe**: Click the "Transcribe Audio" button to process your audio
            3. **Results**: View your transcription in the text area on the right
            4. **Download**: Use the download button to save your transcription as a text file
            **Supported formats**: WAV, MP3, FLAC, M4A, OGG
            """
        )
    return demo
# Script entry point: initialize the model, then serve the UI.
if __name__ == "__main__":
    # Pre-load at startup so the first request is not penalised by the
    # (potentially multi-minute) checkpoint download/initialisation.
    print("Initializing Voxtral model...")
    load_model()
    # Create and launch the interface.
    demo = create_interface()
    # NOTE(review): share=True opens a public Gradio tunnel and
    # server_name="0.0.0.0" binds all interfaces — confirm this exposure
    # is intended for the deployment environment.
    demo.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )