Spaces:

ravinder024
/

testspace-for-voice-to-text

Running

File size: 8,573 Bytes

db214be

import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa
import numpy as np
import os
import tempfile
from datetime import datetime

# Global variables for model and processor
processor = None
model = None

def load_model():
    """Load the Voxtral processor and model once and cache them in module globals.

    Returns:
        A ``(processor, model)`` tuple. When both are already cached they are
        returned without reloading. When loading fails for any reason the
        error is printed and ``(None, None)`` is returned; in that case the
        module globals are not modified, so the cache never ends up holding a
        processor without its model.
    """
    global processor, model

    # Fast path: a previous call already loaded both halves.
    if processor is not None and model is not None:
        return processor, model

    model_name = "mistralai/Voxtral-Small-24B-2507"

    try:
        print("Loading Voxtral model... This may take a few minutes.")
        # Load into locals first; the globals are only published once BOTH
        # objects exist, so a failure halfway through leaves no partial state.
        loaded_processor = AutoProcessor.from_pretrained(model_name)
        # NOTE(review): verify that AutoModelForSpeechSeq2Seq can resolve this
        # checkpoint -- the Voxtral documentation may call for a dedicated
        # model class. TODO confirm against the model card.
        loaded_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
    except Exception as e:
        # Top-level boundary for the UI: report the failure and signal it to
        # callers with (None, None) instead of crashing the app.
        print(f"Error loading model: {str(e)}")
        return None, None

    processor, model = loaded_processor, loaded_model
    print("Model loaded successfully!")
    return processor, model

def transcribe_audio(audio_file):
    """Transcribe an audio file and summarise the result.

    Args:
        audio_file: Path to the audio file as a string, or an object whose
            ``name`` attribute holds that path. ``None`` means nothing was
            provided.

    Returns:
        A 3-tuple ``(transcription, file_info, download_text)``. On success
        the first and third items are the same transcription text and
        ``file_info`` reports duration, word count and the processing time of
        day. On any failure the first item is a human-readable message and
        the other two items are empty strings.
    """
    global processor, model

    if audio_file is None:
        return "Please upload an audio file.", "", ""

    try:
        # Lazily load the model if the start-up pre-load did not happen or failed.
        if processor is None or model is None:
            processor, model = load_model()

        if processor is None or model is None:
            return "Error: Model failed to load. Please try again.", "", ""

        # Accept either a plain path or a file-like wrapper exposing `.name`.
        audio_path = audio_file if isinstance(audio_file, str) else audio_file.name
        # librosa resamples to the requested rate while loading.
        audio, sample_rate = librosa.load(audio_path, sr=16000)

        duration = len(audio) / sample_rate

        inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

        # Place tensors where the model actually lives rather than assuming
        # "CUDA if available": with device_map="auto" the model may sit on a
        # different device. Floating-point features are also cast to the
        # model's dtype (the model is loaded in float16) to avoid a
        # float32/float16 mismatch inside generate().
        target_device = model.device
        target_dtype = model.dtype
        model_inputs = {}
        for key, value in inputs.items():
            if isinstance(value, torch.Tensor):
                if value.is_floating_point():
                    value = value.to(device=target_device, dtype=target_dtype)
                else:
                    value = value.to(target_device)
            model_inputs[key] = value

        with torch.no_grad():
            predicted_ids = model.generate(**model_inputs, max_length=512)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        word_count = len(transcription.split())
        file_info = f"Duration: {duration:.2f} seconds | Words: {word_count} | Processed: {datetime.now().strftime('%H:%M:%S')}"

        # The transcription is returned twice: once for display and once for
        # the hidden textbox that drives the download file.
        return transcription, file_info, transcription

    except Exception as e:
        # UI boundary: surface the problem as text instead of raising.
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        return error_msg, "", ""

def clear_inputs():
    """Return reset values: ``None`` for the audio input, then three empty strings."""
    cleared_audio = None
    cleared_texts = ("",) * 3
    return (cleared_audio, *cleared_texts)

# Custom stylesheet passed to gr.Blocks(css=...) in create_interface().
# It sets the container font and defines three helper classes:
# .main-header, .info-box and .result-box. Within this file only
# "main-header" is attached to a component (via elem_classes); the other two
# classes are defined but not referenced here.
css = """
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.main-header {
    text-align: center;
    color: #2d5aa0;
    margin-bottom: 20px;
}

.info-box {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin: 10px 0;
}

.result-box {
    background-color: #f8f9fa;
    border: 1px solid #e9ecef;
    border-radius: 8px;
    padding: 15px;
    margin: 10px 0;
}
"""

# Create the Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the transcription app.

    The layout has an audio input with "Transcribe" and "Clear" buttons in
    the left column, and the transcription text, an audio-info line and a
    (initially hidden) download file in the right column. A hidden textbox
    carries the transcription text to the download handler.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(css=css, title="Voxtral-Small-24B Speech Recognition") as demo:
        
        # Header
        gr.Markdown(
            """
            # 🎤 Voxtral-Small-24B Speech Recognition
            
            Upload an audio file to transcribe it using Mistral AI's Voxtral-Small-24B-2507 model.
            """,
            elem_classes=["main-header"]
        )
        
        # Model info
        with gr.Accordion("ℹ️ About this model", open=False):
            gr.Markdown(
                """
                **Voxtral-Small-24B-2507** is a speech-to-text model developed by Mistral AI.
                
                - **Model**: mistralai/Voxtral-Small-24B-2507
                - **Type**: Speech-to-Text Transformation
                - **Developer**: Mistral AI
                - **Use Case**: Audio transcription and speech recognition
                - **Supported Formats**: WAV, MP3, FLAC, M4A, OGG
                
                💡 **Tip**: For best results, use clear audio files with minimal background noise.
                """
            )
        
        with gr.Row():
            with gr.Column(scale=1):
                # Audio input; type="filepath" means the handler receives a path string.
                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                
                # Control buttons
                with gr.Row():
                    transcribe_btn = gr.Button(
                        "🚀 Transcribe Audio", 
                        variant="primary",
                        size="lg"
                    )
                    clear_btn = gr.Button(
                        "🗑️ Clear", 
                        variant="secondary"
                    )
            
            with gr.Column(scale=1):
                # Results
                transcription_output = gr.Textbox(
                    label="📝 Transcription Result",
                    lines=8,
                    max_lines=15,
                    placeholder="Transcribed text will appear here...",
                    show_copy_button=True
                )
                
                # File info
                info_output = gr.Textbox(
                    label="📊 Audio Information",
                    lines=1,
                    placeholder="Audio details will appear here..."
                )
                
                # Download option; revealed by update_download once text exists.
                download_file = gr.File(
                    label="💾 Download Transcription",
                    visible=False
                )
        
        # Hidden textbox for file content (for download)
        hidden_text = gr.Textbox(visible=False)
        
        # Event handlers
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[transcription_output, info_output, hidden_text],
            show_progress=True
        )
        
        def update_download(text_content):
            """Write non-blank text to a temp .txt file and show it for download.

            Blank or missing text hides the download component instead.
            """
            if not (text_content and text_content.strip()):
                return gr.File(visible=False)

            # delete=False: the path is handed to gr.File after this function
            # returns, so the file must outlive the `with` block.
            # encoding='utf-8': transcriptions may contain non-ASCII text, and
            # the platform default encoding is not guaranteed to handle it.
            with tempfile.NamedTemporaryFile(
                mode='w',
                delete=False,
                suffix='.txt',
                prefix='transcription_',
                encoding='utf-8',
            ) as temp_file:
                temp_file.write(text_content)
            return gr.File(value=temp_file.name, visible=True)
        
        # Refresh the download file whenever the hidden text changes.
        hidden_text.change(
            fn=update_download,
            inputs=[hidden_text],
            outputs=[download_file]
        )
        
        clear_btn.click(
            fn=clear_inputs,
            outputs=[audio_input, transcription_output, info_output, hidden_text]
        )
        
        # Footer
        gr.Markdown(
            """
            ---
            
            ### 🛠️ Usage Instructions:
            1. **Upload**: Click on the audio input area to upload a file or use your microphone
            2. **Transcribe**: Click the "Transcribe Audio" button to process your audio
            3. **Results**: View your transcription in the text area on the right
            4. **Download**: Use the download button to save your transcription as a text file
            
            **Supported formats**: WAV, MP3, FLAC, M4A, OGG
            """
        )
    
    return demo

# Initialize and launch the app
if __name__ == "__main__":
    # Load the model before serving; load_model() caches it in module globals,
    # which transcribe_audio() checks before attempting its own load.
    print("Initializing Voxtral model...")
    load_model()

    # Launch settings gathered in one place, then applied to the built UI.
    launch_options = {
        "share": True,
        "show_error": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo = create_interface()
    demo.launch(**launch_options)