File size: 6,423 Bytes
913772f
 
494e59f
913772f
ec0d1b9
a21a911
 
 
 
ec0d1b9
 
 
a21a911
913772f
ec0d1b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a21a911
494e59f
ec0d1b9
a21a911
ec0d1b9
 
a21a911
ec0d1b9
 
 
 
 
 
 
913772f
ec0d1b9
494e59f
 
 
 
ec0d1b9
 
 
 
494e59f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
913772f
ec0d1b9
 
 
913772f
 
 
 
ec0d1b9
494e59f
ec0d1b9
 
 
 
 
 
 
 
 
 
 
 
 
913772f
a21a911
ec0d1b9
 
 
 
 
 
a21a911
ec0d1b9
 
 
 
 
913772f
ec0d1b9
 
 
 
 
 
 
 
913772f
 
ec0d1b9
913772f
 
 
ec0d1b9
 
 
 
 
 
 
 
 
 
 
913772f
494e59f
ec0d1b9
 
913772f
 
 
 
ec0d1b9
 
 
 
913772f
ec0d1b9
913772f
 
a21a911
ec0d1b9
a21a911
 
 
ec0d1b9
 
a21a911
 
 
81878e1
 
 
 
 
 
ec0d1b9
 
a21a911
 
913772f
ec0d1b9
913772f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
from transformers import pipeline
import gradio as gr
import torch

# Friendly dropdown label -> Hugging Face model ID for each Whisper
# checkpoint offered in the UI, ordered smallest/fastest to largest/slowest.
MODEL_OPTIONS = {
    "Whisper Tiny (Fastest)": "openai/whisper-tiny",
    "Whisper Base (Balanced)": "openai/whisper-base",
    "Whisper Small (Better Accuracy)": "openai/whisper-small",
    "Whisper Medium (High Accuracy)": "openai/whisper-medium",
    "Whisper Large (Highest Accuracy)": "openai/whisper-large",  # ~1.5B params
    "Whisper Large-v2 (Latest)": "openai/whisper-large-v2"       # ~1.5B params
}

# Friendly language name -> ISO 639-1 code forwarded to Whisper generation;
# None means the model auto-detects the spoken language.
LANGUAGE_CODES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Dutch": "nl"
}

def transcribe_audio(audio_file, model_choice, task_choice, language_choice, timestamp_choice, beam_size):
    # Initialize the pipeline with selected model
    model_name = MODEL_OPTIONS[model_choice]
    task = "translate" if task_choice == "Translate to English" else "transcribe"
    language = LANGUAGE_CODES[language_choice]
    
    # Create pipeline
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=0 if torch.cuda.is_available() else -1
    )
    
    # Generate kwargs for the pipeline
    generate_kwargs = {
        "task": task,
        "num_beams": beam_size
    }
    if language and task == "transcribe":
        generate_kwargs["language"] = language
    
    # Process audio file
    if timestamp_choice:
        result = pipe(
            audio_file,
            generate_kwargs=generate_kwargs,
            return_timestamps=True
        )
        timestamp_text = "\n".join([
            f"[{chunk['timestamp'][0]:.2f}s -> {chunk['timestamp'][1]:.2f}s] {chunk['text']}"
            for chunk in result.get("chunks", [])
        ])
        return result["text"], timestamp_text, gr.update(visible=True)
    else:
        result = pipe(
            audio_file,
            generate_kwargs=generate_kwargs,
            return_timestamps=False
        )
        return result["text"], "", gr.update(visible=False)

# Declarative Gradio UI: inputs (audio, model, task, language, timestamps,
# beam size) on the left, transcription outputs on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 🎵 Audio Transcription & Translation")
    gr.Markdown("Upload an audio file or use your microphone to transcribe or translate speech.")
    
    with gr.Row():
        with gr.Column():
            # type="filepath" hands transcribe_audio a path string, which the
            # ASR pipeline consumes directly.
            audio_input = gr.Audio(
                label="Audio Input",
                type="filepath"
            )
            
            # Model selection; keys mirror MODEL_OPTIONS.
            model_choice = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value="Whisper Tiny (Fastest)",
                label="Model Selection"
            )
            
            task_choice = gr.Radio(
                choices=["Transcribe", "Translate to English"],
                value="Transcribe",
                label="Task"
            )
            
            # Language hint; keys mirror LANGUAGE_CODES ("Auto-detect" -> None).
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="Auto-detect",
                label="Language (for transcription)"
            )
            
            # Optional per-chunk timestamps in a second output box.
            timestamp_choice = gr.Checkbox(
                label="Include Timestamps",
                value=False
            )
            
            beam_size = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                label="Beam Size (Higher = Better Accuracy but Slower)"
            )
        
        with gr.Column():
            text_output = gr.Textbox(
                lines=15,
                label="Transcription",
                interactive=False
            )
            
            # Hidden until a run requests timestamps; the handler returns a
            # gr.update(visible=...) to toggle it.
            timestamp_output = gr.Textbox(
                lines=8,
                label="Timestamps (if enabled)",
                interactive=False,
                visible=False
            )
    
    transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
    
    # NOTE(review): timestamp_output appears TWICE in `outputs` — once for the
    # text value and once for the visibility gr.update. Recent Gradio versions
    # reject duplicate output components; fixing this requires the handler to
    # return a single gr.update(value=..., visible=...) and a 2-item outputs
    # list. TODO: confirm against the pinned Gradio version.
    transcribe_btn.click(
        transcribe_audio,
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
        outputs=[text_output, timestamp_output, timestamp_output]
    )
    
    # NOTE(review): these example .wav files are not visible in this repo
    # chunk — verify they ship alongside the app or the examples will 404.
    gr.Examples(
        examples=[
            ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect", False, 1],
            ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English", False, 1],
            ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect", False, 1],
            ["example_audio_4.wav", "Whisper Large (Highest Accuracy)", "Transcribe", "Spanish", True, 3]
        ],
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
    )
    
    gr.Markdown("### Features")
    gr.Markdown("- **Model Selection**: Choose from 6 different Whisper models with speed/accuracy tradeoffs")
    gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English")
    gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy")
    gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone")
    gr.Markdown("- **Timestamps**: Option to include word-level timestamps")
    gr.Markdown("- **Beam Search**: Adjustable beam size for better accuracy")
    
    gr.Markdown("### Model Information")
    gr.Markdown("""
    | Model | Parameters | Speed | Best For |
    |-------|------------|-------|----------|
    | Whisper Tiny | 39M | Fastest | Quick transcriptions, low resources |
    | Whisper Base | 74M | Fast | Balanced performance |
    | Whisper Small | 244M | Medium | Better accuracy |
    | Whisper Medium | 769M | Slow | High accuracy transcriptions |
    | Whisper Large | 1.5B | Slower | Very high accuracy |
    | Whisper Large-v2 | 1.5B | Slower | Latest improvements |
    """)
    
    gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC")
    gr.Markdown("- **Note**: First transcription may take 10-60 seconds (model loading)")

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()