File size: 6,423 Bytes
913772f
 
494e59f
913772f
ec0d1b9
a21a911
 
 
 
ec0d1b9
 
 
a21a911
913772f
ec0d1b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a21a911
494e59f
ec0d1b9
a21a911
ec0d1b9
 
a21a911
ec0d1b9
 
 
 
 
 
 
913772f
ec0d1b9
494e59f
 
 
 
ec0d1b9
 
 
 
494e59f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
913772f
ec0d1b9
 
 
913772f
 
 
 
ec0d1b9
494e59f
ec0d1b9
 
 
 
 
 
 
 
 
 
 
 
 
913772f
a21a911
ec0d1b9
 
 
 
 
 
a21a911
ec0d1b9
 
 
 
 
913772f
ec0d1b9
 
 
 
 
 
 
 
913772f
 
ec0d1b9
913772f
 
 
ec0d1b9
 
 
 
 
 
 
 
 
 
 
913772f
494e59f
ec0d1b9
 
913772f
 
 
 
ec0d1b9
 
 
 
913772f
ec0d1b9
913772f
 
a21a911
ec0d1b9
a21a911
 
 
ec0d1b9
 
a21a911
 
 
81878e1
 
 
 
 
 
ec0d1b9
 
a21a911
 
913772f
ec0d1b9
913772f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
from transformers import pipeline
import gradio as gr
import torch

# Friendly dropdown label -> Hugging Face model ID for each Whisper
# checkpoint offered in the UI, ordered smallest/fastest to largest/slowest.
MODEL_OPTIONS = {
    "Whisper Tiny (Fastest)": "openai/whisper-tiny",
    "Whisper Base (Balanced)": "openai/whisper-base",
    "Whisper Small (Better Accuracy)": "openai/whisper-small",
    "Whisper Medium (High Accuracy)": "openai/whisper-medium",
    "Whisper Large (Highest Accuracy)": "openai/whisper-large",  # ~1.5B params
    "Whisper Large-v2 (Latest)": "openai/whisper-large-v2"       # ~1.5B params
}

# Friendly language name -> ISO 639-1 code forwarded to Whisper generation;
# None means the model auto-detects the spoken language.
LANGUAGE_CODES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Dutch": "nl"
}

def transcribe_audio(audio_file, model_choice, task_choice, language_choice, timestamp_choice, beam_size):
    # Initialize the pipeline with selected model
    model_name = MODEL_OPTIONS[model_choice]
    task = "translate" if task_choice == "Translate to English" else "transcribe"
    language = LANGUAGE_CODES[language_choice]
    
    # Create pipeline
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=0 if torch.cuda.is_available() else -1
    )
    
    # Generate kwargs for the pipeline
    generate_kwargs = {
        "task": task,
        "num_beams": beam_size
    }
    if language and task == "transcribe":
        generate_kwargs["language"] = language
    
    # Process audio file
    if timestamp_choice:
        result = pipe(
            audio_file,
            generate_kwargs=generate_kwargs,
            return_timestamps=True
        )
        timestamp_text = "\n".join([
            f"[{chunk['timestamp'][0]:.2f}s -> {chunk['timestamp'][1]:.2f}s] {chunk['text']}"
            for chunk in result.get("chunks", [])
        ])
        return result["text"], timestamp_text, gr.update(visible=True)
    else:
        result = pipe(
            audio_file,
            generate_kwargs=generate_kwargs,
            return_timestamps=False
        )
        return result["text"], "", gr.update(visible=False)

# Declarative Gradio UI: inputs (audio, model, task, language, timestamps,
# beam size) on the left, transcription outputs on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 🎵 Audio Transcription & Translation")
    gr.Markdown("Upload an audio file or use your microphone to transcribe or translate speech.")
    
    with gr.Row():
        with gr.Column():
            # type="filepath" hands transcribe_audio a path string, which the
            # ASR pipeline consumes directly.
            audio_input = gr.Audio(
                label="Audio Input",
                type="filepath"
            )
            
            # Model selection; keys mirror MODEL_OPTIONS.
            model_choice = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value="Whisper Tiny (Fastest)",
                label="Model Selection"
            )
            
            task_choice = gr.Radio(
                choices=["Transcribe", "Translate to English"],
                value="Transcribe",
                label="Task"
            )
            
            # Language hint; keys mirror LANGUAGE_CODES ("Auto-detect" -> None).
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="Auto-detect",
                label="Language (for transcription)"
            )
            
            # Optional per-chunk timestamps in a second output box.
            timestamp_choice = gr.Checkbox(
                label="Include Timestamps",
                value=False
            )
            
            beam_size = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                label="Beam Size (Higher = Better Accuracy but Slower)"
            )
        
        with gr.Column():
            text_output = gr.Textbox(
                lines=15,
                label="Transcription",
                interactive=False
            )
            
            # Hidden until a run requests timestamps; the handler returns a
            # gr.update(visible=...) to toggle it.
            timestamp_output = gr.Textbox(
                lines=8,
                label="Timestamps (if enabled)",
                interactive=False,
                visible=False
            )
    
    transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
    
    # NOTE(review): timestamp_output appears TWICE in `outputs` — once for the
    # text value and once for the visibility gr.update. Recent Gradio versions
    # reject duplicate output components; fixing this requires the handler to
    # return a single gr.update(value=..., visible=...) and a 2-item outputs
    # list. TODO: confirm against the pinned Gradio version.
    transcribe_btn.click(
        transcribe_audio,
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
        outputs=[text_output, timestamp_output, timestamp_output]
    )
    
    # NOTE(review): these example .wav files are not visible in this repo
    # chunk — verify they ship alongside the app or the examples will 404.
    gr.Examples(
        examples=[
            ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect", False, 1],
            ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English", False, 1],
            ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect", False, 1],
            ["example_audio_4.wav", "Whisper Large (Highest Accuracy)", "Transcribe", "Spanish", True, 3]
        ],
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
    )
    
    gr.Markdown("### Features")
    gr.Markdown("- **Model Selection**: Choose from 6 different Whisper models with speed/accuracy tradeoffs")
    gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English")
    gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy")
    gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone")
    gr.Markdown("- **Timestamps**: Option to include word-level timestamps")
    gr.Markdown("- **Beam Search**: Adjustable beam size for better accuracy")
    
    gr.Markdown("### Model Information")
    gr.Markdown("""
    | Model | Parameters | Speed | Best For |
    |-------|------------|-------|----------|
    | Whisper Tiny | 39M | Fastest | Quick transcriptions, low resources |
    | Whisper Base | 74M | Fast | Balanced performance |
    | Whisper Small | 244M | Medium | Better accuracy |
    | Whisper Medium | 769M | Slow | High accuracy transcriptions |
    | Whisper Large | 1.5B | Slower | Very high accuracy |
    | Whisper Large-v2 | 1.5B | Slower | Latest improvements |
    """)
    
    gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC")
    gr.Markdown("- **Note**: First transcription may take 10-60 seconds (model loading)")

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()