# Hugging Face Space status header (scrape residue): "Spaces: Sleeping"
| from transformers import pipeline | |
| import gradio as gr | |
| import torch | |
# Whisper checkpoints offered in the UI, ordered fastest -> most accurate.
# Keys are the dropdown labels; values are Hugging Face Hub model ids.
MODEL_OPTIONS = {
    label: f"openai/whisper-{variant}"
    for label, variant in [
        ("Whisper Tiny (Fastest)", "tiny"),
        ("Whisper Base (Balanced)", "base"),
        ("Whisper Small (Better Accuracy)", "small"),
        ("Whisper Medium (High Accuracy)", "medium"),
        ("Whisper Large (Highest Accuracy)", "large"),
        ("Whisper Large-v2 (Latest)", "large-v2"),
    ]
}

# Display name -> Whisper/ISO-639-1 language code; None means auto-detect.
LANGUAGE_CODES = {
    name: code
    for name, code in [
        ("Auto-detect", None),
        ("English", "en"),
        ("Spanish", "es"),
        ("French", "fr"),
        ("German", "de"),
        ("Italian", "it"),
        ("Portuguese", "pt"),
        ("Russian", "ru"),
        ("Chinese", "zh"),
        ("Japanese", "ja"),
        ("Korean", "ko"),
        ("Arabic", "ar"),
        ("Hindi", "hi"),
        ("Dutch", "nl"),
    ]
}
def transcribe_audio(audio_file, model_choice, task_choice, language_choice, timestamp_choice, beam_size):
    """Transcribe or translate an audio file with the selected Whisper model.

    Args:
        audio_file: Path to the audio file (Gradio Audio with type="filepath").
        model_choice: Key into MODEL_OPTIONS selecting the checkpoint.
        task_choice: "Transcribe" or "Translate to English".
        language_choice: Key into LANGUAGE_CODES ("Auto-detect" maps to None).
        timestamp_choice: Whether to also produce per-chunk timestamps.
        beam_size: Number of beams for generation (1 = greedy decoding).

    Returns:
        A 3-tuple (text, timestamp_text, visibility_update): the transcript,
        a newline-joined timestamp listing ("" when timestamps are off), and
        a gr.update toggling the timestamp textbox's visibility.
    """
    model_name = MODEL_OPTIONS[model_choice]
    task = "translate" if task_choice == "Translate to English" else "transcribe"
    language = LANGUAGE_CODES[language_choice]

    # Cache pipelines on the function object so repeated calls with the same
    # model don't re-load (and potentially re-download) the checkpoint each
    # time; the original rebuilt the pipeline on every invocation.
    cache = transcribe_audio.__dict__.setdefault("_pipe_cache", {})
    pipe = cache.get(model_name)
    if pipe is None:
        pipe = pipeline(
            "automatic-speech-recognition",
            model=model_name,
            chunk_length_s=30,
            device=0 if torch.cuda.is_available() else -1,
        )
        cache[model_name] = pipe

    generate_kwargs = {
        "task": task,
        "num_beams": beam_size,
    }
    # A forced source language only applies when transcribing; translation
    # always targets English (same condition as the original).
    if language and task == "transcribe":
        generate_kwargs["language"] = language

    # Single call covers both branches; return_timestamps only changes
    # whether "chunks" appear in the result.
    result = pipe(
        audio_file,
        generate_kwargs=generate_kwargs,
        return_timestamps=bool(timestamp_choice),
    )

    if not timestamp_choice:
        return result["text"], "", gr.update(visible=False)

    def _fmt(t):
        # The ASR pipeline reports the last chunk's end timestamp as None
        # when it can't be determined; the original f-string raised
        # TypeError on None with the :.2f format spec.
        return f"{t:.2f}s" if t is not None else "?"

    lines = []
    for chunk in result.get("chunks", []):
        start, end = chunk["timestamp"]
        lines.append(f"[{_fmt(start)} -> {_fmt(end)}] {chunk['text']}")
    return result["text"], "\n".join(lines), gr.update(visible=True)
# --- Gradio UI: layout, event wiring, and static help text ---------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎵 Audio Transcription & Translation")
    gr.Markdown("Upload an audio file or use your microphone to transcribe or translate speech.")
    with gr.Row():
        with gr.Column():
            # "filepath" hands transcribe_audio a path on disk rather than
            # raw audio data.
            audio_input = gr.Audio(
                label="Audio Input",
                type="filepath"
            )
            # Checkpoint picker; keys of MODEL_OPTIONS are the labels.
            model_choice = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value="Whisper Tiny (Fastest)",
                label="Model Selection"
            )
            task_choice = gr.Radio(
                choices=["Transcribe", "Translate to English"],
                value="Transcribe",
                label="Task"
            )
            # Source-language hint; ignored by transcribe_audio when the
            # task is translation.
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="Auto-detect",
                label="Language (for transcription)"
            )
            # When checked, transcribe_audio requests per-chunk timestamps
            # and reveals the timestamp textbox.
            timestamp_choice = gr.Checkbox(
                label="Include Timestamps",
                value=False
            )
            beam_size = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                label="Beam Size (Higher = Better Accuracy but Slower)"
            )
        with gr.Column():
            text_output = gr.Textbox(
                lines=15,
                label="Transcription",
                interactive=False
            )
            # Hidden until a timestamped run makes it visible via the
            # gr.update returned by transcribe_audio.
            timestamp_output = gr.Textbox(
                lines=8,
                label="Timestamps (if enabled)",
                interactive=False,
                visible=False
            )
    transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
    # transcribe_audio returns (text, timestamp_text, visibility_update).
    # NOTE(review): timestamp_output appears twice on purpose — slot 2
    # receives the timestamp text, slot 3 the gr.update toggling its
    # visibility. Some Gradio versions reject duplicate output components;
    # confirm against the version this Space pins.
    transcribe_btn.click(
        transcribe_audio,
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
        outputs=[text_output, timestamp_output, timestamp_output]
    )
    # NOTE(review): these example files are not visible in this repo view —
    # verify example_audio_1.wav … example_audio_4.wav actually ship with
    # the Space, otherwise the Examples widget will show broken entries.
    gr.Examples(
        examples=[
            ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect", False, 1],
            ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English", False, 1],
            ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect", False, 1],
            ["example_audio_4.wav", "Whisper Large (Highest Accuracy)", "Transcribe", "Spanish", True, 3]
        ],
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
    )
    # Static help text shown beneath the controls.
    gr.Markdown("### Features")
    gr.Markdown("- **Model Selection**: Choose from 6 different Whisper models with speed/accuracy tradeoffs")
    gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English")
    gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy")
    gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone")
    gr.Markdown("- **Timestamps**: Option to include word-level timestamps")
    gr.Markdown("- **Beam Search**: Adjustable beam size for better accuracy")
    gr.Markdown("### Model Information")
    gr.Markdown("""
| Model | Parameters | Speed | Best For |
|-------|------------|-------|----------|
| Whisper Tiny | 39M | Fastest | Quick transcriptions, low resources |
| Whisper Base | 74M | Fast | Balanced performance |
| Whisper Small | 244M | Medium | Better accuracy |
| Whisper Medium | 769M | Slow | High accuracy transcriptions |
| Whisper Large | 1.5B | Slower | Very high accuracy |
| Whisper Large-v2 | 1.5B | Slower | Latest improvements |
""")
    gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC")
    gr.Markdown("- **Note**: First transcription may take 10-60 seconds (model loading)")

if __name__ == "__main__":
    demo.launch()