# app.py — Whisper transcription/translation Gradio demo
# (Hugging Face Space page residue — "AiCoderv2 / Update app.py / 494e59f verified" —
# converted to a comment so the file parses as Python.)
from transformers import pipeline
import gradio as gr
import torch
# Display label -> Hugging Face model id for the ASR pipeline.
# Ordered smallest/fastest to largest/most accurate; the label is what the
# user picks in the dropdown, the id is what `pipeline(model=...)` loads.
MODEL_OPTIONS = {
    "Whisper Tiny (Fastest)": "openai/whisper-tiny",
    "Whisper Base (Balanced)": "openai/whisper-base",
    "Whisper Small (Better Accuracy)": "openai/whisper-small",
    "Whisper Medium (High Accuracy)": "openai/whisper-medium",
    "Whisper Large (Highest Accuracy)": "openai/whisper-large",  # New model
    "Whisper Large-v2 (Latest)": "openai/whisper-large-v2"  # New model
}
# Display label -> language code passed to Whisper's generate_kwargs.
# None means "Auto-detect": no language hint is sent and Whisper infers
# the spoken language itself (see transcribe_audio).
LANGUAGE_CODES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Dutch": "nl"
}
def transcribe_audio(audio_file, model_choice, task_choice, language_choice, timestamp_choice, beam_size):
# Initialize the pipeline with selected model
model_name = MODEL_OPTIONS[model_choice]
task = "translate" if task_choice == "Translate to English" else "transcribe"
language = LANGUAGE_CODES[language_choice]
# Create pipeline
pipe = pipeline(
"automatic-speech-recognition",
model=model_name,
chunk_length_s=30,
device=0 if torch.cuda.is_available() else -1
)
# Generate kwargs for the pipeline
generate_kwargs = {
"task": task,
"num_beams": beam_size
}
if language and task == "transcribe":
generate_kwargs["language"] = language
# Process audio file
if timestamp_choice:
result = pipe(
audio_file,
generate_kwargs=generate_kwargs,
return_timestamps=True
)
timestamp_text = "\n".join([
f"[{chunk['timestamp'][0]:.2f}s -> {chunk['timestamp'][1]:.2f}s] {chunk['text']}"
for chunk in result.get("chunks", [])
])
return result["text"], timestamp_text, gr.update(visible=True)
else:
result = pipe(
audio_file,
generate_kwargs=generate_kwargs,
return_timestamps=False
)
return result["text"], "", gr.update(visible=False)
# --- UI layout: inputs on the left, transcription outputs on the right ---
with gr.Blocks() as demo:
    gr.Markdown("# 🎵 Audio Transcription & Translation")
    gr.Markdown("Upload an audio file or use your microphone to transcribe or translate speech.")
    with gr.Row():
        with gr.Column():
            # filepath mode: transcribe_audio receives a path string, not raw samples
            audio_input = gr.Audio(
                label="Audio Input",
                type="filepath"
            )
            # Updated model selection with new models
            model_choice = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value="Whisper Tiny (Fastest)",
                label="Model Selection"
            )
            task_choice = gr.Radio(
                choices=["Transcribe", "Translate to English"],
                value="Transcribe",
                label="Task"
            )
            # Extended language options; only used for the transcribe task
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="Auto-detect",
                label="Language (for transcription)"
            )
            # New features
            timestamp_choice = gr.Checkbox(
                label="Include Timestamps",
                value=False
            )
            beam_size = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                label="Beam Size (Higher = Better Accuracy but Slower)"
            )
        with gr.Column():
            text_output = gr.Textbox(
                lines=15,
                label="Transcription",
                interactive=False
            )
            # New output for timestamps; hidden until the handler makes it visible
            timestamp_output = gr.Textbox(
                lines=8,
                label="Timestamps (if enabled)",
                interactive=False,
                visible=False
            )
    transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
    # NOTE(review): timestamp_output appears twice — the 2nd return value fills
    # its text and the 3rd (a gr.update) toggles its visibility. Some Gradio
    # versions reject duplicate output components; confirm against the pinned
    # gradio version.
    transcribe_btn.click(
        transcribe_audio,
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
        outputs=[text_output, timestamp_output, timestamp_output]
    )
    # NOTE(review): these example audio files must exist alongside app.py in
    # the Space repo, otherwise the Examples panel fails to load — verify.
    gr.Examples(
        examples=[
            ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect", False, 1],
            ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English", False, 1],
            ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect", False, 1],
            ["example_audio_4.wav", "Whisper Large (Highest Accuracy)", "Transcribe", "Spanish", True, 3]
        ],
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
    )
    gr.Markdown("### Features")
    gr.Markdown("- **Model Selection**: Choose from 6 different Whisper models with speed/accuracy tradeoffs")
    gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English")
    gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy")
    gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone")
    gr.Markdown("- **Timestamps**: Option to include word-level timestamps")
    gr.Markdown("- **Beam Search**: Adjustable beam size for better accuracy")
    gr.Markdown("### Model Information")
    gr.Markdown("""
| Model | Parameters | Speed | Best For |
|-------|------------|-------|----------|
| Whisper Tiny | 39M | Fastest | Quick transcriptions, low resources |
| Whisper Base | 74M | Fast | Balanced performance |
| Whisper Small | 244M | Medium | Better accuracy |
| Whisper Medium | 769M | Slow | High accuracy transcriptions |
| Whisper Large | 1.5B | Slower | Very high accuracy |
| Whisper Large-v2 | 1.5B | Slower | Latest improvements |
""")
    gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC")
    gr.Markdown("- **Note**: First transcription may take 10-60 seconds (model loading)")
if __name__ == "__main__":
    demo.launch()