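"""Gradio app for transcribing and translating speech with OpenAI Whisper
checkpoints, served through the Hugging Face transformers ASR pipeline."""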
from transformers import pipeline
import gradio as gr
import torch
# Available Whisper checkpoints, ordered from fastest to most accurate
MODEL_OPTIONS = {
    "Whisper Tiny (Fastest)": "openai/whisper-tiny",
    "Whisper Base (Balanced)": "openai/whisper-base",
    "Whisper Small (Better Accuracy)": "openai/whisper-small",
    "Whisper Medium (High Accuracy)": "openai/whisper-medium",
    "Whisper Large (Highest Accuracy)": "openai/whisper-large",
    "Whisper Large-v2 (Latest)": "openai/whisper-large-v2"
}
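# Any Hugging Face automatic-speech-recognition checkpoint id can be listed
# above; note that the large checkpoints need several GB of (GPU) memory.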
# Language codes for Whisper (None = auto-detect)
LANGUAGE_CODES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Dutch": "nl"
}
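# Whisper itself supports roughly 99 languages; this dropdown exposes a
# common subset and can be extended with any ISO 639-1 code Whisper knows.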
# Cache pipelines so that switching back to a model already used in this
# session does not reload its weights on every call.
_PIPELINE_CACHE = {}


def get_pipeline(model_name):
    if model_name not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[model_name] = pipeline(
            "automatic-speech-recognition",
            model=model_name,
            chunk_length_s=30,  # process long audio in 30-second chunks
            device=0 if torch.cuda.is_available() else -1
        )
    return _PIPELINE_CACHE[model_name]


def transcribe_audio(audio_file, model_choice, task_choice, language_choice, timestamp_choice, beam_size):
    model_name = MODEL_OPTIONS[model_choice]
    task = "translate" if task_choice == "Translate to English" else "transcribe"
    language = LANGUAGE_CODES[language_choice]

    pipe = get_pipeline(model_name)

    # Options forwarded to the model's generate() call
    generate_kwargs = {
        "task": task,
        "num_beams": beam_size
    }
    # A source-language hint only applies when transcribing; translation
    # always targets English.
    if language and task == "transcribe":
        generate_kwargs["language"] = language

    if timestamp_choice:
        result = pipe(
            audio_file,
            generate_kwargs=generate_kwargs,
            return_timestamps=True
        )
        lines = []
        for chunk in result.get("chunks", []):
            start, end = chunk["timestamp"]
            # The last chunk's end timestamp can be None; fall back to its start.
            end = start if end is None else end
            lines.append(f"[{start:.2f}s -> {end:.2f}s] {chunk['text']}")
        # A single update fills the timestamp box and reveals it.
        return result["text"], gr.update(value="\n".join(lines), visible=True)
    else:
        result = pipe(
            audio_file,
            generate_kwargs=generate_kwargs,
            return_timestamps=False
        )
        return result["text"], gr.update(value="", visible=False)
with gr.Blocks() as demo:
    gr.Markdown("# 🎵 Audio Transcription & Translation")
    gr.Markdown("Upload an audio file or use your microphone to transcribe or translate speech.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Audio Input",
                type="filepath"
            )
            # Model selection (see the model table below for tradeoffs)
            model_choice = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value="Whisper Tiny (Fastest)",
                label="Model Selection"
            )
            task_choice = gr.Radio(
                choices=["Transcribe", "Translate to English"],
                value="Transcribe",
                label="Task"
            )
            # Source-language hint (used for transcription only)
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="Auto-detect",
                label="Language (for transcription)"
            )
            timestamp_choice = gr.Checkbox(
                label="Include Timestamps",
                value=False
            )
            beam_size = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                label="Beam Size (Higher = Better Accuracy but Slower)"
            )
        with gr.Column():
            text_output = gr.Textbox(
                lines=15,
                label="Transcription",
                interactive=False
            )
            # Timestamp output; hidden until timestamps are requested
            timestamp_output = gr.Textbox(
                lines=8,
                label="Timestamps (if enabled)",
                interactive=False,
                visible=False
            )
    transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
    transcribe_btn.click(
        transcribe_audio,
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
        # The handler returns the transcription text plus one gr.update()
        # that fills the timestamp box and toggles its visibility.
        outputs=[text_output, timestamp_output]
    )
    gr.Examples(
        examples=[
            ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect", False, 1],
            ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English", False, 1],
            ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect", False, 1],
            ["example_audio_4.wav", "Whisper Large (Highest Accuracy)", "Transcribe", "Spanish", True, 3]
        ],
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
    )
gr.Markdown("### Features")
gr.Markdown("- **Model Selection**: Choose from 6 different Whisper models with speed/accuracy tradeoffs")
gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English")
gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy")
gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone")
gr.Markdown("- **Timestamps**: Option to include word-level timestamps")
gr.Markdown("- **Beam Search**: Adjustable beam size for better accuracy")
gr.Markdown("### Model Information")
gr.Markdown("""
| Model | Parameters | Speed | Best For |
|-------|------------|-------|----------|
| Whisper Tiny | 39M | Fastest | Quick transcriptions, low resources |
| Whisper Base | 74M | Fast | Balanced performance |
| Whisper Small | 244M | Medium | Better accuracy |
| Whisper Medium | 769M | Slow | High accuracy transcriptions |
| Whisper Large | 1.5B | Slower | Very high accuracy |
| Whisper Large-v2 | 1.5B | Slower | Latest improvements |
""")
gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC")
gr.Markdown("- **Note**: First transcription may take 10-60 seconds (model loading)")
if __name__ == "__main__":
    demo.launch()