AiCoderv2 committed on
Commit
ec0d1b9
·
verified ·
1 Parent(s): 81878e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -91
app.py CHANGED
@@ -1,133 +1,188 @@
1
  from transformers import pipeline
2
  import gradio as gr
3
- import numpy as np
4
 
5
- # Model options
6
  MODEL_OPTIONS = {
7
  "Whisper Tiny (Fastest)": "openai/whisper-tiny",
8
  "Whisper Base (Balanced)": "openai/whisper-base",
9
  "Whisper Small (Better Accuracy)": "openai/whisper-small",
10
- "Whisper Medium (High Accuracy)": "openai/whisper-medium"
 
 
11
  }
12
 
13
- # Global variable for the current model
14
- current_model = None
15
- current_model_name = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
def load_model(model_choice):
    """Return the speech-recognition pipeline for the selected model.

    The most recently built pipeline is kept in the module globals
    ``current_model`` / ``current_model_name`` and handed back unchanged
    while the same model stays selected.

    Args:
        model_choice: Key of ``MODEL_OPTIONS`` (the label shown in the UI).

    Returns:
        The pipeline object for the requested model.

    Raises:
        KeyError: If ``model_choice`` is not a key of ``MODEL_OPTIONS``.
    """
    global current_model, current_model_name

    requested_name = MODEL_OPTIONS[model_choice]

    # Same model as last time: reuse the pipeline that is already loaded.
    if requested_name == current_model_name:
        return current_model

    current_model = pipeline("automatic-speech-recognition", model=requested_name)
    current_model_name = requested_name
    return current_model
26
-
27
def transcribe_audio(audio, model_choice, task_choice, language_choice):
    """Transcribe or translate audio delivered by a ``type="numpy"`` Gradio input.

    Args:
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when nothing was
            uploaded or recorded.
        model_choice: Label of the model to use; forwarded to ``load_model``.
        task_choice: ``"Translate to English"`` selects translation; any
            other value selects transcription.
        language_choice: ``"Auto-detect"`` or one of the language names in
            the mapping below.

    Returns:
        The recognised text, or a human-readable message when no audio was
        supplied or when anything fails during processing.
    """
    if audio is None:
        return "No audio provided. Please upload an audio file or record using the microphone."

    # Imported here so the function does not depend on a module-level alias.
    import numpy as np

    try:
        # Load the selected model
        asr = load_model(model_choice)

        sr, data = audio
        samples = np.asarray(data)

        # The pipeline expects floating-point samples; integer PCM is scaled
        # into [-1, 1] using the range of its own dtype.
        if np.issubdtype(samples.dtype, np.integer):
            full_scale = float(np.iinfo(samples.dtype).max)
            samples = samples.astype(np.float32) / full_scale
        else:
            samples = samples.astype(np.float32)

        # The pipeline accepts single-channel audio only.
        # NOTE(review): assumes multi-channel input is (samples, channels) - confirm.
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # Set task (transcribe or translate)
        if task_choice == "Translate to English":
            generate_kwargs = {"task": "translate"}
        else:
            generate_kwargs = {"task": "transcribe"}

        # Set language if specified
        if language_choice != "Auto-detect":
            language_map = {
                "English": "en",
                "Spanish": "es",
                "French": "fr",
                "German": "de",
                "Italian": "it",
                "Portuguese": "pt",
                "Russian": "ru",
                "Chinese": "zh",
                "Japanese": "ja",
                "Korean": "ko",
            }
            generate_kwargs["language"] = language_map[language_choice]

        # Hand over the sample rate together with the samples so the pipeline
        # can resample; a bare array would be assumed to already be at the
        # model's rate.
        result = asr(
            {"sampling_rate": sr, "raw": samples},
            generate_kwargs=generate_kwargs,
        )
        return result["text"]
    except Exception as e:
        # Deliberate catch-all: the UI shows the message instead of crashing.
        return f"Error during transcription: {str(e)}"
68
 
69
- # Gradio interface
70
- with gr.Blocks(title="Advanced Speech to Text") as demo:
71
- gr.Markdown("# 🎵 Advanced Speech to Text Transcription")
72
- gr.Markdown("Convert audio to text using OpenAI's Whisper models with multiple options")
73
 
74
  with gr.Row():
75
  with gr.Column():
76
  audio_input = gr.Audio(
77
- sources=["upload", "microphone"],
78
- type="numpy",
79
- label="Audio Input"
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  )
81
 
82
- with gr.Group():
83
- model_choice = gr.Dropdown(
84
- choices=list(MODEL_OPTIONS.keys()),
85
- value="Whisper Tiny (Fastest)",
86
- label="Model Selection"
87
- )
88
-
89
- task_choice = gr.Radio(
90
- choices=["Transcribe", "Translate to English"],
91
- value="Transcribe",
92
- label="Task"
93
- )
94
-
95
- language_choice = gr.Dropdown(
96
- choices=["Auto-detect", "English", "Spanish", "French", "German",
97
- "Italian", "Portuguese", "Russian", "Chinese", "Japanese", "Korean"],
98
- value="Auto-detect",
99
- label="Language (for transcription)"
100
- )
101
 
102
- transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
 
 
 
 
103
 
 
 
 
 
 
 
 
 
104
  with gr.Column():
105
  text_output = gr.Textbox(
106
- lines=12,
107
  label="Transcription",
108
  interactive=False
109
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  transcribe_btn.click(
112
- transcribe_audio,
113
- inputs=[audio_input, model_choice, task_choice, language_choice],
114
- outputs=text_output
115
  )
116
 
117
  gr.Examples(
118
  examples=[
119
- ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect"],
120
- ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English"],
121
- ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect"]
 
122
  ],
123
- inputs=[audio_input, model_choice, task_choice, language_choice],
124
  )
125
 
126
  gr.Markdown("### Features")
127
- gr.Markdown("- **Model Selection**: Choose from 4 different Whisper models with speed/accuracy tradeoffs")
128
  gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English")
129
  gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy")
130
  gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone")
 
 
131
 
132
  gr.Markdown("### Model Information")
133
  gr.Markdown("""
@@ -137,10 +192,12 @@ with gr.Blocks(title="Advanced Speech to Text") as demo:
137
  | Whisper Base | 74M | Fast | Balanced performance |
138
  | Whisper Small | 244M | Medium | Better accuracy |
139
  | Whisper Medium | 769M | Slow | High accuracy transcriptions |
 
 
140
  """)
141
 
142
  gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC")
143
- gr.Markdown("- **Note**: First transcription may take 10-30 seconds (model loading)")
144
 
145
  if __name__ == "__main__":
146
  demo.launch()
 
1
import gradio as gr
import torch
from transformers import pipeline
 
3
 
4
# Label shown in the model dropdown -> Hugging Face Hub model id passed to
# the speech-recognition pipeline.
MODEL_OPTIONS: dict[str, str] = {
    "Whisper Tiny (Fastest)": "openai/whisper-tiny",
    "Whisper Base (Balanced)": "openai/whisper-base",
    "Whisper Small (Better Accuracy)": "openai/whisper-small",
    "Whisper Medium (High Accuracy)": "openai/whisper-medium",
    "Whisper Large (Highest Accuracy)": "openai/whisper-large",
    "Whisper Large-v2 (Latest)": "openai/whisper-large-v2",
}
13
 
14
+ # Language codes for Whisper
15
+ LANGUAGE_CODES = {
16
+ "Auto-detect": None,
17
+ "English": "en",
18
+ "Spanish": "es",
19
+ "French": "fr",
20
+ "German": "de",
21
+ "Italian": "it",
22
+ "Portuguese": "pt",
23
+ "Russian": "ru",
24
+ "Chinese": "zh",
25
+ "Japanese": "ja",
26
+ "Korean": "ko",
27
+ "Arabic": "ar",
28
+ "Hindi": "hi",
29
+ "Dutch": "nl"
30
+ }
31
 
32
def transcribe_audio(audio_file, model_choice, task_choice, language_choice):
    """Transcribe or translate one audio file with the selected Whisper model.

    Args:
        audio_file: Path of the audio file, handed to the pipeline as-is;
            ``None`` when the user supplied nothing.
        model_choice: Key of ``MODEL_OPTIONS``.
        task_choice: ``"Translate to English"`` selects translation; any
            other value selects transcription.
        language_choice: Key of ``LANGUAGE_CODES``. Only applied when
            transcribing; translation always lets the model detect it.

    Returns:
        The recognised text, or a short message when no audio was supplied.

    Raises:
        KeyError: If ``model_choice`` or ``language_choice`` is not a known
            key.
    """
    # Guard first: without it the pipeline is built and then fails on None.
    if audio_file is None:
        return "No audio provided. Please upload an audio file."

    model_name = MODEL_OPTIONS[model_choice]
    task = "translate" if task_choice == "Translate to English" else "transcribe"
    language = LANGUAGE_CODES[language_choice]

    # Use the first GPU when one is available, otherwise the CPU (-1).
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=0 if torch.cuda.is_available() else -1,
    )

    generate_kwargs = {"task": task}
    if language and task == "transcribe":
        generate_kwargs["language"] = language

    result = pipe(
        audio_file,
        generate_kwargs=generate_kwargs,
        return_timestamps=False,
    )
    return result["text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
+ with gr.Blocks() as demo:
61
+ gr.Markdown("# 🎵 Audio Transcription & Translation")
62
+ gr.Markdown("Upload an audio file or use your microphone to transcribe or translate speech.")
 
63
 
64
  with gr.Row():
65
  with gr.Column():
66
  audio_input = gr.Audio(
67
+ label="Audio Input",
68
+ type="filepath",
69
+ source="upload"
70
+ )
71
+
72
+ # Updated model selection with new models
73
+ model_choice = gr.Dropdown(
74
+ choices=list(MODEL_OPTIONS.keys()),
75
+ value="Whisper Tiny (Fastest)",
76
+ label="Model Selection"
77
+ )
78
+
79
+ task_choice = gr.Radio(
80
+ choices=["Transcribe", "Translate to English"],
81
+ value="Transcribe",
82
+ label="Task"
83
  )
84
 
85
+ # Extended language options
86
+ language_choice = gr.Dropdown(
87
+ choices=list(LANGUAGE_CODES.keys()),
88
+ value="Auto-detect",
89
+ label="Language (for transcription)"
90
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ # New features
93
+ timestamp_choice = gr.Checkbox(
94
+ label="Include Timestamps",
95
+ value=False
96
+ )
97
 
98
+ beam_size = gr.Slider(
99
+ minimum=1,
100
+ maximum=10,
101
+ value=1,
102
+ step=1,
103
+ label="Beam Size (Higher = Better Accuracy but Slower)"
104
+ )
105
+
106
  with gr.Column():
107
  text_output = gr.Textbox(
108
+ lines=15,
109
  label="Transcription",
110
  interactive=False
111
  )
112
+
113
+ # New output for timestamps
114
+ timestamp_output = gr.Textbox(
115
+ lines=8,
116
+ label="Timestamps (if enabled)",
117
+ interactive=False,
118
+ visible=False
119
+ )
120
+
121
+ transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
122
+
123
# Holds at most one entry (model name -> pipeline) so that switching models
# replaces the loaded checkpoint instead of accumulating several in memory.
_PIPELINE_CACHE = {}


def _get_pipeline(model_name):
    """Return the ASR pipeline for *model_name*, building it only when the model changes."""
    if model_name not in _PIPELINE_CACHE:
        _PIPELINE_CACHE.clear()
        _PIPELINE_CACHE[model_name] = pipeline(
            "automatic-speech-recognition",
            model=model_name,
            chunk_length_s=30,
            # First GPU when available, otherwise CPU (-1).
            device=0 if torch.cuda.is_available() else -1,
        )
    return _PIPELINE_CACHE[model_name]


def _format_seconds(seconds):
    """Render one chunk boundary; a missing boundary is shown as ``?``."""
    return "?" if seconds is None else f"{seconds:.2f}s"


def process_audio(audio_file, model_choice, task_choice, language_choice, timestamp_choice, beam_size):
    """Run speech recognition for the UI and prepare all three outputs.

    Args:
        audio_file: Path of the audio file, or ``None`` when nothing was
            supplied.
        model_choice: Key of ``MODEL_OPTIONS``.
        task_choice: ``"Translate to English"`` selects translation; any
            other value selects transcription.
        language_choice: Key of ``LANGUAGE_CODES``; only applied when
            transcribing.
        timestamp_choice: When truthy, timestamps are requested from the
            pipeline and the timestamp box is made visible.
        beam_size: Number of beams used during generation.

    Returns:
        A 3-tuple ``(text, timestamp_text, visibility_update)`` matching the
        three outputs wired to the button; ``timestamp_text`` is ``""`` when
        timestamps are disabled.

    Raises:
        KeyError: If ``model_choice`` or ``language_choice`` is not a known
            key.
    """
    if audio_file is None:
        return "No audio provided. Please upload an audio file.", "", gr.update(visible=False)

    task = "translate" if task_choice == "Translate to English" else "transcribe"
    language = LANGUAGE_CODES[language_choice]
    pipe = _get_pipeline(MODEL_OPTIONS[model_choice])

    generate_kwargs = {
        "task": task,
        # Cast defensively in case the slider delivers a float.
        "num_beams": int(beam_size),
    }
    if language and task == "transcribe":
        generate_kwargs["language"] = language

    want_timestamps = bool(timestamp_choice)
    result = pipe(
        audio_file,
        generate_kwargs=generate_kwargs,
        return_timestamps=want_timestamps,
    )

    if not want_timestamps:
        return result["text"], "", gr.update(visible=False)

    lines = []
    for chunk in result.get("chunks", []):
        start, end = chunk["timestamp"]
        # Either boundary may be None (e.g. audio cut off mid-segment), which
        # a bare ":.2f" format would reject.
        lines.append(f"[{_format_seconds(start)} -> {_format_seconds(end)}] {chunk['text']}")

    return result["text"], "\n".join(lines), gr.update(visible=True)
162
 
163
  transcribe_btn.click(
164
+ process_audio,
165
+ inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
166
+ outputs=[text_output, timestamp_output, timestamp_output]
167
  )
168
 
169
  gr.Examples(
170
  examples=[
171
+ ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect", False, 1],
172
+ ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English", False, 1],
173
+ ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect", False, 1],
174
+ ["example_audio_4.wav", "Whisper Large (Highest Accuracy)", "Transcribe", "Spanish", True, 3]
175
  ],
176
+ inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
177
  )
178
 
179
  gr.Markdown("### Features")
180
+ gr.Markdown("- **Model Selection**: Choose from 6 different Whisper models with speed/accuracy tradeoffs")
181
  gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English")
182
  gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy")
183
  gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone")
184
# The pipeline is called with return_timestamps=True, which yields one
# timestamp pair per segment rather than per word.
gr.Markdown("- **Timestamps**: Option to include segment-level timestamps")
185
+ gr.Markdown("- **Beam Search**: Adjustable beam size for better accuracy")
186
 
187
  gr.Markdown("### Model Information")
188
  gr.Markdown("""
 
192
  | Whisper Base | 74M | Fast | Balanced performance |
193
  | Whisper Small | 244M | Medium | Better accuracy |
194
  | Whisper Medium | 769M | Slow | High accuracy transcriptions |
195
+ | Whisper Large | 1.5B | Slower | Very high accuracy |
196
+ | Whisper Large-v2 | 1.5B | Slower | Latest improvements |
197
  """)
198
 
199
  gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC")
200
+ gr.Markdown("- **Note**: First transcription may take 10-60 seconds (model loading)")
201
 
202
  if __name__ == "__main__":
203
  demo.launch()