Whisper Transcriber Bot committed on
Commit
7f464b5
·
1 Parent(s): 14efc79

Simplify to minimal clean interface - default HF style

Browse files
Files changed (1) hide show
  1. app.py +44 -166
app.py CHANGED
@@ -1,8 +1,7 @@
1
  import gradio as gr
2
  import os
3
  import tempfile
4
- from pathlib import Path
5
- from typing import Optional, Tuple, List
6
  import logging
7
 
8
  from utils.audio_processor import AudioProcessor
@@ -32,20 +31,7 @@ class WhisperTranscriberApp:
32
  enable_diarization: bool,
33
  progress=gr.Progress()
34
  ) -> Tuple[str, str, str, str, str]:
35
- """
36
- Main processing function for transcription
37
-
38
- Args:
39
- file_input: Uploaded file
40
- url_input: URL input (YouTube or direct link)
41
- model_size: Whisper model size
42
- language: Language code
43
- enable_diarization: Whether to enable speaker diarization
44
- progress: Gradio progress tracker
45
-
46
- Returns:
47
- Tuple of (preview_text, srt_file, vtt_file, txt_file, json_file)
48
- """
49
  temp_files = []
50
 
51
  try:
@@ -53,30 +39,18 @@ class WhisperTranscriberApp:
53
  progress(0.05, desc="Processing input...")
54
 
55
  if url_input and url_input.strip():
56
- # Download from URL
57
  audio_file, source_type = MediaDownloader.download_media(
58
  url_input,
59
  progress_callback=lambda msg: progress(0.1, desc=msg)
60
  )
61
  temp_files.append(audio_file)
62
- logger.info(f"Downloaded from {source_type}: {audio_file}")
63
  elif file_input is not None:
64
- # Use uploaded file
65
  audio_file = file_input.name
66
- logger.info(f"Using uploaded file: {audio_file}")
67
  else:
68
  raise ValueError("Please provide either a file or a URL")
69
 
70
- # Step 2: Check file format and extract audio if needed
71
  progress(0.15, desc="Extracting audio...")
72
-
73
- if not AudioProcessor.is_supported_file(audio_file):
74
- raise ValueError(
75
- f"Unsupported file format. Supported: "
76
- f"{AudioProcessor.SUPPORTED_FORMATS['audio'] + AudioProcessor.SUPPORTED_FORMATS['video']}"
77
- )
78
-
79
- # Extract/convert audio to WAV for processing
80
  processed_audio = AudioProcessor.extract_audio(
81
  audio_file,
82
  output_format='wav',
@@ -84,12 +58,9 @@ class WhisperTranscriberApp:
84
  )
85
  temp_files.append(processed_audio)
86
 
87
- # Get file info
88
  duration = AudioProcessor.get_audio_duration(processed_audio)
89
- file_size = AudioProcessor.get_file_size_mb(processed_audio)
90
- logger.info(f"Audio duration: {duration:.2f}s, Size: {file_size:.2f}MB")
91
 
92
- # Step 3: Load Whisper model if needed
93
  if self.transcriber is None or self.current_model != model_size:
94
  progress(0.25, desc=f"Loading Whisper {model_size} model...")
95
  self.transcriber = WhisperTranscriber(model_size=model_size)
@@ -98,30 +69,25 @@ class WhisperTranscriberApp:
98
  )
99
  self.current_model = model_size
100
 
101
- # Step 4: Chunk audio if necessary
102
- progress(0.35, desc="Preparing audio for transcription...")
103
  chunks = AudioProcessor.chunk_audio(
104
  processed_audio,
105
  progress_callback=lambda msg: progress(0.4, desc=msg)
106
  )
107
-
108
- # Add chunk files to cleanup list
109
  for chunk_file, _ in chunks:
110
  if chunk_file != processed_audio:
111
  temp_files.append(chunk_file)
112
 
113
  # Step 5: Transcribe
114
  progress(0.45, desc="Transcribing audio...")
115
-
116
  if len(chunks) == 1:
117
- # Single chunk transcription
118
  transcription_result = self.transcriber.transcribe(
119
  chunks[0][0],
120
  language=language,
121
  progress_callback=lambda msg: progress(0.65, desc=msg)
122
  )
123
  else:
124
- # Multi-chunk transcription
125
  transcription_result = self.transcriber.transcribe_chunks(
126
  chunks,
127
  language=language,
@@ -130,24 +96,20 @@ class WhisperTranscriberApp:
130
 
131
  progress(0.70, desc="Transcription complete!")
132
 
133
- # Step 6: Speaker diarization (optional)
134
  speaker_labels = None
135
  if enable_diarization:
136
  progress(0.75, desc="Performing speaker diarization...")
137
-
138
  if not SpeakerDiarizer.is_available():
139
- logger.warning("HF_TOKEN not set, skipping diarization")
140
  progress(0.75, desc="Skipping diarization (HF_TOKEN not set)")
141
  else:
142
  try:
143
  if self.diarizer is None:
144
  self.diarizer = SpeakerDiarizer()
145
-
146
  diarization_result = self.diarizer.diarize(
147
  processed_audio,
148
  progress_callback=lambda msg: progress(0.85, desc=msg)
149
  )
150
-
151
  speaker_labels = self.diarizer.align_with_transcription(
152
  diarization_result,
153
  transcription_result,
@@ -155,11 +117,9 @@ class WhisperTranscriberApp:
155
  )
156
  except Exception as e:
157
  logger.error(f"Diarization failed: {e}")
158
- progress(0.9, desc=f"Diarization failed: {str(e)[:50]}")
159
 
160
- # Step 7: Generate output files
161
  progress(0.92, desc="Generating output files...")
162
-
163
  output_prefix = tempfile.mktemp(prefix="whisper_output_")
164
  outputs = SubtitleFormatter.generate_all_formats(
165
  transcription_result,
@@ -167,22 +127,16 @@ class WhisperTranscriberApp:
167
  speaker_labels
168
  )
169
 
170
- # Step 8: Prepare preview
171
- preview_text = f"""
172
- **Transcription Complete!**
173
 
174
  **Language:** {transcription_result['language']}
175
  **Duration:** {duration:.2f} seconds
176
  **Model Used:** {model_size}
177
- **Diarization:** {'Enabled' if speaker_labels else 'Disabled'}
178
 
179
- **Preview (first 500 characters):**
180
- {transcription_result['text'][:500]}...
181
- """
182
 
183
  progress(1.0, desc="Done!")
184
-
185
- # Cleanup temporary files
186
  AudioProcessor.cleanup_temp_files(*temp_files)
187
 
188
  return (
@@ -195,123 +149,47 @@ class WhisperTranscriberApp:
195
 
196
  except Exception as e:
197
  logger.error(f"Processing failed: {e}")
198
- # Cleanup on error
199
  AudioProcessor.cleanup_temp_files(*temp_files)
200
  raise gr.Error(f"Processing failed: {str(e)}")
201
 
202
 
203
- def create_interface():
204
- """Create and configure Gradio interface"""
205
-
206
- app = WhisperTranscriberApp()
207
-
208
- # Get available options
209
- model_choices = WhisperTranscriber.get_available_models()
210
- language_choices = WhisperTranscriber.get_language_list()
211
 
212
- with gr.Blocks(theme=gr.themes.Soft(), title="Whisper Transcriber") as demo:
213
- gr.Markdown(
214
- """
215
- # 🎤 Whisper Transcriber
216
- Generate accurate subtitles and transcripts from audio/video files using OpenAI Whisper.
217
- """
218
- )
219
 
220
- with gr.Tab("Transcribe"):
221
- with gr.Row():
222
- with gr.Column():
223
- file_input = gr.File(
224
- label="📁 Upload Audio/Video File",
225
- file_types=['audio', 'video']
226
- )
227
 
228
- url_input = gr.Textbox(
229
- label="🔗 Or Paste URL (YouTube or direct link)",
230
- placeholder="https://www.youtube.com/watch?v=... or https://example.com/audio.mp3"
231
- )
232
-
233
- model_size = gr.Dropdown(
234
- choices=model_choices,
235
- value='small',
236
- label="🎯 Model Size"
237
- )
238
-
239
- language = gr.Dropdown(
240
- choices=[(f"{v} ({k})", k) for k, v in language_choices.items()],
241
- value='auto',
242
- label="🌍 Language"
243
- )
244
-
245
- enable_diarization = gr.Checkbox(
246
- label="👥 Enable Speaker Diarization",
247
- value=False
248
- )
249
-
250
- process_btn = gr.Button("🚀 Generate Transcription", variant="primary")
251
-
252
- with gr.Column():
253
- preview_output = gr.Markdown(label="📄 Preview")
254
-
255
- srt_output = gr.File(label="SRT File")
256
- vtt_output = gr.File(label="VTT File")
257
- txt_output = gr.File(label="TXT File")
258
- json_output = gr.File(label="JSON File")
259
-
260
- with gr.Tab("Help"):
261
- gr.Markdown(
262
- """
263
- ## 📚 How to Use
264
-
265
- 1. **Upload a file** or **paste a URL** (YouTube or direct media link)
266
- 2. **Select model size**: Tiny (fast), Small (balanced), Medium (accurate)
267
- 3. **Choose language**: Auto-detect or select manually
268
- 4. **Enable diarization** (optional): Identifies different speakers
269
- 5. Click **Generate Transcription**
270
- 6. **Download** your preferred format(s)
271
-
272
- ## 📋 Supported Formats
273
- **Audio:** MP3, WAV, M4A, FLAC, AAC, OGG, WMA
274
- **Video:** MP4, AVI, MKV, MOV, WMV, WebM, FLV
275
-
276
- ## ⚙️ Features
277
- - ✅ Auto language detection (99+ languages)
278
- - ✅ Multiple output formats (SRT, VTT, TXT, JSON)
279
- - ✅ Word-level timestamps in JSON
280
- - ✅ Large file chunking (30-min segments)
281
- - ✅ Optional speaker identification
282
- - ✅ Public API endpoint
283
-
284
- ## 💡 Tips
285
- - Use **Small model** for most cases
286
- - **Diarization** requires HF_TOKEN (Space settings)
287
- - Large files are automatically chunked
288
- - Processing time varies by model and file length
289
- """
290
  )
291
-
292
- # Wire up the button
293
- process_btn.click(
294
- fn=app.process_media,
295
- inputs=[
296
- file_input,
297
- url_input,
298
- model_size,
299
- language,
300
- enable_diarization
301
- ],
302
- outputs=[
303
- preview_output,
304
- srt_output,
305
- vtt_output,
306
- txt_output,
307
- json_output
308
- ]
309
- )
310
-
311
- return demo
312
-
313
 
314
  if __name__ == "__main__":
315
- demo = create_interface()
316
- demo.queue() # Enable queuing for better handling of concurrent requests
317
  demo.launch()
 
1
  import gradio as gr
2
  import os
3
  import tempfile
4
+ from typing import Optional, Tuple
 
5
  import logging
6
 
7
  from utils.audio_processor import AudioProcessor
 
31
  enable_diarization: bool,
32
  progress=gr.Progress()
33
  ) -> Tuple[str, str, str, str, str]:
34
+ """Main processing function for transcription"""
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  temp_files = []
36
 
37
  try:
 
39
  progress(0.05, desc="Processing input...")
40
 
41
  if url_input and url_input.strip():
 
42
  audio_file, source_type = MediaDownloader.download_media(
43
  url_input,
44
  progress_callback=lambda msg: progress(0.1, desc=msg)
45
  )
46
  temp_files.append(audio_file)
 
47
  elif file_input is not None:
 
48
  audio_file = file_input.name
 
49
  else:
50
  raise ValueError("Please provide either a file or a URL")
51
 
52
+ # Step 2: Extract audio
53
  progress(0.15, desc="Extracting audio...")
 
 
 
 
 
 
 
 
54
  processed_audio = AudioProcessor.extract_audio(
55
  audio_file,
56
  output_format='wav',
 
58
  )
59
  temp_files.append(processed_audio)
60
 
 
61
  duration = AudioProcessor.get_audio_duration(processed_audio)
 
 
62
 
63
+ # Step 3: Load model
64
  if self.transcriber is None or self.current_model != model_size:
65
  progress(0.25, desc=f"Loading Whisper {model_size} model...")
66
  self.transcriber = WhisperTranscriber(model_size=model_size)
 
69
  )
70
  self.current_model = model_size
71
 
72
+ # Step 4: Chunk audio
73
+ progress(0.35, desc="Preparing audio...")
74
  chunks = AudioProcessor.chunk_audio(
75
  processed_audio,
76
  progress_callback=lambda msg: progress(0.4, desc=msg)
77
  )
 
 
78
  for chunk_file, _ in chunks:
79
  if chunk_file != processed_audio:
80
  temp_files.append(chunk_file)
81
 
82
  # Step 5: Transcribe
83
  progress(0.45, desc="Transcribing audio...")
 
84
  if len(chunks) == 1:
 
85
  transcription_result = self.transcriber.transcribe(
86
  chunks[0][0],
87
  language=language,
88
  progress_callback=lambda msg: progress(0.65, desc=msg)
89
  )
90
  else:
 
91
  transcription_result = self.transcriber.transcribe_chunks(
92
  chunks,
93
  language=language,
 
96
 
97
  progress(0.70, desc="Transcription complete!")
98
 
99
+ # Step 6: Diarization (optional)
100
  speaker_labels = None
101
  if enable_diarization:
102
  progress(0.75, desc="Performing speaker diarization...")
 
103
  if not SpeakerDiarizer.is_available():
 
104
  progress(0.75, desc="Skipping diarization (HF_TOKEN not set)")
105
  else:
106
  try:
107
  if self.diarizer is None:
108
  self.diarizer = SpeakerDiarizer()
 
109
  diarization_result = self.diarizer.diarize(
110
  processed_audio,
111
  progress_callback=lambda msg: progress(0.85, desc=msg)
112
  )
 
113
  speaker_labels = self.diarizer.align_with_transcription(
114
  diarization_result,
115
  transcription_result,
 
117
  )
118
  except Exception as e:
119
  logger.error(f"Diarization failed: {e}")
 
120
 
121
+ # Step 7: Generate outputs
122
  progress(0.92, desc="Generating output files...")
 
123
  output_prefix = tempfile.mktemp(prefix="whisper_output_")
124
  outputs = SubtitleFormatter.generate_all_formats(
125
  transcription_result,
 
127
  speaker_labels
128
  )
129
 
130
+ preview_text = f"""**Transcription Complete!**
 
 
131
 
132
  **Language:** {transcription_result['language']}
133
  **Duration:** {duration:.2f} seconds
134
  **Model Used:** {model_size}
 
135
 
136
+ **Preview:**
137
+ {transcription_result['text'][:500]}..."""
 
138
 
139
  progress(1.0, desc="Done!")
 
 
140
  AudioProcessor.cleanup_temp_files(*temp_files)
141
 
142
  return (
 
149
 
150
  except Exception as e:
151
  logger.error(f"Processing failed: {e}")
 
152
  AudioProcessor.cleanup_temp_files(*temp_files)
153
  raise gr.Error(f"Processing failed: {str(e)}")
154
 
155
 
156
+ # Create app instance
157
+ app = WhisperTranscriberApp()
 
 
 
 
 
 
158
 
159
+ # Get available options
160
+ model_choices = WhisperTranscriber.get_available_models()
161
+ language_choices = WhisperTranscriber.get_language_list()
 
 
 
 
162
 
163
+ # Create interface
164
+ with gr.Blocks(title="Whisper Transcriber") as demo:
165
+ gr.Markdown("# 🎤 Whisper Transcriber\nGenerate subtitles from audio/video using OpenAI Whisper")
 
 
 
 
166
 
167
+ with gr.Row():
168
+ with gr.Column():
169
+ file_input = gr.File(label="Upload Audio/Video File")
170
+ url_input = gr.Textbox(label="Or Paste URL", placeholder="YouTube or direct link")
171
+ model_size = gr.Dropdown(choices=model_choices, value='small', label="Model Size")
172
+ language = gr.Dropdown(
173
+ choices=[(f"{v} ({k})", k) for k, v in language_choices.items()],
174
+ value='auto',
175
+ label="Language"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  )
177
+ enable_diarization = gr.Checkbox(label="Enable Speaker Diarization", value=False)
178
+ btn = gr.Button("Generate Transcription", variant="primary")
179
+
180
+ with gr.Column():
181
+ preview = gr.Markdown(label="Preview")
182
+ srt_file = gr.File(label="SRT File")
183
+ vtt_file = gr.File(label="VTT File")
184
+ txt_file = gr.File(label="TXT File")
185
+ json_file = gr.File(label="JSON File")
186
+
187
+ btn.click(
188
+ fn=app.process_media,
189
+ inputs=[file_input, url_input, model_size, language, enable_diarization],
190
+ outputs=[preview, srt_file, vtt_file, txt_file, json_file]
191
+ )
 
 
 
 
 
 
 
192
 
193
  if __name__ == "__main__":
194
+ demo.queue()
 
195
  demo.launch()