LT4Ryan committed on
Commit
5286b5d
·
verified ·
1 Parent(s): 805b912

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -38
app.py CHANGED
@@ -191,7 +191,7 @@ def play_segment(evt: gr.SelectData, raw_ts_list, current_audio_path):
191
 
192
  article = (
193
  "<div style='font-size: 1.1em;'>"
194
- "<p>AudioDog uses <code><a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2'>parakeet-tdt-0.6b-v2</a></code>, a 600-million-parameter model designed for high-quality English speech recognition.</p>"
195
  "<p><strong style='color: red; font-size: 1.2em;'>Key Features:</strong></p>"
196
  "<ul>"
197
  "<li>Automatic punctuation and capitalization</li>"
@@ -202,9 +202,7 @@ article = (
202
  "</div>"
203
  )
204
 
205
- examples = [
206
- ["data/example-yt_saTD1u8PorI.mp3"],
207
- ]
208
 
209
 
210
  # Define an NVIDIA-inspired theme
@@ -226,57 +224,100 @@ nvidia_theme = gr_themes.Default(
226
  font=[gr_themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
227
  ).set()
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  # Apply the custom theme
 
 
 
 
230
  with gr.Blocks(theme='nvidia-theme') as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
232
  with gr.Column(scale=1):
233
- gr.Image("pics/AD.jpg", show_label=False, show_download_button=False)
234
- with gr.Column(scale=3):
235
- model_display_name = MODEL_NAME.split('/')[-1] if '/' in MODEL_NAME else MODEL_NAME
236
- gr.Markdown(f"<h1 style='text-align: left;'>AudioDog, powered by {model_display_name}</h1>")
237
- gr.HTML(article)
238
-
239
 
240
  current_audio_path_state = gr.State(None)
241
  raw_timestamps_list_state = gr.State([])
 
 
242
 
243
- with gr.Tabs():
244
- with gr.TabItem("Audio File"):
245
- file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
246
- gr.Examples(examples=examples, inputs=[file_input], label="Example Audio Files (Click to Load)")
247
- file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")
248
-
249
- with gr.TabItem("Microphone"):
250
- mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
251
- mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")
252
-
253
- gr.Markdown("---")
254
- gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results (Click row to play segment)</strong></p>")
255
-
256
- # Define the DownloadButton *before* the DataFrame
257
- download_btn = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
258
-
259
- vis_timestamps_df = gr.DataFrame(
260
- headers=["Start (s)", "End (s)", "Segment"],
261
- datatype=["number", "number", "str"],
262
- wrap=True,
263
- label="Transcription Segments"
264
- )
265
 
266
- # selected_segment_player was defined after download_btn previously, keep it after df for layout
267
- selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)
 
 
 
268
 
269
  mic_transcribe_btn.click(
270
- fn=get_transcripts_and_raw_times,
271
  inputs=[mic_input],
272
- outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn],
273
  api_name="transcribe_mic"
274
  )
275
 
276
  file_transcribe_btn.click(
277
- fn=get_transcripts_and_raw_times,
278
  inputs=[file_input],
279
- outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn],
280
  api_name="transcribe_file"
281
  )
282
 
@@ -286,6 +327,13 @@ with gr.Blocks(theme='nvidia-theme') as demo:
286
  outputs=[selected_segment_player],
287
  )
288
 
 
 
 
 
 
 
 
289
  if __name__ == "__main__":
290
  print("Launching AudioDog...")
291
  demo.queue()
 
191
 
192
  article = (
193
  "<div style='font-size: 1.1em;'>"
194
+ "<p>AudioDog uses <code><a href='https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2'>parakeet-tdt-0.6b-v2</a></code>.</p>"
195
  "<p><strong style='color: red; font-size: 1.2em;'>Key Features:</strong></p>"
196
  "<ul>"
197
  "<li>Automatic punctuation and capitalization</li>"
 
202
  "</div>"
203
  )
204
 
205
+
 
 
206
 
207
 
208
  # Define an NVIDIA-inspired theme
 
224
  font=[gr_themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
225
  ).set()
226
 
227
+
228
+ # Helper to concatenate transcript segments
229
def get_full_transcript(vis_data):
    """Concatenate the segment text of every transcript row into one string.

    Args:
        vis_data: Iterable of rows. Only rows with exactly three cells are
            used, and the third cell (index 2) is taken as the segment text.
            May be ``None`` or empty.

    Returns:
        The segment texts joined by single spaces, or ``""`` when there is
        nothing to join.
    """
    if not vis_data:
        return ""

    segments = []
    for row in vis_data:
        # Rows that are not exactly three cells wide are ignored, as before.
        if len(row) != 3 or row[2] is None:
            continue
        # Coerce to str so a non-string cell cannot make " ".join() raise,
        # and strip so padded or empty cells do not leave double spaces.
        text = str(row[2]).strip()
        if text:
            segments.append(text)
    return " ".join(segments)
233
+
234
+ # Simple summary function (replace with a real model if needed)
235
def summarize_transcript(transcript):
    """Return a naive extractive summary of a transcript.

    This is a placeholder, not a model: it returns the first two sentences.
    When those are shorter than 40 characters it falls back to the first 200
    characters of the transcript, with "..." appended if text was cut off.

    Args:
        transcript: The full transcript text. May be ``None`` or empty.

    Returns:
        The summary string, or a fixed notice when there is no transcript
        text to summarize.
    """
    import re

    # Treat whitespace-only input like an empty transcript instead of
    # returning a blank "summary".
    text = transcript.strip() if transcript else ""
    if not text:
        return "No transcript available to summarize."

    # Split after sentence-ending punctuation on any whitespace run, so that
    # sentences separated by newlines or tabs are recognised too.
    sentences = re.split(r"(?<=[.!?])\s+", text)
    summary = " ".join(sentences[:2])

    # Two very short sentences make a poor summary; use a leading excerpt.
    if len(summary) < 40:
        summary = text[:200] + ("..." if len(text) > 200 else "")
    return summary
245
+
246
  # Apply the custom theme
247
+
248
+
249
+
250
+
251
  with gr.Blocks(theme='nvidia-theme') as demo:
252
+ model_display_name = MODEL_NAME.split('/')[-1] if '/' in MODEL_NAME else MODEL_NAME
253
+ # Embed image in description HTML, left-justified and 75% width
254
+ # Replace the original description_html variable with this one
255
+
256
+ description_html = f"""
257
+ <div style='display: flex; align-items: flex-start;'>
258
+ <img src='file=pics/AD.jpg' style='width: 75%; max-width: 300px; margin-right: 20px; float: left;' alt='AudioDog logo'>
259
+ <div>
260
+ <h1 style='text-align: left;'>AudioDog, powered by {model_display_name}</h1>
261
+ {article}
262
+ </div>
263
+ </div>
264
+ """
265
  with gr.Row():
266
+ # Left: description text with embedded image and upload audio
267
+ with gr.Column(scale=2):
268
+ gr.HTML(description_html)
269
+ with gr.Tabs():
270
+ with gr.TabItem("Audio File"):
271
+ file_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
272
+ file_transcribe_btn = gr.Button("Transcribe Uploaded File", variant="primary")
273
+ with gr.TabItem("Microphone"):
274
+ mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
275
+ mic_transcribe_btn = gr.Button("Transcribe Microphone Input", variant="primary")
276
+ # Right: transcript
277
  with gr.Column(scale=1):
278
+ transcript_box = gr.Textbox(label="Full Transcript", lines=15, interactive=False)
 
 
 
 
 
279
 
280
  current_audio_path_state = gr.State(None)
281
  raw_timestamps_list_state = gr.State([])
282
+ vis_data_state = gr.State([])
283
+ transcript_state = gr.State("")
284
 
285
+ with gr.Row():
286
+ # Left column: transcription results
287
+ with gr.Column(scale=2):
288
+ gr.Markdown("---")
289
+ gr.Markdown("<p><strong style='color: #FF0000; font-size: 1.2em;'>Transcription Results (Click row to play segment)</strong></p>")
290
+ download_btn = gr.DownloadButton(label="Download Transcript (CSV)", visible=False)
291
+ vis_timestamps_df = gr.DataFrame(
292
+ headers=["Start (s)", "End (s)", "Segment"],
293
+ datatype=["number", "number", "str"],
294
+ wrap=True,
295
+ label="Transcription Segments"
296
+ )
297
+ selected_segment_player = gr.Audio(label="Selected Segment", interactive=False)
298
+
299
+ # Right column: summary controls
300
+ with gr.Column(scale=1):
301
+ summary_btn = gr.Button("Summarize Transcript", variant="primary")
302
+ summary_box = gr.Textbox(label="Summary", lines=5, interactive=False)
 
 
 
 
303
 
304
+ # Transcribe button logic
305
def handle_transcribe(audio_path):
    """Transcribe an audio file and also produce the joined transcript text.

    Calls get_transcripts_and_raw_times and unpacks its four results, then
    returns them followed by the segment rows again and the transcript string
    built by get_full_transcript, i.e. six values in total.
    """
    results = get_transcripts_and_raw_times(audio_path)
    vis_data, raw_times, audio_path, download_btn_obj = results
    return (
        vis_data,
        raw_times,
        audio_path,
        download_btn_obj,
        vis_data,
        get_full_transcript(vis_data),
    )
309
 
310
  mic_transcribe_btn.click(
311
+ fn=handle_transcribe,
312
  inputs=[mic_input],
313
+ outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn, vis_data_state, transcript_box],
314
  api_name="transcribe_mic"
315
  )
316
 
317
  file_transcribe_btn.click(
318
+ fn=handle_transcribe,
319
  inputs=[file_input],
320
+ outputs=[vis_timestamps_df, raw_timestamps_list_state, current_audio_path_state, download_btn, vis_data_state, transcript_box],
321
  api_name="transcribe_file"
322
  )
323
 
 
327
  outputs=[selected_segment_player],
328
  )
329
 
330
+ # Summary button logic
331
+ summary_btn.click(
332
+ fn=summarize_transcript,
333
+ inputs=[transcript_box],
334
+ outputs=[summary_box],
335
+ )
336
+
337
  if __name__ == "__main__":
338
  print("Launching AudioDog...")
339
  demo.queue()