saadmannan committed on
Commit
0f00a09
·
1 Parent(s): 480869c

new feature added

Browse files
Files changed (1) hide show
  1. app.py +113 -15
app.py CHANGED
@@ -12,7 +12,9 @@ from pathlib import Path
12
  import json
13
  import os
14
  import tempfile
 
15
  from typing import Optional, Tuple, List, Dict
 
16
 
17
  from src.pipeline import VADDiarizationPipeline
18
  from src.utils import visualize_timeline, segment_to_rttm
@@ -38,6 +40,20 @@ except Exception as e:
38
  PIPELINE_READY = False
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def create_timeline_plot(segments: List[Dict], duration: float) -> plt.Figure:
42
  """Create a visual timeline plot of speaker segments."""
43
  fig, ax = plt.subplots(figsize=(12, 4))
@@ -92,19 +108,20 @@ def process_audio(
92
  audio_file,
93
  num_speakers: Optional[int] = None,
94
  vad_threshold: float = 0.5,
 
95
  progress=gr.Progress()
96
- ) -> Tuple[str, str, str, plt.Figure]:
97
  """
98
  Process audio file through the pipeline.
99
 
100
  Returns:
101
- Tuple of (summary_text, timeline_text, json_output, plot)
102
  """
103
  if audio_file is None:
104
- return "Please upload an audio file", "", "", None
105
 
106
  if not PIPELINE_READY:
107
- return "Pipeline not ready. Please set HF_TOKEN environment variable.", "", "", None
108
 
109
  try:
110
  progress(0.1, desc="Loading audio...")
@@ -128,6 +145,29 @@ def process_audio(
128
 
129
  progress(0.8, desc="Generating visualizations...")
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  # Create summary
132
  summary_lines = []
133
  summary_lines.append("# Processing Results\n")
@@ -169,16 +209,19 @@ def process_audio(
169
  duration = max(seg['end'] for seg in result['speaker_segments'])
170
  plot = create_timeline_plot(result['speaker_segments'], duration)
171
 
 
 
 
172
  progress(1.0, desc="Complete!")
173
 
174
- return summary_text, timeline_text, json_output, plot
175
 
176
  except Exception as e:
177
  error_msg = f"Error processing audio: {str(e)}\n\n"
178
  error_msg += "Make sure you have:\n"
179
  error_msg += "1. Valid HF_TOKEN environment variable\n"
180
  error_msg += "2. Accepted model conditions at https://huggingface.co/pyannote/speaker-diarization-3.1"
181
- return error_msg, "", "", None
182
 
183
 
184
  def create_demo():
@@ -203,13 +246,29 @@ def create_demo():
203
  with gr.Column(scale=1):
204
  gr.Markdown("## Input")
205
 
206
- audio_input = gr.Audio(
207
- label="Upload Audio File",
208
- type="filepath",
209
- sources=["upload"]
210
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
 
212
- with gr.Accordion("Advanced Settings", open=False):
213
  num_speakers = gr.Number(
214
  label="Number of Speakers (0 for auto-detection)",
215
  value=0,
@@ -227,6 +286,27 @@ def create_demo():
227
  step=0.05,
228
  info="Lower = more sensitive to speech"
229
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
  process_btn = gr.Button("πŸš€ Process Audio", variant="primary", size="lg")
232
 
@@ -253,6 +333,17 @@ def create_demo():
253
  language="json",
254
  lines=20
255
  )
 
 
 
 
 
 
 
 
 
 
 
256
 
257
  # Examples
258
  gr.Markdown("## πŸ“ Examples")
@@ -265,11 +356,18 @@ def create_demo():
265
  - Processing Time: Depends on audio length and hardware
266
  """)
267
 
268
- # Event handlers
269
  process_btn.click(
270
  fn=process_audio,
271
- inputs=[audio_input, num_speakers, vad_threshold],
272
- outputs=[summary_output, timeline_output, json_output, timeline_plot]
 
 
 
 
 
 
 
273
  )
274
 
275
  # Footer
 
12
  import json
13
  import os
14
  import tempfile
15
+ import soundfile as sf
16
  from typing import Optional, Tuple, List, Dict
17
+ from datetime import datetime
18
 
19
  from src.pipeline import VADDiarizationPipeline
20
  from src.utils import visualize_timeline, segment_to_rttm
 
40
  PIPELINE_READY = False
41
 
42
 
43
def apply_speaker_names(segments: List[Dict], speaker_mapping: Dict[str, str]) -> List[Dict]:
    """Apply custom speaker names to segments.

    Args:
        segments: Segment dicts; the label to rename is read from the
            ``'speaker'`` key of each one.
        speaker_mapping: Maps an existing speaker label to its custom name.
            Entries whose custom name is empty are ignored.

    Returns:
        ``segments`` itself when ``speaker_mapping`` is empty; otherwise a
        new list of shallow copies in which every mapped speaker label has
        been replaced. The input dicts are never modified.
    """
    if not speaker_mapping:
        return segments

    renamed_segments = []
    for seg in segments:
        # Shallow copy so the caller's segment dicts keep their labels.
        new_seg = seg.copy()
        # .get() on both levels: a segment without a 'speaker' key, an
        # unmapped label, or an empty custom name all leave the copy as-is
        # instead of raising KeyError.
        custom_name = speaker_mapping.get(seg.get('speaker'))
        if custom_name:
            new_seg['speaker'] = custom_name
        renamed_segments.append(new_seg)
    return renamed_segments
55
+
56
+
57
  def create_timeline_plot(segments: List[Dict], duration: float) -> plt.Figure:
58
  """Create a visual timeline plot of speaker segments."""
59
  fig, ax = plt.subplots(figsize=(12, 4))
 
108
  audio_file,
109
  num_speakers: Optional[int] = None,
110
  vad_threshold: float = 0.5,
111
+ speaker_names: str = "",
112
  progress=gr.Progress()
113
+ ) -> Tuple[str, str, str, plt.Figure, str]:
114
  """
115
  Process audio file through the pipeline.
116
 
117
  Returns:
118
+ Tuple of (summary_text, timeline_text, json_output, plot, download_path)
119
  """
120
  if audio_file is None:
121
+ return "Please upload an audio file", "", "", None, None
122
 
123
  if not PIPELINE_READY:
124
+ return "Pipeline not ready. Please set HF_TOKEN environment variable.", "", "", None, None
125
 
126
  try:
127
  progress(0.1, desc="Loading audio...")
 
145
 
146
  progress(0.8, desc="Generating visualizations...")
147
 
148
+ # Parse speaker names
149
+ speaker_mapping = {}
150
+ if speaker_names.strip():
151
+ lines = [line.strip() for line in speaker_names.strip().split('\n') if line.strip()]
152
+ for line in lines:
153
+ if ':' in line:
154
+ parts = line.split(':', 1)
155
+ speaker_id = parts[0].strip()
156
+ custom_name = parts[1].strip()
157
+ if custom_name:
158
+ speaker_mapping[speaker_id] = custom_name
159
+
160
+ # Apply custom speaker names
161
+ if speaker_mapping:
162
+ result['speaker_segments'] = apply_speaker_names(result['speaker_segments'], speaker_mapping)
163
+ # Update speaker statistics with new names
164
+ if 'speaker_statistics' in result:
165
+ new_stats = {}
166
+ for speaker, stats in result['speaker_statistics'].items():
167
+ new_name = speaker_mapping.get(speaker, speaker)
168
+ new_stats[new_name] = stats
169
+ result['speaker_statistics'] = new_stats
170
+
171
  # Create summary
172
  summary_lines = []
173
  summary_lines.append("# Processing Results\n")
 
209
  duration = max(seg['end'] for seg in result['speaker_segments'])
210
  plot = create_timeline_plot(result['speaker_segments'], duration)
211
 
212
+ # Save processed audio info for download
213
+ download_path = audio_file
214
+
215
  progress(1.0, desc="Complete!")
216
 
217
+ return summary_text, timeline_text, json_output, plot, download_path
218
 
219
  except Exception as e:
220
  error_msg = f"Error processing audio: {str(e)}\n\n"
221
  error_msg += "Make sure you have:\n"
222
  error_msg += "1. Valid HF_TOKEN environment variable\n"
223
  error_msg += "2. Accepted model conditions at https://huggingface.co/pyannote/speaker-diarization-3.1"
224
+ return error_msg, "", "", None, None
225
 
226
 
227
  def create_demo():
 
246
  with gr.Column(scale=1):
247
  gr.Markdown("## Input")
248
 
249
+ with gr.Tabs() as input_tabs:
250
+ with gr.Tab("πŸ“ Upload File"):
251
+ audio_input = gr.Audio(
252
+ label="Upload Audio File",
253
+ type="filepath",
254
+ sources=["upload"]
255
+ )
256
+
257
+ with gr.Tab("🎀 Record Live"):
258
+ audio_record = gr.Audio(
259
+ label="Record Audio",
260
+ type="filepath",
261
+ sources=["microphone"]
262
+ )
263
+ gr.Markdown("""
264
+ **Tips for recording:**
265
+ - Click the microphone icon to start recording
266
+ - Speak clearly and avoid background noise
267
+ - Click stop when finished
268
+ - The recording will be automatically processed
269
+ """)
270
 
271
+ with gr.Accordion("βš™οΈ Advanced Settings", open=False):
272
  num_speakers = gr.Number(
273
  label="Number of Speakers (0 for auto-detection)",
274
  value=0,
 
286
  step=0.05,
287
  info="Lower = more sensitive to speech"
288
  )
289
+
290
+ gr.Markdown("### πŸ‘₯ Custom Speaker Names")
291
+ gr.Markdown("""
292
+ Enter custom names for speakers (one per line):
293
+
294
+ Format: `SPEAKER_00: John Doe`
295
+
296
+ Example:
297
+ ```
298
+ SPEAKER_00: Alice
299
+ SPEAKER_01: Bob
300
+ SPEAKER_02: Charlie
301
+ ```
302
+ """)
303
+
304
+ speaker_names = gr.Textbox(
305
+ label="Speaker Name Mapping",
306
+ placeholder="SPEAKER_00: Alice\nSPEAKER_01: Bob",
307
+ lines=5,
308
+ info="Leave empty to use default speaker labels"
309
+ )
310
 
311
  process_btn = gr.Button("πŸš€ Process Audio", variant="primary", size="lg")
312
 
 
333
  language="json",
334
  lines=20
335
  )
336
+
337
+ with gr.Tab("πŸ“₯ Download"):
338
+ gr.Markdown("### Download Processed Audio")
339
+ download_audio = gr.File(
340
+ label="Download Audio File",
341
+ interactive=False
342
+ )
343
+ gr.Markdown("""
344
+ The original audio file is available for download here.
345
+ You can use it with the JSON results for further processing.
346
+ """)
347
 
348
  # Examples
349
  gr.Markdown("## πŸ“ Examples")
 
356
  - Processing Time: Depends on audio length and hardware
357
  """)
358
 
359
+ # Event handlers for file upload
360
  process_btn.click(
361
  fn=process_audio,
362
+ inputs=[audio_input, num_speakers, vad_threshold, speaker_names],
363
+ outputs=[summary_output, timeline_output, json_output, timeline_plot, download_audio]
364
+ )
365
+
366
+ # Event handler for live recording
367
+ audio_record.stop_recording(
368
+ fn=process_audio,
369
+ inputs=[audio_record, num_speakers, vad_threshold, speaker_names],
370
+ outputs=[summary_output, timeline_output, json_output, timeline_plot, download_audio]
371
  )
372
 
373
  # Footer