Spaces:

saadmannan
/

VAD-speakerDiarization

Sleeping

App Files Files Community

saadmannan committed on Nov 12, 2025

Commit

0f00a09

1 Parent(s): 480869c

new feature added

Browse files

Files changed (1) hide show

app.py +113 -15

app.py CHANGED Viewed

@@ -12,7 +12,9 @@ from pathlib import Path
 import json
 import os
 import tempfile
 from typing import Optional, Tuple, List, Dict
 from src.pipeline import VADDiarizationPipeline
 from src.utils import visualize_timeline, segment_to_rttm
@@ -38,6 +40,20 @@ except Exception as e:
     PIPELINE_READY = False
 def create_timeline_plot(segments: List[Dict], duration: float) -> plt.Figure:
     """Create a visual timeline plot of speaker segments."""
     fig, ax = plt.subplots(figsize=(12, 4))
@@ -92,19 +108,20 @@ def process_audio(
     audio_file,
     num_speakers: Optional[int] = None,
     vad_threshold: float = 0.5,
     progress=gr.Progress()
-) -> Tuple[str, str, str, plt.Figure]:
     """
     Process audio file through the pipeline.
     Returns:
-        Tuple of (summary_text, timeline_text, json_output, plot)
     """
     if audio_file is None:
-        return "Please upload an audio file", "", "", None
     if not PIPELINE_READY:
-        return "Pipeline not ready. Please set HF_TOKEN environment variable.", "", "", None
     try:
         progress(0.1, desc="Loading audio...")
@@ -128,6 +145,29 @@ def process_audio(
         progress(0.8, desc="Generating visualizations...")
         # Create summary
         summary_lines = []
         summary_lines.append("# Processing Results\n")
@@ -169,16 +209,19 @@ def process_audio(
         duration = max(seg['end'] for seg in result['speaker_segments'])
         plot = create_timeline_plot(result['speaker_segments'], duration)
         progress(1.0, desc="Complete!")
-        return summary_text, timeline_text, json_output, plot
     except Exception as e:
         error_msg = f"Error processing audio: {str(e)}\n\n"
         error_msg += "Make sure you have:\n"
         error_msg += "1. Valid HF_TOKEN environment variable\n"
         error_msg += "2. Accepted model conditions at https://huggingface.co/pyannote/speaker-diarization-3.1"
-        return error_msg, "", "", None
 def create_demo():
@@ -203,13 +246,29 @@ def create_demo():
             with gr.Column(scale=1):
                 gr.Markdown("## Input")
-                audio_input = gr.Audio(
-                    label="Upload Audio File",
-                    type="filepath",
-                    sources=["upload"]
-                )
-                with gr.Accordion("Advanced Settings", open=False):
                     num_speakers = gr.Number(
                         label="Number of Speakers (0 for auto-detection)",
                         value=0,
@@ -227,6 +286,27 @@ def create_demo():
                         step=0.05,
                         info="Lower = more sensitive to speech"
                     )
                 process_btn = gr.Button("🚀 Process Audio", variant="primary", size="lg")
@@ -253,6 +333,17 @@ def create_demo():
                         language="json",
                         lines=20
                     )
         # Examples
         gr.Markdown("## 📝 Examples")
@@ -265,11 +356,18 @@ def create_demo():
         - Processing Time: Depends on audio length and hardware
         """)
-        # Event handlers
         process_btn.click(
             fn=process_audio,
-            inputs=[audio_input, num_speakers, vad_threshold],
-            outputs=[summary_output, timeline_output, json_output, timeline_plot]
         )
         # Footer

 import json
 import os
 import tempfile
+import soundfile as sf
 from typing import Optional, Tuple, List, Dict
+from datetime import datetime
 from src.pipeline import VADDiarizationPipeline
 from src.utils import visualize_timeline, segment_to_rttm
     PIPELINE_READY = False
def apply_speaker_names(segments: List[Dict], speaker_mapping: Dict[str, str]) -> List[Dict]:
    """Return copies of ``segments`` with speaker labels swapped for custom names.

    Args:
        segments: Segment dicts. The value under ``'speaker'`` (when present)
            is the label looked up in ``speaker_mapping``.
        speaker_mapping: Maps an original speaker label to its replacement
            name. Labels that are absent from the mapping, or mapped to an
            empty name, are left unchanged. May be empty or ``None``.

    Returns:
        A new list of shallow-copied segment dicts. The input list and its
        dicts are never mutated and never returned directly, so callers can
        modify the result safely regardless of whether a mapping was given.
    """
    # Normalise a missing mapping so the loop below has a single code path.
    mapping = speaker_mapping or {}
    renamed_segments = []
    for seg in segments:
        new_seg = seg.copy()
        # .get() on both sides: segments without a 'speaker' key and labels
        # without a mapping entry simply pass through untouched.
        custom_name = mapping.get(seg.get('speaker'))
        if custom_name:
            new_seg['speaker'] = custom_name
        renamed_segments.append(new_seg)
    return renamed_segments
 def create_timeline_plot(segments: List[Dict], duration: float) -> plt.Figure:
     """Create a visual timeline plot of speaker segments."""
     fig, ax = plt.subplots(figsize=(12, 4))
     audio_file,
     num_speakers: Optional[int] = None,
     vad_threshold: float = 0.5,
+    speaker_names: str = "",
     progress=gr.Progress()
+) -> Tuple[str, str, str, plt.Figure, str]:
     """
     Process audio file through the pipeline.
     Returns:
+        Tuple of (summary_text, timeline_text, json_output, plot, download_path)
     """
     if audio_file is None:
+        return "Please upload an audio file", "", "", None, None
     if not PIPELINE_READY:
+        return "Pipeline not ready. Please set HF_TOKEN environment variable.", "", "", None, None
     try:
         progress(0.1, desc="Loading audio...")
         progress(0.8, desc="Generating visualizations...")
+        # Parse speaker names
+        speaker_mapping = {}
+        if speaker_names.strip():
+            lines = [line.strip() for line in speaker_names.strip().split('\n') if line.strip()]
+            for line in lines:
+                if ':' in line:
+                    parts = line.split(':', 1)
+                    speaker_id = parts[0].strip()
+                    custom_name = parts[1].strip()
+                    if custom_name:
+                        speaker_mapping[speaker_id] = custom_name
+        # Apply custom speaker names
+        if speaker_mapping:
+            result['speaker_segments'] = apply_speaker_names(result['speaker_segments'], speaker_mapping)
+            # Update speaker statistics with new names
+            if 'speaker_statistics' in result:
+                new_stats = {}
+                for speaker, stats in result['speaker_statistics'].items():
+                    new_name = speaker_mapping.get(speaker, speaker)
+                    new_stats[new_name] = stats
+                result['speaker_statistics'] = new_stats
         # Create summary
         summary_lines = []
         summary_lines.append("# Processing Results\n")
         duration = max(seg['end'] for seg in result['speaker_segments'])
         plot = create_timeline_plot(result['speaker_segments'], duration)
+        # Save processed audio info for download
+        download_path = audio_file
         progress(1.0, desc="Complete!")
+        return summary_text, timeline_text, json_output, plot, download_path
     except Exception as e:
         error_msg = f"Error processing audio: {str(e)}\n\n"
         error_msg += "Make sure you have:\n"
         error_msg += "1. Valid HF_TOKEN environment variable\n"
         error_msg += "2. Accepted model conditions at https://huggingface.co/pyannote/speaker-diarization-3.1"
+        return error_msg, "", "", None, None
 def create_demo():
             with gr.Column(scale=1):
                 gr.Markdown("## Input")
+                with gr.Tabs() as input_tabs:
+                    with gr.Tab("📁 Upload File"):
+                        audio_input = gr.Audio(
+                            label="Upload Audio File",
+                            type="filepath",
+                            sources=["upload"]
+                        )
+                    with gr.Tab("🎤 Record Live"):
+                        audio_record = gr.Audio(
+                            label="Record Audio",
+                            type="filepath",
+                            sources=["microphone"]
+                        )
+                        gr.Markdown("""
+                        **Tips for recording:**
+                        - Click the microphone icon to start recording
+                        - Speak clearly and avoid background noise
+                        - Click stop when finished
+                        - The recording will be automatically processed
+                        """)
+                with gr.Accordion("⚙️ Advanced Settings", open=False):
                     num_speakers = gr.Number(
                         label="Number of Speakers (0 for auto-detection)",
                         value=0,
                         step=0.05,
                         info="Lower = more sensitive to speech"
                     )
+                    gr.Markdown("### 👥 Custom Speaker Names")
+                    gr.Markdown("""
+                    Enter custom names for speakers (one per line):
+                    Format: `SPEAKER_00: John Doe`
+                    Example:
+                    ```
+                    SPEAKER_00: Alice
+                    SPEAKER_01: Bob
+                    SPEAKER_02: Charlie
+                    ```
+                    """)
+                    speaker_names = gr.Textbox(
+                        label="Speaker Name Mapping",
+                        placeholder="SPEAKER_00: Alice\nSPEAKER_01: Bob",
+                        lines=5,
+                        info="Leave empty to use default speaker labels"
+                    )
                 process_btn = gr.Button("🚀 Process Audio", variant="primary", size="lg")
                         language="json",
                         lines=20
                     )
+                with gr.Tab("📥 Download"):
+                    gr.Markdown("### Download Processed Audio")
+                    download_audio = gr.File(
+                        label="Download Audio File",
+                        interactive=False
+                    )
+                    gr.Markdown("""
+                    The original audio file is available for download here.
+                    You can use it with the JSON results for further processing.
+                    """)
         # Examples
         gr.Markdown("## 📝 Examples")
         - Processing Time: Depends on audio length and hardware
         """)
+        # Event handlers for file upload
         process_btn.click(
             fn=process_audio,
+            inputs=[audio_input, num_speakers, vad_threshold, speaker_names],
+            outputs=[summary_output, timeline_output, json_output, timeline_plot, download_audio]
+        )
+        # Event handler for live recording
+        audio_record.stop_recording(
+            fn=process_audio,
+            inputs=[audio_record, num_speakers, vad_threshold, speaker_names],
+            outputs=[summary_output, timeline_output, json_output, timeline_plot, download_audio]
         )
         # Footer