Spaces:

Manyue-DataScientist
/

speaker-diarization-app

Sleeping

App Files Files Community

Manyue-DataScientist committed on Jan 12, 2025

Commit

67e41d5

verified ·

1 Parent(s): d6e0f11

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -22

app.py CHANGED Viewed

@@ -11,9 +11,8 @@ import io
 @st.cache_resource
 def load_models():
    try:
-       # Back to original model name
        diarization = Pipeline.from_pretrained(
-           "pyannote/speaker-diarization",  # Original model name
            use_auth_token=st.secrets["hf_token"]
        )
@@ -25,7 +24,6 @@ def load_models():
            device=0 if torch.cuda.is_available() else -1
        )
-       # Validate models loaded correctly
        if not diarization or not transcriber or not summarizer:
            raise ValueError("One or more models failed to load")
@@ -46,7 +44,6 @@ def process_audio(audio_file, max_duration=600):
                else:
                    audio = AudioSegment.from_wav(audio_bytes)
-               # Standardize format
                audio = audio.set_frame_rate(16000)
                audio = audio.set_channels(1)
                audio = audio.set_sample_width(2)
@@ -87,23 +84,46 @@ def process_audio(audio_file, max_duration=600):
        st.error(f"Error processing audio: {str(e)}")
        return None
-def format_speaker_segments(diarization_result):
-   if diarization_result is None:
        return []
    formatted_segments = []
    try:
        for turn, _, speaker in diarization_result.itertracks(yield_label=True):
-           formatted_segments.append({
-               'speaker': str(speaker),  # Ensure string
-               'start': float(turn.start) if turn.start is not None else 0.0,
-               'end': float(turn.end) if turn.end is not None else 0.0
-           })
    except Exception as e:
        st.error(f"Error formatting segments: {str(e)}")
        return []
-   return formatted_segments
 def format_timestamp(seconds):
    minutes = int(seconds // 60)
@@ -133,25 +153,25 @@ def main():
                    with tab1:
                        st.write("Speaker Timeline:")
-                       segments = format_speaker_segments(results["diarization"])
-                       if segments:  # Only proceed if we have segments
                            for segment in segments:
-                               col1, col2 = st.columns([2,8])
                                with col1:
-                                   try:
-                                       speaker_num = int(segment['speaker'].split('_')[1])
-                                       colors = ['🔵', '🔴']  # Two colors for alternating speakers
-                                       speaker_color = colors[speaker_num % len(colors)]
-                                       st.write(f"{speaker_color} {segment['speaker']}")
-                                   except (IndexError, ValueError) as e:
-                                       st.write(f"⚪ {segment['speaker']}")
                                with col2:
                                    start_time = format_timestamp(segment['start'])
                                    end_time = format_timestamp(segment['end'])
                                    st.write(f"{start_time} → {end_time}")
                                st.markdown("---")
                        else:

 @st.cache_resource
 def load_models():
    try:
        diarization = Pipeline.from_pretrained(
+           "pyannote/speaker-diarization",
            use_auth_token=st.secrets["hf_token"]
        )
            device=0 if torch.cuda.is_available() else -1
        )
        if not diarization or not transcriber or not summarizer:
            raise ValueError("One or more models failed to load")
                else:
                    audio = AudioSegment.from_wav(audio_bytes)
                audio = audio.set_frame_rate(16000)
                audio = audio.set_channels(1)
                audio = audio.set_sample_width(2)
        st.error(f"Error processing audio: {str(e)}")
        return None
def format_speaker_segments(diarization_result, transcription):
    """Pair each diarization turn with the transcribed text that starts in it.

    Args:
        diarization_result: Object exposing ``itertracks(yield_label=True)``
            that yields ``(turn, _, speaker)`` triples, where ``turn`` has
            ``start`` and ``end`` attributes (seconds).
        transcription: Mapping with an optional ``'segments'`` list; each
            entry is read for its ``'start'`` and ``'text'`` keys.

    Returns:
        A list of dicts with keys ``'speaker'``, ``'start'``, ``'end'`` and
        ``'text'``, sorted by start time. Turns with no text are omitted, and
        a turn that begins before the previously kept turn has ended is
        dropped. Returns ``[]`` when either input is ``None`` or when
        formatting fails (the error is reported through ``st.error``).
    """
    if diarization_result is None or transcription is None:
        return []

    formatted_segments = []
    try:
        # Read inside the try so a malformed transcription is reported the
        # same way as any other formatting failure.
        whisper_segments = transcription.get('segments') or []

        for turn, _, speaker in diarization_result.itertracks(yield_label=True):
            turn_start = float(turn.start)
            turn_end = float(turn.end)

            # Half-open window [start, end): a transcript segment that starts
            # exactly on the boundary between two turns belongs to the later
            # turn only, instead of being duplicated into both.
            pieces = [
                ws['text'].strip()
                for ws in whisper_segments
                if turn_start <= float(ws['start']) < turn_end
            ]
            # Strip each piece before joining so pieces that carry their own
            # leading space do not produce doubled spaces.
            segment_text = " ".join(piece for piece in pieces if piece)

            # Only keep turns that actually have text.
            if segment_text:
                formatted_segments.append({
                    'speaker': str(speaker),
                    'start': turn_start,
                    'end': turn_end,
                    'text': segment_text,
                })
    except Exception as e:
        st.error(f"Error formatting segments: {str(e)}")
        return []

    # Sort by start time, then skip any turn overlapping the last kept one.
    formatted_segments.sort(key=lambda x: x['start'])
    cleaned_segments = []
    for segment in formatted_segments:
        if cleaned_segments and segment['start'] < cleaned_segments[-1]['end']:
            continue
        cleaned_segments.append(segment)
    return cleaned_segments
 def format_timestamp(seconds):
    minutes = int(seconds // 60)
                    with tab1:
                        st.write("Speaker Timeline:")
+                       segments = format_speaker_segments(results["diarization"], results["transcription"])
+                       if segments:
                            for segment in segments:
+                               col1, col2, col3 = st.columns([2,3,5])
                                with col1:
+                                   speaker_num = int(segment['speaker'].split('_')[1])
+                                   colors = ['🔵', '🔴']
+                                   speaker_color = colors[speaker_num % len(colors)]
+                                   st.write(f"{speaker_color} {segment['speaker']}")
                                with col2:
                                    start_time = format_timestamp(segment['start'])
                                    end_time = format_timestamp(segment['end'])
                                    st.write(f"{start_time} → {end_time}")
+                               with col3:
+                                   st.write(f"\"{segment['text']}\"")
                                st.markdown("---")
                        else: