Commit 6df750f
Parent(s): a8b6268

Add Whisper transcription to speaker diarization

Files changed:
- app.py (+65 -26)
- requirements.txt (+3 -1)
app.py
CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 from pyannote.audio import Pipeline
 import torch
-import
+import whisper
 from huggingface_hub import login
 import os
 import traceback
@@ -10,38 +10,50 @@ import traceback
 hf_token = os.environ.get("HF_TOKEN")
 if not hf_token:
     print("WARNING: HF_TOKEN environment variable not found. Please set it in the Space settings.")
-
+    diarization_pipeline = None
 else:
     try:
         login(token=hf_token)
         print("Successfully logged in to Hugging Face")

-        # Initialize the pipeline
+        # Initialize the diarization pipeline
         print("Loading pyannote/speaker-diarization-3.1 pipeline...")
-
+        diarization_pipeline = Pipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
             use_auth_token=hf_token
         )
-        print("
+        print("Diarization pipeline loaded successfully!")

         # Send pipeline to GPU if available
         if torch.cuda.is_available():
             print("GPU detected, moving pipeline to GPU")
-
+            diarization_pipeline.to(torch.device("cuda"))
         else:
             print("No GPU detected, using CPU")

     except Exception as e:
-        print(f"Error loading pipeline: {e}")
+        print(f"Error loading diarization pipeline: {e}")
         print(f"Error type: {type(e).__name__}")
         print("Traceback:")
         traceback.print_exc()
-
+        diarization_pipeline = None

-
-
-
-
+# Load Whisper model
+try:
+    print("Loading Whisper model...")
+    whisper_model = whisper.load_model("base")
+    print("Whisper model loaded successfully!")
+except Exception as e:
+    print(f"Error loading Whisper model: {e}")
+    whisper_model = None
+
+def transcribe_with_diarization(audio_file):
+    """Process audio file for both diarization and transcription"""
+    if diarization_pipeline is None:
+        return "❌ Diarization pipeline not loaded. Please ensure HF_TOKEN is set and you have access to pyannote/speaker-diarization-3.1."
+
+    if whisper_model is None:
+        return "❌ Whisper model not loaded."

     if audio_file is None:
         return "Please upload an audio file."
@@ -49,21 +61,48 @@ def diarize_audio(audio_file):
     try:
         print(f"Processing audio file: {audio_file}")

-        #
-
+        # Step 1: Transcribe with Whisper
+        print("Transcribing audio with Whisper...")
+        transcription_result = whisper_model.transcribe(audio_file, language="pt")
+        segments = transcription_result["segments"]
+        print(f"Transcription complete. Found {len(segments)} segments")
+
+        # Step 2: Diarize with pyannote
+        print("Performing speaker diarization...")
+        diarization = diarization_pipeline(audio_file)
+        print("Diarization complete")

-        #
+        # Step 3: Match transcription segments with speaker labels
         results = []
-
-
-
-
+
+        for segment in segments:
+            start_time = segment['start']
+            end_time = segment['end']
+            text = segment['text'].strip()
+
+            # Find the speaker at this timestamp
+            speaker = None
+            for turn, _, label in diarization.itertracks(yield_label=True):
+                # Check if this segment overlaps with the speaker turn
+                if turn.start <= start_time <= turn.end or turn.start <= end_time <= turn.end:
+                    speaker = label
+                    break
+
+            if speaker:
+                results.append(f"[{speaker}] ({start_time:.1f}s - {end_time:.1f}s): {text}")
+            else:
+                results.append(f"[Unknown] ({start_time:.1f}s - {end_time:.1f}s): {text}")

         if not results:
-            return "No
+            return "No transcription available."
+
+        # Add summary
+        speakers = set()
+        for turn, _, speaker in diarization.itertracks(yield_label=True):
+            speakers.add(speaker)

-
-        return "\n".join(results)
+        summary = f"Found {len(speakers)} speakers in the conversation.\n\n"
+        return summary + "\n".join(results)

     except Exception as e:
         error_msg = f"Error processing audio: {str(e)}"
@@ -73,11 +112,11 @@ def diarize_audio(audio_file):

 # Create Gradio interface
 demo = gr.Interface(
-    fn=
+    fn=transcribe_with_diarization,
     inputs=gr.Audio(type="filepath", label="Upload Audio File"),
-    outputs=gr.Textbox(label="
-    title="Speaker Diarization
-    description="Upload an audio file to identify different speakers and
+    outputs=gr.Textbox(label="Transcription with Speaker Identification", lines=20),
+    title="Speaker Diarization + Transcription",
+    description="Upload an audio file to identify different speakers and transcribe what they said. Uses pyannote for speaker identification and Whisper for transcription.",
     examples=[],
     cache_examples=False
 )
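The matching step in this diff assigns each Whisper segment to the first diarization turn whose span contains the segment's start or end time. Below is a minimal sketch of that logic in isolation, runnable with pyannote.core alone; the hand-built Annotation and the fake segment dicts are illustrative stand-ins for what the pipeline and whisper_model.transcribe() actually return.

# Toy reproduction of the segment-to-speaker matching above.
# The Annotation is built by hand here; app.py gets it from the pyannote pipeline.
from pyannote.core import Annotation, Segment

diarization = Annotation()
diarization[Segment(0.0, 4.0)] = "SPEAKER_00"
diarization[Segment(4.0, 9.0)] = "SPEAKER_01"

# Fake Whisper segments, using the same keys as transcribe(...)["segments"]
segments = [
    {"start": 0.5, "end": 3.5, "text": " Hello there."},
    {"start": 4.2, "end": 8.0, "text": " Hi, how are you?"},
]

for segment in segments:
    start_time, end_time = segment["start"], segment["end"]
    speaker = None
    for turn, _, label in diarization.itertracks(yield_label=True):
        # Same overlap test as app.py: does the turn contain either endpoint?
        if turn.start <= start_time <= turn.end or turn.start <= end_time <= turn.end:
            speaker = label
            break
    print(f"[{speaker or 'Unknown'}] ({start_time:.1f}s - {end_time:.1f}s): {segment['text'].strip()}")

This prints one SPEAKER_00 line and one SPEAKER_01 line. First-overlap matching keeps the code short, but a segment that straddles a speaker change gets the earlier label; choosing the turn with the largest overlap duration is a common refinement. Note also that the transcribe call pins language="pt" (Portuguese); omitting the argument lets Whisper auto-detect the language.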
requirements.txt
CHANGED
@@ -3,4 +3,6 @@ torch>=2.0.0
 torchaudio>=2.0.0
 gradio>=4.0.0
 huggingface_hub
-speechbrain>=0.5.16
+speechbrain>=0.5.16
+openai-whisper
+ffmpeg-python
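One packaging note: openai-whisper decodes audio by shelling out to the ffmpeg command-line binary, and the ffmpeg-python package added here is only a Python binding; it does not install that binary. On a Hugging Face Space the binary typically comes from listing ffmpeg in a packages.txt file. A small startup check along these lines (illustrative, not part of the commit) makes the failure mode explicit:

# Illustrative startup check, not part of this commit: Whisper invokes the
# ffmpeg binary via subprocess, which pip's ffmpeg-python does not provide.
import shutil

if shutil.which("ffmpeg") is None:
    print("WARNING: ffmpeg binary not found; Whisper will fail to load audio files.")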