Spaces:

msmaje
/

meeting-summariser

Sleeping

App Files Files Community

msmaje committed on Jul 9, 2025

Commit

55a9df3

verified ·

1 Parent(s): 3e0afc5

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -69

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import tempfile
 import time
 import torch
 from pydub import AudioSegment
-import whisperx # Using whisperx for integrated transcription and diarization
 import warnings
 import requests # For Codestral API calls
@@ -17,41 +17,24 @@ print(f"Using device: {device} with compute_type: {compute_type}")
 # Global variables for models
 whisper_model = None
-diarization_model = None
-# We'll load the whisperx model once
-def load_whisperx_models():
-    """Load WhisperX transcription and diarization models."""
-    global whisper_model, diarization_model
     if whisper_model is None:
         try:
             print("Loading WhisperX transcription model...")
-            # Pass local_files_only=False to allow downloading if not cached
             whisper_model = whisperx.load_model(
                 "base",
                 device=device,
-                compute_type=compute_type,
                 local_files_only=False
             )
             print("WhisperX transcription model loaded successfully!")
-            print("Loading WhisperX diarization model (from pyannote)...")
-            # Get HuggingFace token from environment
-            hf_token = os.environ.get("HF_TOKEN")
-            if not hf_token:
-                raise ValueError("HF_TOKEN environment variable not found. This is required for pyannote diarization models.")
-            # Fix: Pass cache_dir parameter to force downloading
-            diarization_model = whisperx.DiarizationPipeline(
-                use_auth_token=hf_token,
-                device=device
-            )
-            print("WhisperX diarization model loaded successfully!")
         except Exception as e:
-            print(f"Error loading WhisperX models: {e}")
             raise e
-    return whisper_model, diarization_model
 def convert_audio(input_file):
     """Convert uploaded audio to WAV format"""
@@ -64,7 +47,7 @@ def convert_audio(input_file):
         # Convert to WAV using pydub
         audio = AudioSegment.from_file(input_file)
-        # Ensure mono channel and reasonable sample rate for Whisper/pyannote
         audio = audio.set_channels(1).set_frame_rate(16000)
         audio.export(wav_path, format="wav")
         return wav_path
@@ -72,7 +55,7 @@ def convert_audio(input_file):
         return f"Error converting audio: {str(e)}"
 def process_audio(audio_file, progress=gr.Progress()):
-    """Process the audio file: transcribe and diarize using whisperx"""
     if not audio_file:
         return "❌ Please upload an audio file", None
@@ -85,67 +68,49 @@ def process_audio(audio_file, progress=gr.Progress()):
         if isinstance(wav_path, str) and wav_path.startswith("Error"):
             return wav_path, None
-        progress(0.2, desc="Loading AI models (WhisperX & Diarization)...")
-        # 2. Load WhisperX models
         try:
-            model_a, model_b = load_whisperx_models() # model_a is whisper_model, model_b is diarization_model
             # Audio needs to be loaded separately for whisperx
             audio = whisperx.load_audio(wav_path)
         except Exception as e:
             error_msg = str(e)
-            if "authentication" in error_msg.lower() or "token" in error_msg.lower():
-                return "❌ Authentication Error: Please ensure your HuggingFace token is set correctly in the environment variables and has access to pyannote models. Visit https://huggingface.co/pyannote/speaker-diarization-3.1 to accept the user conditions first.", None
-            return f"❌ Error loading AI models: {error_msg}", None
-        progress(0.5, desc="Transcribing audio...")
         # 3. Transcribe audio with WhisperX
         try:
             # Transcribe with batch processing
-            result = model_a.transcribe(audio, batch_size=16) # Adjust batch_size based on VRAM
             # Check if we have valid transcription results
             if not result or "segments" not in result:
                 return "❌ No transcription results obtained from the audio", None
-            progress(0.7, desc="Performing speaker diarization...")
-            # Align the transcription for better diarization
-            model_a_align, metadata = whisperx.load_align_model(
-                language_code=result["language"],
-                device=device
-            )
-            result = whisperx.align(result["segments"], model_a_align, metadata, audio, device, return_char_alignments=False)
-            # Diarize audio
-            diarize_segments = model_b(audio)
-            # Assign speakers to segments
-            result = whisperx.assign_word_speakers(diarize_segments, result)
         except Exception as e:
             error_msg = str(e)
             if "CUDA" in error_msg or "GPU" in error_msg:
                 return f"❌ GPU Error: {error_msg}. Try using CPU mode or check your CUDA installation.", None
-            return f"❌ Error during transcription or diarization: {error_msg}", None
         progress(0.9, desc="Formatting transcript...")
-        # 4. Format transcription with speaker labels
         combined_output = []
         if result and "segments" in result:
             for segment in result["segments"]:
                 start_time = segment.get("start", 0)
                 end_time = segment.get("end", 0)
-                speaker = segment.get("speaker", "UNKNOWN") # Speaker ID from diarization
                 text = segment.get("text", "").strip()
                 if not text:
                     continue
-                combined_output.append(f"🗣️ Speaker {speaker} 🕐 [{start_time:.1f}s - {end_time:.1f}s]: {text}")
         # Create final output
         combined_text = "\n\n".join(combined_output)
@@ -155,7 +120,7 @@ def process_audio(audio_file, progress=gr.Progress()):
         progress(1.0, desc="Complete!")
-        return f"✅ **Processing Complete!**\n\n{combined_text}", combined_text
     except Exception as e:
         return f"❌ Unexpected error: {str(e)}", None
@@ -173,10 +138,10 @@ def summarize_meeting(transcript_text, model_choice, progress=gr.Progress()):
         return "❌ No valid transcript available to summarize"
     # Retrieve Codestral API key from environment variable
-    codestral_api_key = os.environ.get("CODESTRAL_API_KEY") or os.environ.get("HF_TOKEN")
     if not codestral_api_key:
-        return "❌ Codestral API Key not found. Please set CODESTRAL_API_KEY or HF_TOKEN in environment variables."
     # Update progress directly within the function
     progress(0.1, desc=f"Sending transcript to Codestral ({model_choice})...")
@@ -209,7 +174,7 @@ Transcript:
             {"role": "user", "content": prompt}
         ],
         "temperature": 0.7,
-        "max_tokens": 1000 # Increased for better summaries
     }
     try:
@@ -234,7 +199,7 @@ Transcript:
 def process_and_summarize(audio_file, model_choice, progress=gr.Progress()):
     """Combined function to process audio and generate summary"""
     # Initialize overall progress.
-    progress(0.0, desc="Starting audio processing (transcription & diarization)...")
     # Process audio (takes 0-50% of overall progress)
     transcript, clean_transcript = process_audio(audio_file, progress)
@@ -245,10 +210,6 @@ def process_and_summarize(audio_file, model_choice, progress=gr.Progress()):
     # Transition to summarization (50-100% of overall progress)
     progress(0.5, desc="Starting summarization...")
-    # Create a sub-progress for summarization
-    def summary_progress(val, desc):
-        progress(0.5 + (val * 0.5), desc)
     # Create a wrapper progress object
     class SummaryProgress:
         def __call__(self, val, desc):
@@ -431,8 +392,8 @@ with gr.Blocks(
             with gr.Tabs():
                 with gr.TabItem("📝 Transcript", elem_id="transcript-tab"):
                     transcript_output = gr.TextArea(
-                        label="Meeting Transcript (with Speaker Diarization)",
-                        placeholder="Your detailed transcript with speaker labels will appear here...",
                         lines=20,
                         max_lines=30,
                         elem_classes="output-text",
@@ -457,9 +418,7 @@ with gr.Blocks(
             1. **📁 Upload Audio**: Supports MP3, WAV, OGG, M4A, and most common audio formats.
             2. **🔑 Setup Required**:
-               - **HF_TOKEN**: Required for pyannote diarization models. Get it from https://huggingface.co/settings/tokens
                - **CODESTRAL_API_KEY**: Required for summarization. Get it from Mistral AI
-               - Visit https://huggingface.co/pyannote/speaker-diarization-3.1 and accept user conditions
             3. **🚀 Process**: Click the button and wait for the magic to happen!
             ### 🎵 **Audio Requirements**
@@ -469,18 +428,24 @@ with gr.Blocks(
             - **Language**: Optimized for English conversations.
             ### ⚡ **Features**
-            - **High-Quality Transcription**: Powered by OpenAI Whisper.
-            - **Accurate Speaker Diarization**: Identifies different speakers using pyannote.
             - **Intelligent Summarization**: Powered by Codestral API.
             ### 🔧 **Troubleshooting**
-            - **Authentication Error**: Ensure HF_TOKEN is set and you've accepted pyannote user conditions
             - **GPU Issues**: The app will automatically fallback to CPU if GPU is not available
             - **Audio Format**: If upload fails, try converting to WAV format first
             ### 🔒 **Privacy & Security**
             - Your audio files are processed temporarily and not stored.
             - API keys are used securely from environment variables.
             """)
     # Footer

 import time
 import torch
 from pydub import AudioSegment
+import whisperx # Using whisperx for transcription only
 import warnings
 import requests # For Codestral API calls
 # Global variables for models
 whisper_model = None
+def load_whisperx_model():
+    """Load WhisperX transcription model only."""
+    global whisper_model
     if whisper_model is None:
         try:
             print("Loading WhisperX transcription model...")
             whisper_model = whisperx.load_model(
                 "base",
                 device=device,
+                compute_type=compute_type,
                 local_files_only=False
             )
             print("WhisperX transcription model loaded successfully!")
         except Exception as e:
+            print(f"Error loading WhisperX model: {e}")
             raise e
+    return whisper_model
 def convert_audio(input_file):
     """Convert uploaded audio to WAV format"""
         # Convert to WAV using pydub
         audio = AudioSegment.from_file(input_file)
+        # Ensure mono channel and reasonable sample rate for Whisper
         audio = audio.set_channels(1).set_frame_rate(16000)
         audio.export(wav_path, format="wav")
         return wav_path
         return f"Error converting audio: {str(e)}"
 def process_audio(audio_file, progress=gr.Progress()):
+    """Process the audio file: transcribe using whisperx"""
     if not audio_file:
         return "❌ Please upload an audio file", None
         if isinstance(wav_path, str) and wav_path.startswith("Error"):
             return wav_path, None
+        progress(0.3, desc="Loading AI transcription model...")
+        # 2. Load WhisperX model
         try:
+            model = load_whisperx_model()
             # Audio needs to be loaded separately for whisperx
             audio = whisperx.load_audio(wav_path)
         except Exception as e:
             error_msg = str(e)
+            return f"❌ Error loading AI model: {error_msg}", None
+        progress(0.6, desc="Transcribing audio...")
         # 3. Transcribe audio with WhisperX
         try:
             # Transcribe with batch processing
+            result = model.transcribe(audio, batch_size=16)
             # Check if we have valid transcription results
             if not result or "segments" not in result:
                 return "❌ No transcription results obtained from the audio", None
         except Exception as e:
             error_msg = str(e)
             if "CUDA" in error_msg or "GPU" in error_msg:
                 return f"❌ GPU Error: {error_msg}. Try using CPU mode or check your CUDA installation.", None
+            return f"❌ Error during transcription: {error_msg}", None
         progress(0.9, desc="Formatting transcript...")
+        # 4. Format transcription without speaker labels
         combined_output = []
         if result and "segments" in result:
             for segment in result["segments"]:
                 start_time = segment.get("start", 0)
                 end_time = segment.get("end", 0)
                 text = segment.get("text", "").strip()
                 if not text:
                     continue
+                combined_output.append(f"🕐 [{start_time:.1f}s - {end_time:.1f}s]: {text}")
         # Create final output
         combined_text = "\n\n".join(combined_output)
         progress(1.0, desc="Complete!")
+        return f"✅ **Transcription Complete!**\n\n{combined_text}", combined_text
     except Exception as e:
         return f"❌ Unexpected error: {str(e)}", None
         return "❌ No valid transcript available to summarize"
     # Retrieve Codestral API key from environment variable
+    codestral_api_key = os.environ.get("CODESTRAL_API_KEY")
     if not codestral_api_key:
+        return "❌ Codestral API Key not found. Please set CODESTRAL_API_KEY in environment variables."
     # Update progress directly within the function
     progress(0.1, desc=f"Sending transcript to Codestral ({model_choice})...")
             {"role": "user", "content": prompt}
         ],
         "temperature": 0.7,
+        "max_tokens": 1000
     }
     try:
 def process_and_summarize(audio_file, model_choice, progress=gr.Progress()):
     """Combined function to process audio and generate summary"""
     # Initialize overall progress.
+    progress(0.0, desc="Starting audio processing (transcription)...")
     # Process audio (takes 0-50% of overall progress)
     transcript, clean_transcript = process_audio(audio_file, progress)
     # Transition to summarization (50-100% of overall progress)
     progress(0.5, desc="Starting summarization...")
     # Create a wrapper progress object
     class SummaryProgress:
         def __call__(self, val, desc):
             with gr.Tabs():
                 with gr.TabItem("📝 Transcript", elem_id="transcript-tab"):
                     transcript_output = gr.TextArea(
+                        label="Meeting Transcript",
+                        placeholder="Your detailed transcript with timestamps will appear here...",
                         lines=20,
                         max_lines=30,
                         elem_classes="output-text",
             1. **📁 Upload Audio**: Supports MP3, WAV, OGG, M4A, and most common audio formats.
             2. **🔑 Setup Required**:
                - **CODESTRAL_API_KEY**: Required for summarization. Get it from Mistral AI
             3. **🚀 Process**: Click the button and wait for the magic to happen!
             ### 🎵 **Audio Requirements**
             - **Language**: Optimized for English conversations.
             ### ⚡ **Features**
+            - **High-Quality Transcription**: Powered by OpenAI Whisper via WhisperX.
             - **Intelligent Summarization**: Powered by Codestral API.
+            - **Timestamp Support**: Each transcript segment includes precise timestamps.
             ### 🔧 **Troubleshooting**
             - **GPU Issues**: The app will automatically fallback to CPU if GPU is not available
             - **Audio Format**: If upload fails, try converting to WAV format first
+            - **API Issues**: Ensure your CODESTRAL_API_KEY is valid and has sufficient credits
             ### 🔒 **Privacy & Security**
             - Your audio files are processed temporarily and not stored.
             - API keys are used securely from environment variables.
+            - Only transcription is done locally; summarization uses Codestral API.
+            ### 📝 **Note**
+            - Speaker diarization has been removed for simplicity and reliability.
+            - The transcript will show timestamps but not individual speaker identification.
+            - For multi-speaker meetings, you may need to manually identify speakers from context.
             """)
     # Footer