msmaje committed on
Commit 55a9df3 · verified · 1 Parent(s): 3e0afc5

Update app.py

Files changed (1)
  1. app.py +34 -69
app.py CHANGED
@@ -4,7 +4,7 @@ import tempfile
 import time
 import torch
 from pydub import AudioSegment
-import whisperx # Using whisperx for integrated transcription and diarization
+import whisperx # Using whisperx for transcription only
 import warnings
 import requests # For Codestral API calls
 
@@ -17,41 +17,24 @@ print(f"Using device: {device} with compute_type: {compute_type}")
 
 # Global variables for models
 whisper_model = None
-diarization_model = None
-# We'll load the whisperx model once
 
-def load_whisperx_models():
-    """Load WhisperX transcription and diarization models."""
-    global whisper_model, diarization_model
+def load_whisperx_model():
+    """Load WhisperX transcription model only."""
+    global whisper_model
     if whisper_model is None:
         try:
             print("Loading WhisperX transcription model...")
-            # Pass local_files_only=False to allow downloading if not cached
             whisper_model = whisperx.load_model(
                 "base",
                 device=device,
-                compute_type=compute_type,
+                compute_type=compute_type,
                 local_files_only=False
             )
             print("WhisperX transcription model loaded successfully!")
-
-            print("Loading WhisperX diarization model (from pyannote)...")
-            # Get HuggingFace token from environment
-            hf_token = os.environ.get("HF_TOKEN")
-            if not hf_token:
-                raise ValueError("HF_TOKEN environment variable not found. This is required for pyannote diarization models.")
-
-            # Fix: Pass cache_dir parameter to force downloading
-            diarization_model = whisperx.DiarizationPipeline(
-                use_auth_token=hf_token,
-                device=device
-            )
-            print("WhisperX diarization model loaded successfully!")
-
         except Exception as e:
-            print(f"Error loading WhisperX models: {e}")
+            print(f"Error loading WhisperX model: {e}")
             raise e
-    return whisper_model, diarization_model
+    return whisper_model
 
 def convert_audio(input_file):
     """Convert uploaded audio to WAV format"""
@@ -64,7 +47,7 @@ def convert_audio(input_file):
 
         # Convert to WAV using pydub
         audio = AudioSegment.from_file(input_file)
-        # Ensure mono channel and reasonable sample rate for Whisper/pyannote
+        # Ensure mono channel and reasonable sample rate for Whisper
         audio = audio.set_channels(1).set_frame_rate(16000)
         audio.export(wav_path, format="wav")
         return wav_path
@@ -72,7 +55,7 @@ def convert_audio(input_file):
         return f"Error converting audio: {str(e)}"
 
 def process_audio(audio_file, progress=gr.Progress()):
-    """Process the audio file: transcribe and diarize using whisperx"""
+    """Process the audio file: transcribe using whisperx"""
     if not audio_file:
         return "❌ Please upload an audio file", None
 
@@ -85,67 +68,49 @@ def process_audio(audio_file, progress=gr.Progress()):
         if isinstance(wav_path, str) and wav_path.startswith("Error"):
             return wav_path, None
 
-        progress(0.2, desc="Loading AI models (WhisperX & Diarization)...")
+        progress(0.3, desc="Loading AI transcription model...")
 
-        # 2. Load WhisperX models
+        # 2. Load WhisperX model
         try:
-            model_a, model_b = load_whisperx_models() # model_a is whisper_model, model_b is diarization_model
+            model = load_whisperx_model()
            # Audio needs to be loaded separately for whisperx
            audio = whisperx.load_audio(wav_path)
        except Exception as e:
            error_msg = str(e)
-            if "authentication" in error_msg.lower() or "token" in error_msg.lower():
-                return "❌ Authentication Error: Please ensure your HuggingFace token is set correctly in the environment variables and has access to pyannote models. Visit https://huggingface.co/pyannote/speaker-diarization-3.1 to accept the user conditions first.", None
-            return f"❌ Error loading AI models: {error_msg}", None
+            return f"❌ Error loading AI model: {error_msg}", None
 
-        progress(0.5, desc="Transcribing audio...")
+        progress(0.6, desc="Transcribing audio...")
 
         # 3. Transcribe audio with WhisperX
         try:
             # Transcribe with batch processing
-            result = model_a.transcribe(audio, batch_size=16) # Adjust batch_size based on VRAM
+            result = model.transcribe(audio, batch_size=16)
 
             # Check if we have valid transcription results
             if not result or "segments" not in result:
                 return "❌ No transcription results obtained from the audio", None
 
-            progress(0.7, desc="Performing speaker diarization...")
-
-            # Align the transcription for better diarization
-            model_a_align, metadata = whisperx.load_align_model(
-                language_code=result["language"],
-                device=device
-            )
-            result = whisperx.align(result["segments"], model_a_align, metadata, audio, device, return_char_alignments=False)
-
-            # Diarize audio
-            diarize_segments = model_b(audio)
-
-            # Assign speakers to segments
-            result = whisperx.assign_word_speakers(diarize_segments, result)
-
         except Exception as e:
             error_msg = str(e)
             if "CUDA" in error_msg or "GPU" in error_msg:
                 return f"❌ GPU Error: {error_msg}. Try using CPU mode or check your CUDA installation.", None
-            return f"❌ Error during transcription or diarization: {error_msg}", None
+            return f"❌ Error during transcription: {error_msg}", None
 
         progress(0.9, desc="Formatting transcript...")
 
-        # 4. Format transcription with speaker labels
+        # 4. Format transcription without speaker labels
         combined_output = []
 
         if result and "segments" in result:
             for segment in result["segments"]:
                 start_time = segment.get("start", 0)
                 end_time = segment.get("end", 0)
-                speaker = segment.get("speaker", "UNKNOWN") # Speaker ID from diarization
                 text = segment.get("text", "").strip()
 
                 if not text:
                     continue
 
-                combined_output.append(f"🗣️ Speaker {speaker} 🕐 [{start_time:.1f}s - {end_time:.1f}s]: {text}")
+                combined_output.append(f"🕐 [{start_time:.1f}s - {end_time:.1f}s]: {text}")
 
         # Create final output
         combined_text = "\n\n".join(combined_output)
@@ -155,7 +120,7 @@ def process_audio(audio_file, progress=gr.Progress()):
 
         progress(1.0, desc="Complete!")
 
-        return f"✅ **Processing Complete!**\n\n{combined_text}", combined_text
+        return f"✅ **Transcription Complete!**\n\n{combined_text}", combined_text
 
     except Exception as e:
         return f"❌ Unexpected error: {str(e)}", None
@@ -173,10 +138,10 @@ def summarize_meeting(transcript_text, model_choice, progress=gr.Progress()):
         return "❌ No valid transcript available to summarize"
 
     # Retrieve Codestral API key from environment variable
-    codestral_api_key = os.environ.get("CODESTRAL_API_KEY") or os.environ.get("HF_TOKEN")
+    codestral_api_key = os.environ.get("CODESTRAL_API_KEY")
 
     if not codestral_api_key:
-        return "❌ Codestral API Key not found. Please set CODESTRAL_API_KEY or HF_TOKEN in environment variables."
+        return "❌ Codestral API Key not found. Please set CODESTRAL_API_KEY in environment variables."
 
     # Update progress directly within the function
     progress(0.1, desc=f"Sending transcript to Codestral ({model_choice})...")
@@ -209,7 +174,7 @@ Transcript:
             {"role": "user", "content": prompt}
         ],
         "temperature": 0.7,
-        "max_tokens": 1000 # Increased for better summaries
+        "max_tokens": 1000
     }
 
     try:
@@ -234,7 +199,7 @@ Transcript:
 def process_and_summarize(audio_file, model_choice, progress=gr.Progress()):
     """Combined function to process audio and generate summary"""
     # Initialize overall progress.
-    progress(0.0, desc="Starting audio processing (transcription & diarization)...")
+    progress(0.0, desc="Starting audio processing (transcription)...")
 
     # Process audio (takes 0-50% of overall progress)
     transcript, clean_transcript = process_audio(audio_file, progress)
@@ -245,10 +210,6 @@ def process_and_summarize(audio_file, model_choice, progress=gr.Progress()):
     # Transition to summarization (50-100% of overall progress)
     progress(0.5, desc="Starting summarization...")
 
-    # Create a sub-progress for summarization
-    def summary_progress(val, desc):
-        progress(0.5 + (val * 0.5), desc)
-
     # Create a wrapper progress object
     class SummaryProgress:
         def __call__(self, val, desc):
@@ -431,8 +392,8 @@ with gr.Blocks(
     with gr.Tabs():
         with gr.TabItem("📝 Transcript", elem_id="transcript-tab"):
             transcript_output = gr.TextArea(
-                label="Meeting Transcript (with Speaker Diarization)",
-                placeholder="Your detailed transcript with speaker labels will appear here...",
+                label="Meeting Transcript",
+                placeholder="Your detailed transcript with timestamps will appear here...",
                 lines=20,
                 max_lines=30,
                 elem_classes="output-text",
@@ -457,9 +418,7 @@ with gr.Blocks(
 
     1. **📁 Upload Audio**: Supports MP3, WAV, OGG, M4A, and most common audio formats.
     2. **🔑 Setup Required**:
-       - **HF_TOKEN**: Required for pyannote diarization models. Get it from https://huggingface.co/settings/tokens
        - **CODESTRAL_API_KEY**: Required for summarization. Get it from Mistral AI
-       - Visit https://huggingface.co/pyannote/speaker-diarization-3.1 and accept user conditions
     3. **🚀 Process**: Click the button and wait for the magic to happen!
 
@@ -469,18 +428,24 @@ with gr.Blocks(
     - **Language**: Optimized for English conversations.
 
     ### ⚡ **Features**
-    - **High-Quality Transcription**: Powered by OpenAI Whisper.
-    - **Accurate Speaker Diarization**: Identifies different speakers using pyannote.
+    - **High-Quality Transcription**: Powered by OpenAI Whisper via WhisperX.
     - **Intelligent Summarization**: Powered by Codestral API.
+    - **Timestamp Support**: Each transcript segment includes precise timestamps.
 
     ### 🔧 **Troubleshooting**
-    - **Authentication Error**: Ensure HF_TOKEN is set and you've accepted pyannote user conditions
     - **GPU Issues**: The app will automatically fallback to CPU if GPU is not available
     - **Audio Format**: If upload fails, try converting to WAV format first
+    - **API Issues**: Ensure your CODESTRAL_API_KEY is valid and has sufficient credits
 
     ### 🔒 **Privacy & Security**
     - Your audio files are processed temporarily and not stored.
     - API keys are used securely from environment variables.
+    - Only transcription is done locally; summarization uses Codestral API.
+
+    ### 📝 **Note**
+    - Speaker diarization has been removed for simplicity and reliability.
+    - The transcript will show timestamps but not individual speaker identification.
+    - For multi-speaker meetings, you may need to manually identify speakers from context.
     """)
 
     # Footer
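
For context, after this commit app.py boils down to the transcription-only flow below. This is a minimal sketch, assuming whisperx and pydub are installed and ffmpeg is on the PATH; the Gradio progress reporting, temp-file cleanup, and error strings of the real app are omitted, and the float16/int8 compute_type choice is assumed to match the device setup printed at app start.

import tempfile

import torch
import whisperx
from pydub import AudioSegment

device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"  # assumed to mirror app.py's setup

def transcribe_file(path):
    # Normalize to 16 kHz mono WAV, mirroring convert_audio() in app.py.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    AudioSegment.from_file(path).set_channels(1).set_frame_rate(16000).export(wav_path, format="wav")

    model = whisperx.load_model("base", device=device, compute_type=compute_type)
    audio = whisperx.load_audio(wav_path)
    result = model.transcribe(audio, batch_size=16)

    # Timestamped lines, matching the new combined_output format.
    lines = []
    for seg in result.get("segments", []):
        text = seg.get("text", "").strip()
        if text:
            lines.append(f"[{seg.get('start', 0):.1f}s - {seg.get('end', 0):.1f}s]: {text}")
    return "\n\n".join(lines)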
 
138
  return "❌ No valid transcript available to summarize"
139
 
140
  # Retrieve Codestral API key from environment variable
141
+ codestral_api_key = os.environ.get("CODESTRAL_API_KEY")
142
 
143
  if not codestral_api_key:
144
+ return "❌ Codestral API Key not found. Please set CODESTRAL_API_KEY in environment variables."
145
 
146
  # Update progress directly within the function
147
  progress(0.1, desc=f"Sending transcript to Codestral ({model_choice})...")
 
174
  {"role": "user", "content": prompt}
175
  ],
176
  "temperature": 0.7,
177
+ "max_tokens": 1000
178
  }
179
 
180
  try:
 
199
  def process_and_summarize(audio_file, model_choice, progress=gr.Progress()):
200
  """Combined function to process audio and generate summary"""
201
  # Initialize overall progress.
202
+ progress(0.0, desc="Starting audio processing (transcription)...")
203
 
204
  # Process audio (takes 0-50% of overall progress)
205
  transcript, clean_transcript = process_audio(audio_file, progress)
 
210
  # Transition to summarization (50-100% of overall progress)
211
  progress(0.5, desc="Starting summarization...")
212
 
 
 
 
 
213
  # Create a wrapper progress object
214
  class SummaryProgress:
215
  def __call__(self, val, desc):
 
392
  with gr.Tabs():
393
  with gr.TabItem("πŸ“ Transcript", elem_id="transcript-tab"):
394
  transcript_output = gr.TextArea(
395
+ label="Meeting Transcript",
396
+ placeholder="Your detailed transcript with timestamps will appear here...",
397
  lines=20,
398
  max_lines=30,
399
  elem_classes="output-text",
 
418
 
419
  1. **πŸ“ Upload Audio**: Supports MP3, WAV, OGG, M4A, and most common audio formats.
420
  2. **πŸ”‘ Setup Required**:
 
421
  - **CODESTRAL_API_KEY**: Required for summarization. Get it from Mistral AI
 
422
  3. **πŸš€ Process**: Click the button and wait for the magic to happen!
423
 
424
  ### 🎡 **Audio Requirements**
 
428
  - **Language**: Optimized for English conversations.
429
 
430
  ### ⚑ **Features**
431
+ - **High-Quality Transcription**: Powered by OpenAI Whisper via WhisperX.
 
432
  - **Intelligent Summarization**: Powered by Codestral API.
433
+ - **Timestamp Support**: Each transcript segment includes precise timestamps.
434
 
435
  ### πŸ”§ **Troubleshooting**
 
436
  - **GPU Issues**: The app will automatically fallback to CPU if GPU is not available
437
  - **Audio Format**: If upload fails, try converting to WAV format first
438
+ - **API Issues**: Ensure your CODESTRAL_API_KEY is valid and has sufficient credits
439
 
440
  ### πŸ”’ **Privacy & Security**
441
  - Your audio files are processed temporarily and not stored.
442
  - API keys are used securely from environment variables.
443
+ - Only transcription is done locally; summarization uses Codestral API.
444
+
445
+ ### πŸ“ **Note**
446
+ - Speaker diarization has been removed for simplicity and reliability.
447
+ - The transcript will show timestamps but not individual speaker identification.
448
+ - For multi-speaker meetings, you may need to manually identify speakers from context.
449
  """)
450
 
451
  # Footer