Upload folder using huggingface_hub
asr_diarization/pipeline.py  CHANGED  (+89 -101)
@@ -17,11 +17,11 @@ class ASR_Diarization:
                 diar_model="pyannote/speaker-diarization-3.1",
                 asr_model="Capstone04/TrainedWhisper_Medium",
                 model_path=None,
-                use_vad=True,
-                vad_threshold=0.3,
-                min_segment_duration=0.5,
-                snr_threshold=15.0,
-                min_whisper_duration=0.3):
+                use_vad=True,
+                vad_threshold=0.3,
+                min_segment_duration=0.5,
+                snr_threshold=15.0,
+                min_whisper_duration=0.3):

         self.HF_TOKEN = HF_TOKEN
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
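The parameter list reads the same on both sides here (likely an indentation-only change), so the keyword defaults and existing call sites are unaffected. For orientation, instantiation presumably looks like the following; the token value and the `run_pipeline` call are illustrative, not part of this diff:

    from asr_diarization.pipeline import ASR_Diarization

    pipe = ASR_Diarization(
        HF_TOKEN="hf_...",             # placeholder Hugging Face token
        use_vad=True,                  # WebRTC VAD post-filtering on
        snr_threshold=15.0,            # dB threshold used by the VAD filtering step
        min_whisper_duration=0.3,      # segments shorter than this skip Whisper
    )
    result = pipe.run_pipeline("meeting.wav", output_dir="out", base_name="meeting")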
@@ -31,26 +31,26 @@ class ASR_Diarization:
         self.snr_threshold = snr_threshold
         self.min_whisper_duration = min_whisper_duration

-        # Load diarization model
+        # Load diarization model
         self.diar_pipeline = Pipeline.from_pretrained(diar_model, use_auth_token=HF_TOKEN)
         self.diar_pipeline = self.diar_pipeline.to(torch.device(self.device))

-        # Load WebRTC VAD for post-diarization filtering
+        # Load WebRTC VAD for post-diarization filtering
         if self.use_vad:
             try:
                 import webrtcvad
-                self.vad = webrtcvad.Vad(2)
-                print("
+                self.vad = webrtcvad.Vad(2)
+                print("WebRTC VAD loaded for post-diarization filtering")
             except ImportError:
-                print("
+                print("WebRTC VAD not available")
                 self.use_vad = False

         # Load ASR model
         if model_path and os.path.exists(model_path):
-            print(f"
+            print(f"Loading custom ASR model from: {model_path}")
             actual_asr_model = model_path
         else:
-            print(f"
+            print(f"Loading default ASR model: {asr_model}")
             actual_asr_model = asr_model

         processor = WhisperProcessor.from_pretrained(actual_asr_model, token=HF_TOKEN)
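The hunk ends at the processor load; the model and `self.asr_pipeline` construction sit below the fold. A sketch of the usual transformers wiring, under the assumption that `asr_pipeline` is a standard automatic-speech-recognition pipeline (the diff does not show this part):

    import torch
    from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline

    processor = WhisperProcessor.from_pretrained(actual_asr_model, token=HF_TOKEN)
    model = WhisperForConditionalGeneration.from_pretrained(actual_asr_model, token=HF_TOKEN)

    self.asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        device=0 if torch.cuda.is_available() else -1,   # GPU when available
    )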
@@ -93,15 +93,15 @@ class ASR_Diarization:
             return snr

         except Exception as e:
-            print(f"
+            print(f"SNR calculation failed: {e}")
             return float('inf')

     def calculate_rms_energy(self, audio_chunk):
-        """
+        """Calculate RMS energy for audio chunk"""
         return np.sqrt(np.mean(audio_chunk**2))

     def run_webrtc_vad_on_segment(self, audio_path, segment_start, segment_end):
-        """
+        """Run WebRTC VAD on segment to get speech ratio"""
         if not self.use_vad:
             return 1.0

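Most of `run_webrtc_vad_on_segment`'s body is outside this hunk. For orientation, a speech-ratio computation over fixed-size frames typically looks like this; a sketch, not the file's exact code (webrtcvad accepts only 16-bit mono PCM at 8/16/32/48 kHz in 10/20/30 ms frames):

    import numpy as np
    import webrtcvad

    def speech_ratio(chunk_f32, sr=16000, frame_ms=30, aggressiveness=2):
        """Fraction of frames WebRTC VAD flags as speech."""
        vad = webrtcvad.Vad(aggressiveness)
        # Convert float32 [-1, 1] audio to 16-bit PCM bytes
        pcm16 = (np.clip(chunk_f32, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
        frame_bytes = int(sr * frame_ms / 1000) * 2   # 2 bytes per 16-bit sample
        frames = [pcm16[i:i + frame_bytes]
                  for i in range(0, len(pcm16) - frame_bytes + 1, frame_bytes)]
        if not frames:
            return 0.0
        speech = sum(vad.is_speech(f, sr) for f in frames)
        return speech / len(frames)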
@@ -138,11 +138,11 @@ class ASR_Diarization:
             return speech_frames / total_frames if total_frames > 0 else 0.0

         except Exception as e:
-            print(f"
+            print(f"WebRTC VAD failed: {e}")
             return 0.0

     def run_diarization(self, audio_path):
-        """
+        """Run diarization with VAD AFTER approach"""
         # Step 1: Diarization sees FULL audio first
         diarization = self.diar_pipeline(audio_path)
         diar_segments = [
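The list comprehension is cut off by the hunk boundary. With pyannote, collecting start/end/speaker dicts from the returned annotation conventionally goes through `itertracks`; a sketch consistent with the `seg["start"]`/`seg["end"]`/`seg["speaker"]` keys used later in the file:

    diar_segments = [
        {"start": turn.start, "end": turn.end, "speaker": speaker}
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]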
@@ -176,7 +176,7 @@ class ASR_Diarization:

             diar_segments = filtered_segments
         else:
-            print(f"
+            print(f"Good SNR ({snr:.1f} dB), using all diarization segments")

         # Step 4: Duration filtering for Whisper
         filtered_segments = [
@@ -184,11 +184,11 @@ class ASR_Diarization:
             if (seg["end"] - seg["start"]) >= self.min_whisper_duration
         ]

-        print(f"
+        print(f"Final: {len(filtered_segments)} segments for Whisper")
         return filtered_segments

     def map_speaker_labels(self, segments, original_speakers=['A', 'B', 'C', 'D']):
-        """
+        """Map SPEAKER_XX labels to A, B, C, D format to match original"""
         unique_speakers = list(set([seg['speaker'] for seg in segments]))
         speaker_map = {}

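The loop that fills `speaker_map` falls between hunks. A plausible completion, assuming diarization labels are assigned to 'A'-'D' in sorted order; this body is a guess, not shown in the diff:

    for i, spk in enumerate(sorted(unique_speakers)):
        # Fall back to the raw label if there are more speakers than letters
        speaker_map[spk] = original_speakers[i] if i < len(original_speakers) else spk

    for seg in segments:
        seg["speaker"] = speaker_map[seg["speaker"]]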
@@ -205,12 +205,43 @@ class ASR_Diarization:

         return segments, list(speaker_map.values())

+    def merge_consecutive_speaker_segments(self, segments):
+        """Merge only consecutive segments from the same speaker while preserving order"""
+        if not segments:
+            return []
+
+        # Sort by start time to ensure correct order
+        segments.sort(key=lambda x: x["start"])
+
+        merged_segments = []
+
+        for seg in segments:
+            if not merged_segments:
+                # First segment
+                merged_segments.append(seg)
+            else:
+                last_seg = merged_segments[-1]
+
+                # Check if same speaker AND consecutive (small gap < 2 seconds)
+                if (seg["speaker"] == last_seg["speaker"] and
+                    (seg["start"] - last_seg["end"]) < 2.0):
+
+                    # Merge with previous segment
+                    last_seg["text"] += " " + seg["text"]
+                    last_seg["end"] = seg["end"]
+                else:
+                    # Different speaker or large gap - keep as separate segment
+                    merged_segments.append(seg)
+
+        print(f"🔀 Reduced {len(segments)} segments to {len(merged_segments)} while preserving order")
+        return merged_segments
+
     def run_transcription(self, audio_path, diar_json):
-        """
-        #
+        """SIMPLIFIED: Segment-level transcription without word timestamps"""
+        # Load and standardize audio
         audio, sr = torchaudio.load(audio_path)

-        #
+        # Resample to 16kHz for consistency
         if sr != 16000:
             resampler = torchaudio.transforms.Resample(sr, 16000)
             audio = resampler(audio)
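The merge rule is easy to check by hand: same speaker and a gap under 2.0 s joins, anything else stays separate. With `pipe` an ASR_Diarization instance, as in the earlier sketch:

    segments = [
        {"speaker": "SPEAKER_00", "start": 0.0, "end": 2.1, "text": "so the idea"},
        {"speaker": "SPEAKER_00", "start": 2.4, "end": 4.0, "text": "is to merge these"},
        {"speaker": "SPEAKER_01", "start": 4.2, "end": 5.0, "text": "right"},
    ]
    merged = pipe.merge_consecutive_speaker_segments(segments)
    # Two segments remain: SPEAKER_00 0.0-4.0 "so the idea is to merge these",
    # then SPEAKER_01 4.2-5.0 "right" (0.3 s same-speaker gap merged, speaker change kept).

Worth noting: the method sorts and mutates its input list in place, concatenating text into the first segment of each run; callers that still need the unmerged segments should pass a copy.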
@@ -219,13 +250,13 @@ class ASR_Diarization:
         merged_segments = []
         speaker_segments = {}

-        #
+        # Calculate SNR for adaptive noise reduction
         snr = self.calculate_snr(audio_path)

         for seg in diar_json:
             start, end, spk = seg["start"], seg["end"], seg["speaker"]

-            #
+            # Skip segments that are too short for Whisper
            segment_duration = end - start
            if segment_duration < self.min_whisper_duration:
                print(f"⏩ Skipping short segment for Whisper: {start:.2f}-{end:.2f} ({segment_duration:.2f}s)")
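`calculate_snr`'s body never appears in this diff; judging by the comparisons against 10 and 15 dB, it returns a decibel figure for the whole file. One common percentile-based estimate, offered purely as a sketch of the idea rather than the file's actual method:

    import numpy as np
    import torchaudio

    def estimate_snr_db(audio_path):
        """Rough SNR: loudest frames treated as speech, quietest as noise floor."""
        audio, sr = torchaudio.load(audio_path)
        x = audio.mean(dim=0).numpy()
        frame = int(0.02 * sr)                       # 20 ms frames
        n = (len(x) // frame) * frame
        if n == 0:
            return float("inf")
        rms = np.sqrt((x[:n].reshape(-1, frame) ** 2).mean(axis=1)) + 1e-10
        signal = np.percentile(rms, 90)
        noise = np.percentile(rms, 10)
        return 20 * np.log10(signal / noise)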
@@ -233,16 +264,16 @@ class ASR_Diarization:

             start_sample, end_sample = int(start * sr), int(end * sr)

-            #
+            # Handle both mono and stereo audio
             if audio.shape[0] > 1:  # Stereo
                 chunk = torch.mean(audio[:, start_sample:end_sample], dim=0).numpy()
             else:  # Mono
                 chunk = audio[0, start_sample:end_sample].numpy()

-            #
+            # Calculate RMS energy for this segment
             rms_energy = self.calculate_rms_energy(chunk)

-            #
+            # Adaptive noise reduction based on SNR + RMS
             if len(chunk) > int(0.1 * sr):
                 if snr < 10 or rms_energy < 0.01:  # Very noisy or low energy
                     reduced = nr.reduce_noise(y=chunk, sr=sr, stationary=True, prop_decrease=0.8)
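The hunk cuts off after the harshest branch; given the `snr < 10` tier at `prop_decrease=0.8` and the `reduced = chunk` fallthrough at the top of the next hunk, a middle tier presumably softens the reduction for moderately noisy audio. A sketch of the tiering; the `snr < 15` branch and its 0.5 value are assumptions:

    import noisereduce as nr

    if len(chunk) > int(0.1 * sr):                 # need more than 100 ms of audio
        if snr < 10 or rms_energy < 0.01:          # very noisy or very quiet
            reduced = nr.reduce_noise(y=chunk, sr=sr, stationary=True, prop_decrease=0.8)
        elif snr < 15:                             # moderately noisy (assumed tier)
            reduced = nr.reduce_noise(y=chunk, sr=sr, stationary=True, prop_decrease=0.5)
        else:
            reduced = chunk                        # clean enough, skip denoising
    else:
        reduced = chunk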
@@ -254,112 +285,68 @@ class ASR_Diarization:
                 reduced = chunk

             try:
-                #
+                # SIMPLIFIED: Get text without timestamps
                 result = self.asr_pipeline(
                     reduced,
-                    return_timestamps="word",  # FORCE word-level timestamps
                     generate_kwargs={
                         "task": "transcribe",
-                        "language": "en"
+                        "language": "en",
+                        "temperature": 0.0  # More accurate transcription
                     }
                 )
             except Exception as e:
                 print(f"⚠️ Whisper failed on segment {start:.2f}-{end:.2f}: {e}")
                 continue

-
-
-
-
-            if "chunks" in result:
-                for chunk_info in result["chunks"]:
-                    timestamp = chunk_info.get("timestamp")
-                    text = chunk_info.get("text", "").strip()
-
-                    if text and timestamp:
-                        chunk_start, chunk_end = timestamp
-
-                        # Validate and convert to absolute time
-                        if 0 <= chunk_start <= chunk_end <= (end - start):
-                            abs_start = start + chunk_start
-                            abs_end = start + chunk_end
-                        else:
-                            # Fallback: use segment boundaries
-                            abs_start = start
-                            abs_end = end
-
-                        # NEW: Split into individual words with distributed timestamps
-                        words = text.split()
-                        if len(words) == 1:
-                            # Single word - use original timestamp
-                            tokens.append({
-                                "start": abs_start,
-                                "end": abs_end,
-                                "text": text,
-                                "tag": "w"
-                            })
-                        else:
-                            # Multiple words - distribute time evenly
-                            word_duration = (abs_end - abs_start) / len(words)
-                            for i, word in enumerate(words):
-                                word_start = abs_start + (i * word_duration)
-                                word_end = word_start + word_duration
-                                tokens.append({
-                                    "start": word_start,
-                                    "end": word_end,
-                                    "text": word,
-                                    "tag": "w"
-                                })
-
-                        segment_text += text + " "
-
-            # NEW: Only add segment if we got content
-            if tokens or segment_text.strip():
+            # Extract just the text (no timestamp processing)
+            text = result.get("text", "").strip()
+
+            if text:
                 seg_dict = {
                     "speaker": spk,
-                    "start": start,
-                    "end": end,
-                    "
-                    "
-                    "rms_energy": float(rms_energy)  # NEW: Store RMS energy
+                    "start": start,  # Keep segment boundaries
+                    "end": end,  # Keep segment boundaries
+                    "text": text,  # Just the full segment text
+                    "rms_energy": float(rms_energy)
                 }
                 merged_segments.append(seg_dict)

                 if spk not in speaker_segments:
                     speaker_segments[spk] = []
                 speaker_segments[spk].append(seg_dict)
-            else:
-                print(f"🔇 Empty transcription for segment {start:.2f}-{end:.2f}")

         return merged_segments, list(speaker_segments.keys())

     def run_pipeline(self, audio_path, output_dir=None, base_name=None,
-                     ref_rttm=None, ref_json=None, nse_events=None):
-        """
-        #
+                     ref_rttm=None, ref_json=None, nse_events=None):
+        """Add input validation and proper RTTM format"""
+        # Validate input audio file
         if not os.path.exists(audio_path):
             raise FileNotFoundError(f"Audio file not found: {audio_path}")

         try:
-            #
+            # Quick validation that it's loadable audio
             audio, sr = torchaudio.load(audio_path)
             if audio.numel() == 0:
                 raise ValueError("Audio file is empty")
         except Exception as e:
             raise ValueError(f"Invalid audio file: {e}")

-        print(f"
+        print(f"Processing with VAD: {'ON' if self.use_vad else 'OFF'}")

         # Run diarization and transcription
         diar_json = self.run_diarization(audio_path)
         merged_segments, speakers = self.run_transcription(audio_path, diar_json)

-        # NEW:
+        # NEW: Merge consecutive segments by same speaker
+        merged_segments = self.merge_consecutive_speaker_segments(merged_segments)
+
+        # Map speaker labels to match original format (A, B, C, D)
         merged_segments, speakers = self.map_speaker_labels(merged_segments)

-        #
+        # Combine ASR segments with NSE events if provided
         if nse_events:
-            print(f"
+            print(f"Combining {len(merged_segments)} ASR segments with {len(nse_events)} NSE events")
             all_segments = merged_segments + nse_events
             # Sort by start time for proper timeline
             all_segments.sort(key=lambda x: x["start"])
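The reason the old block disappears wholesale: the transformers ASR pipeline only returns a "chunks" list when `return_timestamps` is requested. Dropping that argument leaves a plain text payload, which is all the new code reads (toy output values for illustration):

    # Old call: word timestamps requested, so result carried a "chunks" list
    result = asr_pipeline(reduced, return_timestamps="word",
                          generate_kwargs={"task": "transcribe", "language": "en"})
    # -> {"text": " hello there", "chunks": [{"timestamp": (0.0, 0.4), "text": " hello"}, ...]}

    # New call: transcript only
    result = asr_pipeline(reduced, generate_kwargs={"task": "transcribe",
                                                    "language": "en", "temperature": 0.0})
    # -> {"text": " hello there"}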
@@ -369,14 +356,14 @@ class ASR_Diarization:
         if output_dir and base_name:
             os.makedirs(output_dir, exist_ok=True)

-            #
+            # Save RTTM with standard format and precision
             rttm_path = os.path.join(output_dir, f"{base_name}.rttm")
             with open(rttm_path, "w") as f:
                 for seg in diar_json:
                     f.write(
                         f"SPEAKER {base_name} 1 {seg['start']:.3f} "
                         f"{seg['end']-seg['start']:.3f} <NA> <NA> "
-                        f"{seg['speaker']} <NA> <NA>\n"
+                        f"{seg['speaker']} <NA> <NA>\n"
                     )

         # Save transcription (with NSE events if available)
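The three f-string pieces concatenate into the standard ten-field RTTM record: type, file id, channel, onset, duration, two <NA> placeholders, speaker label, and two trailing <NA>s. For base_name "meeting" and a SPEAKER_00 turn from 1.5 s to 4.25 s, the written line is:

    SPEAKER meeting 1 1.500 2.750 <NA> <NA> SPEAKER_00 <NA> <NA>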
@@ -397,7 +384,7 @@ class ASR_Diarization:
         }

     def evaluate(self, output_dir, base_name, ref_rttm=None, ref_json=None):
-        #
+        # Add output_dir validation
         if not output_dir or not base_name:
             return None

@@ -421,9 +408,10 @@ class ASR_Diarization:
         if ref_json and os.path.exists(hyp_json):
             def load_words(path):
                 data = json.load(open(path))
-                #
+                # Filter out NSE events for WER calculation (only use speech)
                 speech_segments = [seg for seg in data if seg.get("speaker") != "NSE"]
-
+                # NEW: Directly use segment text instead of tokens
+                return " ".join([seg["text"] for seg in speech_segments])

             ref_text, hyp_text = load_words(ref_json), load_words(hyp_json)
             transform = Compose([ToLowerCase(), RemovePunctuation(),
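The Compose line is truncated mid-list, but the pattern is standard jiwer: normalize both strings with the same transform chain, then compare. A self-contained sketch using just the two visible transforms (the file's chain presumably has more steps):

    from jiwer import Compose, RemovePunctuation, ToLowerCase, wer

    transform = Compose([ToLowerCase(), RemovePunctuation()])
    ref_text = "Hello, there General Kenobi"
    hyp_text = "hello their general kenobi"
    print(wer(transform(ref_text), transform(hyp_text)))  # 0.25: one substitution in four words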
@@ -433,7 +421,7 @@ class ASR_Diarization:

         return results if results else None

-    def __call__(self, inputs, nse_events=None):
+    def __call__(self, inputs, nse_events=None):
         """FIXED: Add proper temporary file cleanup"""
         if isinstance(inputs, dict):
             if "audio_bytes" in inputs:
@@ -456,6 +444,6 @@ class ASR_Diarization:
             result = self.run_pipeline(tmp_path, nse_events=nse_events)
             return result
         finally:
-            #
+            # Always clean up temporary file
             if tmp_path and os.path.exists(tmp_path):
                 os.unlink(tmp_path)
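The fix here is the classic delete=False plus finally-unlink pattern. A self-contained sketch of what `__call__` presumably does with byte input; the helper name and `.wav` suffix are illustrative, only `tmp_path` and `nse_events` appear in the diff:

    import os
    import tempfile

    def transcribe_bytes(pipe, audio_bytes, nse_events=None):
        """Write raw audio bytes to a temp file, run the pipeline, always clean up."""
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                tmp.write(audio_bytes)
                tmp_path = tmp.name
            return pipe.run_pipeline(tmp_path, nse_events=nse_events)
        finally:
            # delete=False keeps the file after close; unlink must happen manually
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)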