Capstone04 committed
Commit af763b6 · verified · 1 Parent(s): 67dfefe

Upload folder using huggingface_hub

Files changed (1)
  1. asr_diarization/pipeline.py +292 -44
asr_diarization/pipeline.py CHANGED
@@ -3,6 +3,7 @@ import json
 import torch
 import torchaudio
 import noisereduce as nr
+import numpy as np
 from pyannote.audio import Pipeline
 from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline as hf_pipeline
 import tempfile
@@ -15,14 +16,36 @@ class ASR_Diarization:
     def __init__(self, HF_TOKEN,
                  diar_model="pyannote/speaker-diarization-3.1",
                  asr_model="Capstone04/TrainedWhisper_Medium",
-                 model_path=None): # NEW: model_path parameter
+                 model_path=None,
+                 use_vad=True,              # NEW: VAD after diarization
+                 vad_threshold=0.3,         # NEW: VAD speech ratio threshold
+                 min_segment_duration=0.5,  # NEW: Minimum segment duration
+                 snr_threshold=15.0,        # NEW: SNR threshold for adaptive processing
+                 min_whisper_duration=0.3): # NEW: Minimum duration for Whisper
+
         self.HF_TOKEN = HF_TOKEN
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.use_vad = use_vad
+        self.vad_threshold = vad_threshold
+        self.min_segment_duration = min_segment_duration
+        self.snr_threshold = snr_threshold
+        self.min_whisper_duration = min_whisper_duration

-        # Load diarization model
+        # Load diarization model - FIX: Add device
         self.diar_pipeline = Pipeline.from_pretrained(diar_model, use_auth_token=HF_TOKEN)
+        self.diar_pipeline = self.diar_pipeline.to(torch.device(self.device))

-        # MODIFIED: Use custom model_path if provided, otherwise use asr_model
+        # Load WebRTC VAD for post-diarization filtering - NEW
+        if self.use_vad:
+            try:
+                import webrtcvad
+                self.vad = webrtcvad.Vad(2)  # Medium aggressiveness
+                print("✅ WebRTC VAD loaded for post-diarization filtering")
+            except ImportError:
+                print("⚠️ WebRTC VAD not available")
+                self.use_vad = False
+
+        # Load ASR model
         if model_path and os.path.exists(model_path):
             print(f"🔄 Loading custom ASR model from: {model_path}")
             actual_asr_model = model_path
@@ -42,75 +65,285 @@ class ASR_Diarization:
             return_timestamps=True
         )

+    def calculate_snr(self, audio_path):
+        """NEW: Calculate SNR using RMS energy"""
+        try:
+            import librosa
+            y, sr = librosa.load(audio_path, sr=16000, mono=True)
+
+            # RMS-based SNR
+            rms = librosa.feature.rms(y=y)[0]
+            if len(rms) == 0:
+                return float('inf')
+
+            # Signal = high RMS regions, Noise = low RMS regions
+            high_rms = rms[rms > np.percentile(rms, 70)]
+            low_rms = rms[rms <= np.percentile(rms, 30)]
+
+            if len(high_rms) == 0 or len(low_rms) == 0:
+                return float('inf')
+
+            signal_power = np.mean(high_rms)
+            noise_power = np.mean(low_rms)
+
+            if noise_power == 0:
+                return float('inf')
+
+            snr = 10 * np.log10(signal_power / noise_power)
+            return snr
+
+        except Exception as e:
+            print(f"⚠️ SNR calculation failed: {e}")
+            return float('inf')
+
+    def calculate_rms_energy(self, audio_chunk):
+        """NEW: Calculate RMS energy for audio chunk"""
+        return np.sqrt(np.mean(audio_chunk**2))
+
+    def run_webrtc_vad_on_segment(self, audio_path, segment_start, segment_end):
+        """NEW: Run WebRTC VAD on segment to get speech ratio"""
+        if not self.use_vad:
+            return 1.0
+
+        try:
+            import wave
+            # Load audio
+            with wave.open(audio_path, "rb") as wf:
+                sample_rate = wf.getframerate()
+                n_frames = wf.getnframes()
+                audio_data = wf.readframes(n_frames)
+
+            audio_array = np.frombuffer(audio_data, dtype=np.int16)
+            start_sample = int(segment_start * sample_rate)
+            end_sample = int(segment_end * sample_rate)
+            segment_audio = audio_array[start_sample:end_sample]
+            segment_bytes = segment_audio.tobytes()
+
+            # WebRTC VAD processing (30ms frames)
+            frame_duration = 30
+            bytes_per_sample = 2
+            frame_size = int(sample_rate * frame_duration / 1000) * bytes_per_sample
+
+            speech_frames = 0
+            total_frames = 0
+
+            for i in range(0, len(segment_bytes) - frame_size + 1, frame_size):
+                frame = segment_bytes[i:i + frame_size]
+                if len(frame) == frame_size:
+                    is_speech = self.vad.is_speech(frame, sample_rate)
+                    if is_speech:
+                        speech_frames += 1
+                    total_frames += 1
+
+            return speech_frames / total_frames if total_frames > 0 else 0.0
+
+        except Exception as e:
+            print(f"⚠️ WebRTC VAD failed: {e}")
+            return 0.0
+
     def run_diarization(self, audio_path):
+        """FIXED: Run diarization with VAD AFTER approach"""
+        # Step 1: Diarization sees FULL audio first
         diarization = self.diar_pipeline(audio_path)
-        return [
+        diar_segments = [
             {"start": t.start, "end": t.end, "speaker": spk}
             for t, _, spk in diarization.itertracks(yield_label=True)
         ]
+
+        print(f"🎯 Diarization found {len(diar_segments)} segments")
+
+        # Step 2: Calculate SNR for adaptive processing
+        snr = self.calculate_snr(audio_path)
+
+        # Step 3: Apply VAD filtering ONLY if low SNR
+        if snr < self.snr_threshold and self.use_vad:
+            print(f"🔇 Low SNR ({snr:.1f} dB), applying VAD filtering")
+            filtered_segments = []
+
+            for seg in diar_segments:
+                # Skip VAD for very short segments
+                if (seg["end"] - seg["start"]) < 0.2:
+                    continue
+
+                speech_ratio = self.run_webrtc_vad_on_segment(
+                    audio_path, seg["start"], seg["end"]
+                )
+
+                if speech_ratio >= self.vad_threshold:
+                    filtered_segments.append(seg)
+                else:
+                    print(f"🔇 Filtered low-speech segment: {seg['start']:.2f}-{seg['end']:.2f} (speech: {speech_ratio:.1%})")
+
+            diar_segments = filtered_segments
+        else:
+            print(f"✅ Good SNR ({snr:.1f} dB), using all diarization segments")
+
+        # Step 4: Duration filtering for Whisper
+        filtered_segments = [
+            seg for seg in diar_segments
+            if (seg["end"] - seg["start"]) >= self.min_whisper_duration
+        ]
+
+        print(f"🎯 Final: {len(filtered_segments)} segments for Whisper")
+        return filtered_segments

     def run_transcription(self, audio_path, diar_json):
+        """FIXED: Transcription with proper timestamp conversion and error handling"""
+        # FIX: Load and standardize audio
         audio, sr = torchaudio.load(audio_path)
+
+        # FIX: Resample to 16kHz for consistency
+        if sr != 16000:
+            resampler = torchaudio.transforms.Resample(sr, 16000)
+            audio = resampler(audio)
+            sr = 16000
+
         merged_segments = []
         speaker_segments = {}
+
+        # NEW: Calculate SNR for adaptive noise reduction
+        snr = self.calculate_snr(audio_path)

         for seg in diar_json:
             start, end, spk = seg["start"], seg["end"], seg["speaker"]
+
+            # NEW: Skip segments that are too short for Whisper
+            segment_duration = end - start
+            if segment_duration < self.min_whisper_duration:
+                print(f"⏩ Skipping short segment for Whisper: {start:.2f}-{end:.2f} ({segment_duration:.2f}s)")
+                continue
+
             start_sample, end_sample = int(start * sr), int(end * sr)
-            chunk = audio[0, start_sample:end_sample].numpy()
+
+            # FIX: Handle both mono and stereo audio
+            if audio.shape[0] > 1:  # Stereo
+                chunk = torch.mean(audio[:, start_sample:end_sample], dim=0).numpy()
+            else:  # Mono
+                chunk = audio[0, start_sample:end_sample].numpy()
+
+            # NEW: Calculate RMS energy for this segment
+            rms_energy = self.calculate_rms_energy(chunk)
+
+            # NEW: Adaptive noise reduction based on SNR + RMS
+            if len(chunk) > int(0.1 * sr):
+                if snr < 10 or rms_energy < 0.01:  # Very noisy or low energy
+                    reduced = nr.reduce_noise(y=chunk, sr=sr, stationary=True, prop_decrease=0.8)
+                elif snr < 20:  # Moderately noisy
+                    reduced = nr.reduce_noise(y=chunk, sr=sr, stationary=True, prop_decrease=0.5)
+                else:  # Clean audio
+                    reduced = chunk
+            else:
+                reduced = chunk

-            reduced = nr.reduce_noise(y=chunk, sr=sr)
-            result = self.asr_pipeline(reduced)
+            try:
+                result = self.asr_pipeline(reduced)
+            except Exception as e:
+                print(f"⚠️ Whisper failed on segment {start:.2f}-{end:.2f}: {e}")
+                continue

             tokens = []
+            segment_text = ""
+
             if "chunks" in result:
                 for word_info in result["chunks"]:
-                    start_ts, end_ts = word_info.get("timestamp", (None, None)) or (None, None)
-                    tokens.append({
-                        "start": start_ts,
-                        "end": end_ts,
-                        "text": word_info["text"],
-                        "tag": "w"
-                    })
-
-            seg_dict = {
-                "speaker": spk,
-                "start": start,
-                "end": end,
-                "tokens": tokens
-            }
-            merged_segments.append(seg_dict)
-
-            if spk not in speaker_segments:
-                speaker_segments[spk] = []
-            speaker_segments[spk].append(seg_dict)
+                    # FIX: Convert relative timestamps to absolute
+                    timestamp = word_info.get("timestamp")
+                    text = word_info.get("text", "").strip()
+
+                    if text:
+                        if timestamp and isinstance(timestamp, (list, tuple)) and len(timestamp) == 2:
+                            rel_start, rel_end = timestamp
+                            # Validate timestamps are reasonable
+                            if 0 <= rel_start < rel_end <= (end - start):
+                                abs_start = start + rel_start  # Convert to absolute time
+                                abs_end = start + rel_end      # Convert to absolute time
+                            else:
+                                # Invalid timestamps, use segment boundaries
+                                abs_start = start
+                                abs_end = end
+                        else:
+                            # No timestamps from Whisper, use segment boundaries
+                            abs_start = start
+                            abs_end = end
+
+                        tokens.append({
+                            "start": abs_start,  # Store absolute time
+                            "end": abs_end,      # Store absolute time
+                            "text": text,
+                            "tag": "w"
+                        })
+
+                        segment_text += text + " "
+
+            # NEW: Only add segment if we got content
+            if tokens or segment_text.strip():
+                seg_dict = {
+                    "speaker": spk,
+                    "start": start,
+                    "end": end,
+                    "tokens": tokens,
+                    "text": segment_text.strip(),    # NEW: Add full segment text
+                    "rms_energy": float(rms_energy)  # NEW: Store RMS energy
+                }
+                merged_segments.append(seg_dict)
+
+                if spk not in speaker_segments:
+                    speaker_segments[spk] = []
+                speaker_segments[spk].append(seg_dict)
+            else:
+                print(f"🔇 Empty transcription for segment {start:.2f}-{end:.2f}")

         return merged_segments, list(speaker_segments.keys())

     def run_pipeline(self, audio_path, output_dir=None, base_name=None,
-                     ref_rttm=None, ref_json=None):
+                     ref_rttm=None, ref_json=None, nse_events=None):  # NEW: nse_events parameter
+        """FIXED: Add input validation and proper RTTM format"""
+        # NEW: Validate input audio file
+        if not os.path.exists(audio_path):
+            raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+        try:
+            # NEW: Quick validation that it's loadable audio
+            audio, sr = torchaudio.load(audio_path)
+            if audio.numel() == 0:
+                raise ValueError("Audio file is empty")
+        except Exception as e:
+            raise ValueError(f"Invalid audio file: {e}")
+
+        print(f"🔊 Processing with VAD: {'ON' if self.use_vad else 'OFF'}")
+
+        # Run diarization and transcription
         diar_json = self.run_diarization(audio_path)
         merged_segments, speakers = self.run_transcription(audio_path, diar_json)

+        # NEW: Combine ASR segments with NSE events if provided
+        if nse_events:
+            print(f"🔊 Combining {len(merged_segments)} ASR segments with {len(nse_events)} NSE events")
+            all_segments = merged_segments + nse_events
+            # Sort by start time for proper timeline
+            all_segments.sort(key=lambda x: x["start"])
+        else:
+            all_segments = merged_segments
+
         if output_dir and base_name:
             os.makedirs(output_dir, exist_ok=True)

-            # Save RTTM
+            # FIX: Save RTTM with standard format and precision
             rttm_path = os.path.join(output_dir, f"{base_name}.rttm")
             with open(rttm_path, "w") as f:
                 for seg in diar_json:
                     f.write(
-                        f"SPEAKER {base_name} 1 {seg['start']:.6f} "
-                        f"{seg['end']-seg['start']:.6f} <NA> <NA> "
-                        f"{seg['speaker']} <NA>\n"
+                        f"SPEAKER {base_name} 1 {seg['start']:.3f} "
+                        f"{seg['end']-seg['start']:.3f} <NA> <NA> "
+                        f"{seg['speaker']} <NA> <NA>\n"  # FIX: Standard 9 fields
                     )

-            # Save transcription
+            # Save transcription (with NSE events if available)
             merged_path = os.path.join(output_dir, f"{base_name}_merged_transcription.json")
             with open(merged_path, "w") as f:
-                json.dump(merged_segments, f, indent=2)
+                json.dump(all_segments, f, indent=2)

-        # --- evaluation if refs are provided ---
+        # Evaluation if refs are provided
         eval_results = None
         if ref_rttm or ref_json:
             eval_results = self.evaluate(output_dir, base_name,
@@ -118,17 +351,20 @@ class ASR_Diarization:

         return {
             "speakers": speakers,
-            "segments": merged_segments,
+            "segments": all_segments,  # Return combined segments
             "evaluation": eval_results
         }

     def evaluate(self, output_dir, base_name, ref_rttm=None, ref_json=None):
-        results = {}
+        # FIX: Add output_dir validation
+        if not output_dir or not base_name:
+            return None

+        results = {}
         hyp_rttm = os.path.join(output_dir, f"{base_name}.rttm")
         hyp_json = os.path.join(output_dir, f"{base_name}_merged_transcription.json")

-        if ref_rttm:
+        if ref_rttm and os.path.exists(hyp_rttm):
             def load_rttm(path):
                 ann = Annotation()
                 for line in open(path):
@@ -141,10 +377,12 @@ class ASR_Diarization:
             der_score = DiarizationErrorRate()(load_rttm(ref_rttm), load_rttm(hyp_rttm))
             results["DER"] = round(der_score * 100, 2)

-        if ref_json:
+        if ref_json and os.path.exists(hyp_json):
             def load_words(path):
                 data = json.load(open(path))
-                return " ".join([tok["text"] for seg in data for tok in seg["tokens"]])
+                # NEW: Filter out NSE events for WER calculation (only use speech)
+                speech_segments = [seg for seg in data if seg.get("speaker") != "NSE"]
+                return " ".join([tok["text"] for seg in speech_segments for tok in seg["tokens"]])

             ref_text, hyp_text = load_words(ref_json), load_words(hyp_json)
             transform = Compose([ToLowerCase(), RemovePunctuation(),
@@ -154,7 +392,8 @@ class ASR_Diarization:

         return results if results else None

-    def __call__(self, inputs):
+    def __call__(self, inputs, nse_events=None):  # NEW: nse_events parameter
+        """FIXED: Add proper temporary file cleanup"""
         if isinstance(inputs, dict):
             if "audio_bytes" in inputs:
                 audio_bytes = inputs["audio_bytes"]
@@ -165,8 +404,17 @@ class ASR_Diarization:
         else:
             audio_bytes = inputs

-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-            tmp.write(audio_bytes)
-            tmp_path = tmp.name
+        tmp_path = None
+        try:
+            # Create temporary file for processing
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+                tmp.write(audio_bytes)
+                tmp_path = tmp.name

-        return self.run_pipeline(tmp_path)
+            # Run pipeline with NSE events
+            result = self.run_pipeline(tmp_path, nse_events=nse_events)
+            return result
+        finally:
+            # FIX: Always clean up temporary file
+            if tmp_path and os.path.exists(tmp_path):
+                os.unlink(tmp_path)
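For reviewers who want to exercise the updated interface, a minimal usage sketch follows. It is not part of this commit: the import path, the HF_TOKEN environment variable, the sample file names, and the exact shape of the NSE (non-speech event) dictionaries are assumptions inferred from how run_pipeline() sorts nse_events by "start" and how evaluate() skips the "NSE" speaker when computing WER.

# Hypothetical driver script -- not part of this commit.
# Assumptions: the module import path, the HF_TOKEN environment variable,
# the sample file names, and the NSE dictionary layout.
import os
from asr_diarization.pipeline import ASR_Diarization

asr = ASR_Diarization(
    HF_TOKEN=os.environ["HF_TOKEN"],   # pyannote/speaker-diarization-3.1 is a gated model
    model_path=None,                   # None -> fall back to the hosted asr_model checkpoint
    use_vad=True,                      # WebRTC VAD filtering is applied only when SNR < snr_threshold
    vad_threshold=0.3,
    snr_threshold=15.0,
    min_whisper_duration=0.3,
)

# Path-based entry point: writes <base_name>.rttm and <base_name>_merged_transcription.json.
result = asr.run_pipeline("meeting.wav", output_dir="outputs", base_name="meeting")

# Bytes-based entry point with optional non-speech events merged into the timeline.
with open("meeting.wav", "rb") as f:
    audio_bytes = f.read()

nse_events = [  # hypothetical NSE entries; "start" is what run_pipeline() sorts on
    {"speaker": "NSE", "start": 12.4, "end": 13.1, "text": "[door slam]", "tokens": []},
]
result = asr({"audio_bytes": audio_bytes}, nse_events=nse_events)

print(result["speakers"])
for seg in result["segments"]:
    print(f'{seg["speaker"]} [{seg["start"]:.2f}-{seg["end"]:.2f}]: {seg.get("text", "")}')

Because __call__ always unlinks the temporary .wav in its finally block, the bytes-based entry point leaves no files behind; use run_pipeline() with output_dir and base_name when the RTTM and merged JSON artifacts need to be kept for evaluate().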