Capstone04
/

Bootstrapping

@@ -150,14 +150,14 @@ class ASR_Diarization:
             for t, _, spk in diarization.itertracks(yield_label=True)
         ]
-        print(f"🎯 Diarization found {len(diar_segments)} segments")
         # Step 2: Calculate SNR for adaptive processing
         snr = self.calculate_snr(audio_path)
         # Step 3: Apply VAD filtering ONLY if low SNR
         if snr < self.snr_threshold and self.use_vad:
-            print(f"🔇 Low SNR ({snr:.1f} dB), applying VAD filtering")
             filtered_segments = []
             for seg in diar_segments:
@@ -172,7 +172,7 @@ class ASR_Diarization:
                 if speech_ratio >= self.vad_threshold:
                     filtered_segments.append(seg)
                 else:
-                    print(f"🔇 Filtered low-speech segment: {seg['start']:.2f}-{seg['end']:.2f} (speech: {speech_ratio:.1%})")
             diar_segments = filtered_segments
         else:
@@ -233,11 +233,11 @@ class ASR_Diarization:
                     # Different speaker or large gap - keep as separate segment
                     merged_segments.append(seg)
-        print(f"🔀 Reduced {len(segments)} segments to {len(merged_segments)} while preserving order")
         return merged_segments
     def run_transcription(self, audio_path, diar_json):
-        """SIMPLIFIED: Segment-level transcription without word timestamps"""
         # Load and standardize audio
         audio, sr = torchaudio.load(audio_path)
@@ -259,7 +259,7 @@ class ASR_Diarization:
             # Skip segments that are too short for Whisper
             segment_duration = end - start
             if segment_duration < self.min_whisper_duration:
-                print(f"⏩ Skipping short segment for Whisper: {start:.2f}-{end:.2f} ({segment_duration:.2f}s)")
                 continue
             start_sample, end_sample = int(start * sr), int(end * sr)
@@ -285,7 +285,7 @@ class ASR_Diarization:
                 reduced = chunk
             try:
-                # SIMPLIFIED: Get text without timestamps
                 result = self.asr_pipeline(
                     reduced,
                     generate_kwargs={
@@ -295,7 +295,7 @@ class ASR_Diarization:
                     }
                 )
             except Exception as e:
-                print(f"⚠️ Whisper failed on segment {start:.2f}-{end:.2f}: {e}")
                 continue
             # Extract just the text (no timestamp processing)
@@ -338,7 +338,7 @@ class ASR_Diarization:
         diar_json = self.run_diarization(audio_path)
         merged_segments, speakers = self.run_transcription(audio_path, diar_json)
-        # NEW: Merge consecutive segments by same speaker
         merged_segments = self.merge_consecutive_speaker_segments(merged_segments)
         # Map speaker labels to match original format (A, B, C, D)
@@ -411,7 +411,7 @@ class ASR_Diarization:
                 data = json.load(open(path))
                 # Filter out NSE events for WER calculation (only use speech)
                 speech_segments = [seg for seg in data if seg.get("speaker") != "NSE"]
-                # NEW: Directly use segment text instead of tokens
                 return " ".join([seg["text"] for seg in speech_segments])
             def load_words_from_reference(path):

             for t, _, spk in diarization.itertracks(yield_label=True)
         ]
+        print(f"Diarization found {len(diar_segments)} segments")
         # Step 2: Calculate SNR for adaptive processing
         snr = self.calculate_snr(audio_path)
         # Step 3: Apply VAD filtering ONLY if low SNR
         if snr < self.snr_threshold and self.use_vad:
+            print(f"Low SNR ({snr:.1f} dB), applying VAD filtering")
             filtered_segments = []
             for seg in diar_segments:
                 if speech_ratio >= self.vad_threshold:
                     filtered_segments.append(seg)
                 else:
+                    print(f"Filtered low-speech segment: {seg['start']:.2f}-{seg['end']:.2f} (speech: {speech_ratio:.1%})")
             diar_segments = filtered_segments
         else:
                     # Different speaker or large gap - keep as separate segment
                     merged_segments.append(seg)
+        print(f"Reduced {len(segments)} segments to {len(merged_segments)} while preserving order")
         return merged_segments
     def run_transcription(self, audio_path, diar_json):
+        """Segment-level transcription without word timestamps"""
         # Load and standardize audio
         audio, sr = torchaudio.load(audio_path)
             # Skip segments that are too short for Whisper
             segment_duration = end - start
             if segment_duration < self.min_whisper_duration:
+                print(f"Skipping short segment for Whisper: {start:.2f}-{end:.2f} ({segment_duration:.2f}s)")
                 continue
             start_sample, end_sample = int(start * sr), int(end * sr)
                 reduced = chunk
             try:
+                # Get text without timestamps
                 result = self.asr_pipeline(
                     reduced,
                     generate_kwargs={
                     }
                 )
             except Exception as e:
+                print(f"Whisper failed on segment {start:.2f}-{end:.2f}: {e}")
                 continue
             # Extract just the text (no timestamp processing)
         diar_json = self.run_diarization(audio_path)
         merged_segments, speakers = self.run_transcription(audio_path, diar_json)
+        # Merge consecutive segments by same speaker
         merged_segments = self.merge_consecutive_speaker_segments(merged_segments)
         # Map speaker labels to match original format (A, B, C, D)
                 data = json.load(open(path))
                 # Filter out NSE events for WER calculation (only use speech)
                 speech_segments = [seg for seg in data if seg.get("speaker") != "NSE"]
+                # Directly use segment text instead of tokens
                 return " ".join([seg["text"] for seg in speech_segments])
             def load_words_from_reference(path):