Capstone04
/

Bootstrapping

Automatic Speech Recognition

speaker-diarization

Model card Files Files and versions

Capstone04 committed on Nov 16, 2025

Commit

e89eb31

·

verified ·

1 Parent(s): c300c20

Upload folder using huggingface_hub

Files changed (1) hide show

asr_diarization/pipeline.py +40 -2

asr_diarization/pipeline.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import json
 import torch
 import torchaudio
 import noisereduce as nr
@@ -65,6 +66,40 @@ class ASR_Diarization:
             return_timestamps=True
         )
     def calculate_snr(self, audio_path):
         """NEW: Calculate SNR using RMS energy"""
         try:
@@ -301,12 +336,15 @@ class ASR_Diarization:
             # Extract just the text (no timestamp processing)
             text = result.get("text", "").strip()
-            if text:
                 seg_dict = {
                     "speaker": spk,
                     "start": start,  # Keep segment boundaries
                     "end": end,      # Keep segment boundaries
-                    "text": text,    # Just the full segment text
                     "rms_energy": float(rms_energy)
                 }
                 merged_segments.append(seg_dict)

 import os
 import json
+import re
 import torch
 import torchaudio
 import noisereduce as nr
             return_timestamps=True
         )
+    def clean_transcription_text(self, text):
+        """Clean ASR text for better TTS performance"""
+        if not text:
+            return ""
+        # Basic cleaning
+        text = text.strip()
+        # Fix punctuation spacing for TTS
+        text = re.sub(r'\s+([.,!?;:])', r'\1', text)  # Remove space before punctuation
+        text = re.sub(r'([.,!?;:])(?=\w)', r'\1 ', text)  # Add space after punctuation
+        # Normalize whitespace
+        text = re.sub(r'\s+', ' ', text)
+        return text.strip()
+    def should_keep_segment(self, text, duration, rms_energy):
+        """Generalized segment quality assessment"""
+        # Duration too short
+        if duration < self.min_whisper_duration:
+            return False
+        # Energy too low (likely noise)
+        if rms_energy < 0.001:
+            return False
+        # Text too short or just punctuation
+        clean_text = text.strip()
+        if len(clean_text) <= 1:
+            return False
+        return True
     def calculate_snr(self, audio_path):
         """NEW: Calculate SNR using RMS energy"""
         try:
             # Extract just the text (no timestamp processing)
             text = result.get("text", "").strip()
+            # Clean the text for TTS and apply quality filtering
+            clean_text = self.clean_transcription_text(text)
+            if clean_text and self.should_keep_segment(clean_text, segment_duration, rms_energy):
                 seg_dict = {
                     "speaker": spk,
                     "start": start,  # Keep segment boundaries
                     "end": end,      # Keep segment boundaries
+                    "text": clean_text,    # Use cleaned text
                     "rms_energy": float(rms_energy)
                 }
                 merged_segments.append(seg_dict)