Spaces:

ai-coustics
/

VoiceFocus

Running on CPU Upgrade

App Files Files Community

mariesig committed on Mar 18

Commit

a7c506c

1 Parent(s): 6606020

VAD in spectrogram

Browse files

Files changed (2) hide show

offline_pipeline.py +3 -3
utils.py +122 -54

offline_pipeline.py CHANGED Viewed

@@ -122,7 +122,7 @@ def _process_audio_chunks(
         loop_progress = (i + original_chunk_len) / n if n > 0 else 1.0
         _safe_progress(
             progress,
-            0.20 + 0.60 * loop_progress,
             "Enhancing audio...",
         )
@@ -189,9 +189,9 @@ def run_offline_pipeline(
         progress=progress,
     )
-    _safe_progress(progress, 0.82, "Finalizing transcripts...")
     noisy_transcript = _finalize_stream_transcript(streamer_noisy)
-    _safe_progress(progress, 0.88, "Finalizing transcripts...")
     enhanced_transcript = _finalize_stream_transcript(streamer_enhanced)
     _safe_progress(progress, 0.94, "Loading reference transcript...")

         loop_progress = (i + original_chunk_len) / n if n > 0 else 1.0
         _safe_progress(
             progress,
+            0.20 + 0.50 * loop_progress,
             "Enhancing audio...",
         )
         progress=progress,
     )
+    _safe_progress(progress, 0.72, "Finalizing transcripts...")
     noisy_transcript = _finalize_stream_transcript(streamer_noisy)
+    _safe_progress(progress, 0.80, "Finalizing transcripts...")
     enhanced_transcript = _finalize_stream_transcript(streamer_enhanced)
     _safe_progress(progress, 0.94, "Loading reference transcript...")

utils.py CHANGED Viewed

@@ -1,59 +1,86 @@
 from typing import Optional
-import numpy as np
-import librosa
-from PIL import Image
 import io
 import matplotlib.pyplot as plt
-from constants import TARGET_LOUDNESS, TARGET_TP, VAD_OFF, VAD_ON
 import pyloudnorm as pyln
-import warnings
 def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
     subtitles = []
     cur = 0.0
     for start, end in vad_timestamps:
         if start > cur:
-            subtitles.append({
-                "text": f"Voice Detection: {VAD_OFF}",
-                "timestamp": [cur, start]
-            })
-        subtitles.append({
-            "text": f"Voice Detection: {VAD_ON}",
-            "timestamp": [start, end]
-        })
         cur = end
     if cur < length:
-        subtitles.append({
-            "text": f"Voice Detection: {VAD_OFF}",
-            "timestamp": [cur, length]
-        })
     return subtitles
 def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
-    """Return (sample_rate, int16 mono array) for Gradio Audio. Gradio expects int16;
-    passing float32 triggers an internal conversion and a warning."""
     x = np.asarray(x)
-    # Remove extra dims like (1, n, 1) etc.
     x = np.squeeze(x)
-    # If it's (channels, samples), transpose to (samples, channels)
     if x.ndim == 2 and x.shape[0] in (1, 2) and x.shape[1] > x.shape[0]:
         x = x.T
-    # Ensure mono is (n_samples,)
     if x.ndim == 2 and x.shape[1] == 1:
         x = x[:, 0]
     x = x.astype(np.float32)
     x = np.clip(x, -1.0, 1.0)
-    # Gradio Audio expects int16; convert here so Gradio doesn't convert and warn
     x = (x * 32767).astype(np.int16)
-    return (sr, x)
 def spec_image(
@@ -65,10 +92,8 @@ def spec_image(
     fmax: Optional[float] = None,
     vad_timestamps: Optional[list[list[float]]] = None,
 ) -> Image.Image:
-    """
-    Generate a mel-spectrogram image from an audio array.
-    """
-    y = audio_array.flatten()  # Ensure it's 1D
     S = librosa.feature.melspectrogram(
         y=y,
         sr=sr,
@@ -77,24 +102,65 @@ def spec_image(
         n_mels=n_mels,
         fmax=fmax or sr // 2,
     )
-    S_db = librosa.power_to_db(S, ref=np.max(S))
     fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
     img = librosa.display.specshow(
-        S_db, sr=sr, hop_length=hop_length, x_axis="time", y_axis="mel", ax=ax
     )
     cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
     cbar.set_label("dB")
     ax.set_title("Mel-spectrogram")
     ax.set_xlabel("Time in s")
     ax.set_ylabel("Frequency in Hz")
     fig.tight_layout(pad=0.2)
     buf = io.BytesIO()
     fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
-    if vad_timestamps:
-        for start, end in vad_timestamps:
-            ax.axvspan(start, end, color="red", alpha=0.3)
     plt.close(fig)
     buf.seek(0)
     return Image.open(buf).convert("RGB")
@@ -105,24 +171,24 @@ def compute_wer(reference: str, hypothesis: str) -> float:
     """
     ref_words = reference.split()
     hyp_words = hypothesis.split()
-    d = np.zeros((len(ref_words) + 1, len(hyp_words) + 1), dtype=np.uint8)
     for i in range(len(ref_words) + 1):
         d[i][0] = i
     for j in range(len(hyp_words) + 1):
         d[0][j] = j
     for i in range(1, len(ref_words) + 1):
         for j in range(1, len(hyp_words) + 1):
-            if ref_words[i - 1] == hyp_words[j - 1]:
-                cost = 0
-            else:
-                cost = 1
             d[i][j] = min(
-                d[i - 1][j] + 1,      # Deletion
-                d[i][j - 1] + 1,      # Insertion
-                d[i - 1][j - 1] + cost,  # Substitution
             )
-    wer = d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
-    return wer
 def measure_loudness(x: np.ndarray, sr: int) -> float:
@@ -130,7 +196,11 @@ def measure_loudness(x: np.ndarray, sr: int) -> float:
     return float(meter.integrated_loudness(x))
-def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP) -> np.ndarray:
     upsampled_sr = 192000
     x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
     true_peak = np.max(np.abs(x_upsampled))
@@ -144,7 +214,7 @@ def true_peak_limiter(x: np.ndarray, sr: int, max_true_peak: float = TARGET_TP)
     x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)
     x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])
-    return x_limited.astype("float32")
 def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
@@ -153,9 +223,9 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
     """
     try:
         current_lufs = measure_loudness(x, sr)
         if not np.isfinite(current_lufs):
-            return x.astype("float32")
         gain_db = TARGET_LOUDNESS - current_lufs
         gain = 10 ** (gain_db / 20)
@@ -163,9 +233,7 @@ def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
         y = x * gain
         y = true_peak_limiter(y, sr, max_true_peak=TARGET_TP)
-        return y.astype("float32")
     except Exception as e:
         warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
-        return x.astype("float32")

 from typing import Optional
 import io
+import warnings
+import librosa
+import librosa.display
 import matplotlib.pyplot as plt
+import numpy as np
 import pyloudnorm as pyln
+from matplotlib.patches import Patch
+from PIL import Image
+from constants import TARGET_LOUDNESS, TARGET_TP, VAD_OFF, VAD_ON
 def get_vad_labels(vad_timestamps: list[list[float]], length: float) -> list[dict]:
     subtitles = []
     cur = 0.0
     for start, end in vad_timestamps:
         if start > cur:
+            subtitles.append(
+                {
+                    "text": f"Voice Detection: {VAD_OFF}",
+                    "timestamp": [cur, start],
+                }
+            )
+        subtitles.append(
+            {
+                "text": f"Voice Detection: {VAD_ON}",
+                "timestamp": [start, end],
+            }
+        )
         cur = end
     if cur < length:
+        subtitles.append(
+            {
+                "text": f"Voice Detection: {VAD_OFF}",
+                "timestamp": [cur, length],
+            }
+        )
     return subtitles
 def to_gradio_audio(x: np.ndarray, sr: int) -> tuple[int, np.ndarray]:
+    """Return (sample_rate, int16 array) for Gradio Audio."""
     x = np.asarray(x)
     x = np.squeeze(x)
     if x.ndim == 2 and x.shape[0] in (1, 2) and x.shape[1] > x.shape[0]:
         x = x.T
     if x.ndim == 2 and x.shape[1] == 1:
         x = x[:, 0]
     x = x.astype(np.float32)
     x = np.clip(x, -1.0, 1.0)
     x = (x * 32767).astype(np.int16)
+    return sr, x
+def _merge_vad_segments(
+    vad_timestamps: list[list[float]],
+    gap_tolerance: float = 0.05,
+) -> list[tuple[float, float]]:
+    if not vad_timestamps:
+        return []
+    segments = sorted((float(start), float(end)) for start, end in vad_timestamps)
+    merged: list[tuple[float, float]] = [segments[0]]
+    for start, end in segments[1:]:
+        last_start, last_end = merged[-1]
+        if start <= last_end + gap_tolerance:
+            merged[-1] = (last_start, max(last_end, end))
+        else:
+            merged.append((start, end))
+    return merged
 def spec_image(
     fmax: Optional[float] = None,
     vad_timestamps: Optional[list[list[float]]] = None,
 ) -> Image.Image:
+    y = np.asarray(audio_array, dtype=np.float32).flatten()
     S = librosa.feature.melspectrogram(
         y=y,
         sr=sr,
         n_mels=n_mels,
         fmax=fmax or sr // 2,
     )
+    S_db = librosa.power_to_db(S, ref=np.max)
     fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
     img = librosa.display.specshow(
+        S_db,
+        sr=sr,
+        hop_length=hop_length,
+        x_axis="time",
+        y_axis="mel",
+        cmap="magma",
+        ax=ax,
     )
+    if vad_timestamps:
+        vad_color = "#22C55E"  # softer, cleaner green
+        merged_segments = _merge_vad_segments(vad_timestamps, gap_tolerance=0.05)
+        ymin, ymax = ax.get_ylim()
+        bar_height = (ymax - ymin) * 0.02
+        bar_bottom = ymin
+        for start, end in merged_segments:
+            ax.fill_between(
+                [start, end],
+                [bar_bottom, bar_bottom],
+                [bar_bottom + bar_height, bar_bottom + bar_height],
+                color=vad_color,
+                alpha=0.95,
+                linewidth=0,
+                zorder=5,
+            )
+        vad_patch = Patch(
+            facecolor=vad_color,
+            edgecolor=vad_color,
+            label="Voice Activity",
+        )
+        ax.legend(
+            handles=[vad_patch],
+            loc="upper right",
+            fontsize=8,
+            frameon=True,
+            framealpha=0.9,
+        )
     cbar = fig.colorbar(img, ax=ax, format="%+2.0f dB")
     cbar.set_label("dB")
     ax.set_title("Mel-spectrogram")
     ax.set_xlabel("Time in s")
     ax.set_ylabel("Frequency in Hz")
     fig.tight_layout(pad=0.2)
     buf = io.BytesIO()
     fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
     plt.close(fig)
     buf.seek(0)
     return Image.open(buf).convert("RGB")
     """
     ref_words = reference.split()
     hyp_words = hypothesis.split()
+    d = np.zeros((len(ref_words) + 1, len(hyp_words) + 1), dtype=np.uint16)
     for i in range(len(ref_words) + 1):
         d[i][0] = i
     for j in range(len(hyp_words) + 1):
         d[0][j] = j
     for i in range(1, len(ref_words) + 1):
         for j in range(1, len(hyp_words) + 1):
+            cost = 0 if ref_words[i - 1] == hyp_words[j - 1] else 1
             d[i][j] = min(
+                d[i - 1][j] + 1,
+                d[i][j - 1] + 1,
+                d[i - 1][j - 1] + cost,
             )
+    return d[len(ref_words)][len(hyp_words)] / max(len(ref_words), 1)
 def measure_loudness(x: np.ndarray, sr: int) -> float:
     return float(meter.integrated_loudness(x))
+def true_peak_limiter(
+    x: np.ndarray,
+    sr: int,
+    max_true_peak: float = TARGET_TP,
+) -> np.ndarray:
     upsampled_sr = 192000
     x_upsampled = librosa.resample(x, orig_sr=sr, target_sr=upsampled_sr)
     true_peak = np.max(np.abs(x_upsampled))
     x_limited = librosa.resample(x_upsampled, orig_sr=upsampled_sr, target_sr=sr)
     x_limited = librosa.util.fix_length(x_limited, size=x.shape[-1])
+    return x_limited.astype(np.float32)
 def normalize_lufs(x: np.ndarray, sr: int) -> np.ndarray:
     """
     try:
         current_lufs = measure_loudness(x, sr)
         if not np.isfinite(current_lufs):
+            return x.astype(np.float32)
         gain_db = TARGET_LOUDNESS - current_lufs
         gain = 10 ** (gain_db / 20)
         y = x * gain
         y = true_peak_limiter(y, sr, max_true_peak=TARGET_TP)
+        return y.astype(np.float32)
     except Exception as e:
         warnings.warn(f"LUFS normalization failed, returning input unchanged: {e}")
+        return x.astype(np.float32)