Spaces:

LocalAI-io
/

LocalVQE-demo

Running

App Files Files Community

richiejp committed 26 days ago

Commit

9264232

verified ·

1 Parent(s): 50954ed

space: add residual-echo gate UI + int16 audio output

Browse files

Files changed (1) hide show

app.py +65 -4

app.py CHANGED Viewed

@@ -74,7 +74,30 @@ def _load_mono_16k(path: str) -> np.ndarray:
     return wav
-def enhance(mic_path: str, ref_path: str) -> tuple[int, np.ndarray]:
     if mic_path is None:
         raise gr.Error("Upload or pick a mic recording first.")
@@ -101,6 +124,11 @@ def enhance(mic_path: str, ref_path: str) -> tuple[int, np.ndarray]:
     peak = float(np.abs(out).max())
     if peak > 0.95:
         out = out / peak * 0.95
     # Convert to int16 ourselves: Gradio's gr.Audio output otherwise
     # peak-normalises float arrays via convert_to_16_bit_wav (data /=
     # np.abs(data).max(); * 32767), which amplifies the cancelled-echo
@@ -132,6 +160,18 @@ EXAMPLES = [
         str(EXAMPLES_DIR / "dt_mic.wav"),
         str(EXAMPLES_DIR / "dt_ref.wav"),
     ],
 ]
 DESCRIPTION = """
@@ -165,6 +205,21 @@ with gr.Blocks(title="LocalVQE Demo") as demo:
     with gr.Row():
         mic_in = gr.Audio(label="Mic (microphone recording)", type="filepath")
         ref_in = gr.Audio(label="Far-end reference (speaker playback)", type="filepath")
     btn = gr.Button("Enhance", variant="primary")
     out = gr.Audio(label="Enhanced output", type="numpy")
@@ -176,11 +231,17 @@ with gr.Blocks(title="LocalVQE Demo") as demo:
             "pure NS), near-end + light noise (20 dB SNR, NS preserving "
             "clean speech), far-end single-talk (pure AEC), far-end with "
             "brief near-end overlap (AEC while preserving NE), double-talk "
-            "(AEC while near-end is also talking)."
         ),
     )
-    btn.click(enhance, inputs=[mic_in, ref_in], outputs=out)
     gr.Markdown(
         f"<sub>Loaded: <code>{MODEL_INFO['source']}</code> · "
@@ -189,4 +250,4 @@ with gr.Blocks(title="LocalVQE Demo") as demo:
     )
 if __name__ == "__main__":
-    demo.launch()

     return wav
+def _noise_gate(x: np.ndarray, threshold_dbfs: float) -> np.ndarray:
+    """Hard-gate frames whose RMS is below `threshold_dbfs` to zero.
+
+    Operates on 10 ms frames (160 samples at 16 kHz) — short enough
+    that speech bursts aren't truncated, long enough that a single
+    out-of-band sample inside an active region doesn't get muted.
+    The ungated tail (samples that don't fill a full final frame) is
+    passed through unchanged.
+
+    Args:
+        x: 1-D array of audio samples (the reshape to (n, 160) relies
+            on this). The level is computed from the raw sample
+            values, so `threshold_dbfs` is a true dBFS figure only if
+            full scale is 1.0 (assumption; confirm against the caller).
+        threshold_dbfs: Level in dB. A frame is kept only when its RMS
+            level is strictly above this value.
+
+    Returns:
+        An array with the dtype of `x` in which every complete frame
+        at or below the threshold is zeroed. An input shorter than one
+        frame is returned as-is (the same object, not a copy).
+    """
+    frame = 160  # samples per 10 ms frame at 16 kHz
+    n = len(x) // frame  # number of complete frames
+    if n == 0:
+        return x
+    f = x[: n * frame].reshape(n, frame).astype(np.float32)  # one row per frame
+    rms = np.sqrt((f * f).mean(axis=-1) + 1e-12)  # epsilon keeps silent frames finite
+    rms_db = 20.0 * np.log10(rms + 1e-12)  # an all-zero frame lands near -120 dB
+    keep = (rms_db > threshold_dbfs).astype(np.float32)  # 1.0 = keep, 0.0 = mute
+    gated = (f * keep[:, None]).reshape(-1)  # broadcast the mask across each frame
+    return np.concatenate([gated, x[n * frame:]]).astype(x.dtype)  # re-attach ungated tail
+def enhance(mic_path: str, ref_path: str,
+            gate_enabled: bool = False,
+            gate_threshold_db: float = -45.0) -> tuple[int, np.ndarray]:
     if mic_path is None:
         raise gr.Error("Upload or pick a mic recording first.")
     peak = float(np.abs(out).max())
     if peak > 0.95:
         out = out / peak * 0.95
+    # Optional residual-echo gate: silence frames whose RMS sits below
+    # `gate_threshold_db` dBFS. Off by default so listeners can A/B
+    # against the raw model output by toggling the checkbox.
+    if gate_enabled:
+        out = _noise_gate(out, gate_threshold_db)
     # Convert to int16 ourselves: Gradio's gr.Audio output otherwise
     # peak-normalises float arrays via convert_to_16_bit_wav (data /=
     # np.abs(data).max(); * 32767), which amplifies the cancelled-echo
         str(EXAMPLES_DIR / "dt_mic.wav"),
         str(EXAMPLES_DIR / "dt_ref.wav"),
     ],
+    [
+        str(EXAMPLES_DIR / "dt_silenced1_mic.wav"),
+        str(EXAMPLES_DIR / "dt_silenced1_ref.wav"),
+    ],
+    [
+        str(EXAMPLES_DIR / "dt_silenced2_mic.wav"),
+        str(EXAMPLES_DIR / "dt_silenced2_ref.wav"),
+    ],
+    [
+        str(EXAMPLES_DIR / "dt_silenced3_mic.wav"),
+        str(EXAMPLES_DIR / "dt_silenced3_ref.wav"),
+    ],
 ]
 DESCRIPTION = """
     with gr.Row():
         mic_in = gr.Audio(label="Mic (microphone recording)", type="filepath")
         ref_in = gr.Audio(label="Far-end reference (speaker playback)", type="filepath")
+    with gr.Row():
+        gate_enabled = gr.Checkbox(
+            label="Residual-echo gate",
+            value=False,
+            info=(
+                "Post-process the enhanced output: silence any 10 ms frame "
+                "whose RMS falls below the threshold. Cleans up the quiet "
+                "residual you'd hear during far-end-only stretches; will "
+                "also mute genuinely quiet speech below the threshold."
+            ),
+        )
+        gate_threshold_db = gr.Slider(
+            label="Gate threshold (dBFS)",
+            minimum=-70.0, maximum=-20.0, value=-45.0, step=1.0,
+        )
     btn = gr.Button("Enhance", variant="primary")
     out = gr.Audio(label="Enhanced output", type="numpy")
             "pure NS), near-end + light noise (20 dB SNR, NS preserving "
             "clean speech), far-end single-talk (pure AEC), far-end with "
             "brief near-end overlap (AEC while preserving NE), double-talk "
+            "(AEC while near-end is also talking), then three DT clips "
+            "where the model currently silences the entire output "
+            "below the noise floor (target for ongoing training)."
         ),
     )
+    btn.click(
+        enhance,
+        inputs=[mic_in, ref_in, gate_enabled, gate_threshold_db],
+        outputs=out,
+    )
     gr.Markdown(
         f"<sub>Loaded: <code>{MODEL_INFO['source']}</code> · "
     )
 if __name__ == "__main__":
+    demo.launch(server_name=os.environ.get("GRADIO_SERVER_NAME", "127.0.0.1"))