Spaces:
Running
Running
space: add residual-echo gate UI + int16 audio output
Browse files
app.py
CHANGED
|
@@ -74,7 +74,30 @@ def _load_mono_16k(path: str) -> np.ndarray:
|
|
| 74 |
return wav
|
| 75 |
|
| 76 |
|
| 77 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
if mic_path is None:
|
| 79 |
raise gr.Error("Upload or pick a mic recording first.")
|
| 80 |
|
|
@@ -101,6 +124,11 @@ def enhance(mic_path: str, ref_path: str) -> tuple[int, np.ndarray]:
|
|
| 101 |
peak = float(np.abs(out).max())
|
| 102 |
if peak > 0.95:
|
| 103 |
out = out / peak * 0.95
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
# Convert to int16 ourselves: Gradio's gr.Audio output otherwise
|
| 105 |
# peak-normalises float arrays via convert_to_16_bit_wav (data /=
|
| 106 |
# np.abs(data).max(); * 32767), which amplifies the cancelled-echo
|
|
@@ -132,6 +160,18 @@ EXAMPLES = [
|
|
| 132 |
str(EXAMPLES_DIR / "dt_mic.wav"),
|
| 133 |
str(EXAMPLES_DIR / "dt_ref.wav"),
|
| 134 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
]
|
| 136 |
|
| 137 |
DESCRIPTION = """
|
|
@@ -165,6 +205,21 @@ with gr.Blocks(title="LocalVQE Demo") as demo:
|
|
| 165 |
with gr.Row():
|
| 166 |
mic_in = gr.Audio(label="Mic (microphone recording)", type="filepath")
|
| 167 |
ref_in = gr.Audio(label="Far-end reference (speaker playback)", type="filepath")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
btn = gr.Button("Enhance", variant="primary")
|
| 169 |
out = gr.Audio(label="Enhanced output", type="numpy")
|
| 170 |
|
|
@@ -176,11 +231,17 @@ with gr.Blocks(title="LocalVQE Demo") as demo:
|
|
| 176 |
"pure NS), near-end + light noise (20 dB SNR, NS preserving "
|
| 177 |
"clean speech), far-end single-talk (pure AEC), far-end with "
|
| 178 |
"brief near-end overlap (AEC while preserving NE), double-talk "
|
| 179 |
-
"(AEC while near-end is also talking)
|
|
|
|
|
|
|
| 180 |
),
|
| 181 |
)
|
| 182 |
|
| 183 |
-
btn.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
gr.Markdown(
|
| 186 |
f"<sub>Loaded: <code>{MODEL_INFO['source']}</code> · "
|
|
@@ -189,4 +250,4 @@ with gr.Blocks(title="LocalVQE Demo") as demo:
|
|
| 189 |
)
|
| 190 |
|
| 191 |
if __name__ == "__main__":
|
| 192 |
-
demo.launch()
|
|
|
|
| 74 |
return wav
|
| 75 |
|
| 76 |
|
| 77 |
+
def _noise_gate(x: np.ndarray, threshold_dbfs: float) -> np.ndarray:
    """Zero every whole frame whose RMS level is not above the threshold.

    The input is cut into consecutive 160-sample frames (10 ms when the
    signal is sampled at 16 kHz). Each frame's RMS is converted to dB
    and compared with `threshold_dbfs`; frames that are not strictly
    above it are multiplied by zero, all other frames are kept.

    Args:
        x: Sample array, indexed along its first axis.
            NOTE(review): levels are computed on the raw sample values,
            so the threshold only reads as dBFS if full scale is 1.0 —
            confirm against the caller.
        threshold_dbfs: Gate threshold in dB.

    Returns:
        An array with the same length and dtype as `x`. Samples after
        the last whole frame are appended without gating. When `x` is
        shorter than one frame, `x` itself is returned unchanged.
    """
    frame_len = 160
    whole_frames = len(x) // frame_len
    if whole_frames == 0:
        return x

    # Split into the part that fills whole frames and the leftover tail.
    split = whole_frames * frame_len
    head, tail = x[:split], x[split:]
    frames = head.reshape(whole_frames, frame_len).astype(np.float32)

    # The small offsets keep sqrt/log10 finite for all-zero frames.
    mean_square = np.mean(frames * frames, axis=-1)
    level_db = 20.0 * np.log10(np.sqrt(mean_square + 1e-12) + 1e-12)

    # 1.0 for frames to keep, 0.0 for frames to silence.
    mask = (level_db > threshold_dbfs).astype(np.float32)
    gated = (frames * mask[:, np.newaxis]).reshape(-1)
    return np.concatenate([gated, tail]).astype(x.dtype)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def enhance(mic_path: str, ref_path: str,
|
| 99 |
+
gate_enabled: bool = False,
|
| 100 |
+
gate_threshold_db: float = -45.0) -> tuple[int, np.ndarray]:
|
| 101 |
if mic_path is None:
|
| 102 |
raise gr.Error("Upload or pick a mic recording first.")
|
| 103 |
|
|
|
|
| 124 |
peak = float(np.abs(out).max())
|
| 125 |
if peak > 0.95:
|
| 126 |
out = out / peak * 0.95
|
| 127 |
+
# Optional residual-echo gate: silence frames whose RMS sits below
|
| 128 |
+
# `gate_threshold_db` dBFS. Off by default so listeners can A/B
|
| 129 |
+
# against the raw model output via the slider.
|
| 130 |
+
if gate_enabled:
|
| 131 |
+
out = _noise_gate(out, gate_threshold_db)
|
| 132 |
# Convert to int16 ourselves: Gradio's gr.Audio output otherwise
|
| 133 |
# peak-normalises float arrays via convert_to_16_bit_wav (data /=
|
| 134 |
# np.abs(data).max(); * 32767), which amplifies the cancelled-echo
|
|
|
|
| 160 |
str(EXAMPLES_DIR / "dt_mic.wav"),
|
| 161 |
str(EXAMPLES_DIR / "dt_ref.wav"),
|
| 162 |
],
|
| 163 |
+
[
|
| 164 |
+
str(EXAMPLES_DIR / "dt_silenced1_mic.wav"),
|
| 165 |
+
str(EXAMPLES_DIR / "dt_silenced1_ref.wav"),
|
| 166 |
+
],
|
| 167 |
+
[
|
| 168 |
+
str(EXAMPLES_DIR / "dt_silenced2_mic.wav"),
|
| 169 |
+
str(EXAMPLES_DIR / "dt_silenced2_ref.wav"),
|
| 170 |
+
],
|
| 171 |
+
[
|
| 172 |
+
str(EXAMPLES_DIR / "dt_silenced3_mic.wav"),
|
| 173 |
+
str(EXAMPLES_DIR / "dt_silenced3_ref.wav"),
|
| 174 |
+
],
|
| 175 |
]
|
| 176 |
|
| 177 |
DESCRIPTION = """
|
|
|
|
| 205 |
with gr.Row():
|
| 206 |
mic_in = gr.Audio(label="Mic (microphone recording)", type="filepath")
|
| 207 |
ref_in = gr.Audio(label="Far-end reference (speaker playback)", type="filepath")
|
| 208 |
+
with gr.Row():
|
| 209 |
+
gate_enabled = gr.Checkbox(
|
| 210 |
+
label="Residual-echo gate",
|
| 211 |
+
value=False,
|
| 212 |
+
info=(
|
| 213 |
+
"Post-process the enhanced output: silence any 10 ms frame "
|
| 214 |
+
"whose RMS falls below the threshold. Cleans up the quiet "
|
| 215 |
+
"residual you'd hear during far-end-only stretches; will "
|
| 216 |
+
"also mute genuinely quiet speech below the threshold."
|
| 217 |
+
),
|
| 218 |
+
)
|
| 219 |
+
gate_threshold_db = gr.Slider(
|
| 220 |
+
label="Gate threshold (dBFS)",
|
| 221 |
+
minimum=-70.0, maximum=-20.0, value=-45.0, step=1.0,
|
| 222 |
+
)
|
| 223 |
btn = gr.Button("Enhance", variant="primary")
|
| 224 |
out = gr.Audio(label="Enhanced output", type="numpy")
|
| 225 |
|
|
|
|
| 231 |
"pure NS), near-end + light noise (20 dB SNR, NS preserving "
|
| 232 |
"clean speech), far-end single-talk (pure AEC), far-end with "
|
| 233 |
"brief near-end overlap (AEC while preserving NE), double-talk "
|
| 234 |
+
"(AEC while near-end is also talking), then three DT clips "
|
| 235 |
+
"where the model currently silences the entire output "
|
| 236 |
+
"below the noise floor (target for ongoing training)."
|
| 237 |
),
|
| 238 |
)
|
| 239 |
|
| 240 |
+
btn.click(
|
| 241 |
+
enhance,
|
| 242 |
+
inputs=[mic_in, ref_in, gate_enabled, gate_threshold_db],
|
| 243 |
+
outputs=out,
|
| 244 |
+
)
|
| 245 |
|
| 246 |
gr.Markdown(
|
| 247 |
f"<sub>Loaded: <code>{MODEL_INFO['source']}</code> · "
|
|
|
|
| 250 |
)
|
| 251 |
|
| 252 |
if __name__ == "__main__":
|
| 253 |
+
demo.launch(server_name=os.environ.get("GRADIO_SERVER_NAME", "127.0.0.1"))
|