richiejp committed on
Commit
9264232
·
verified ·
1 Parent(s): 50954ed

space: add residual-echo gate UI + int16 audio output

Browse files
Files changed (1) hide show
  1. app.py +65 -4
app.py CHANGED
@@ -74,7 +74,30 @@ def _load_mono_16k(path: str) -> np.ndarray:
74
  return wav
75
 
76
 
77
- def enhance(mic_path: str, ref_path: str) -> tuple[int, np.ndarray]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  if mic_path is None:
79
  raise gr.Error("Upload or pick a mic recording first.")
80
 
@@ -101,6 +124,11 @@ def enhance(mic_path: str, ref_path: str) -> tuple[int, np.ndarray]:
101
  peak = float(np.abs(out).max())
102
  if peak > 0.95:
103
  out = out / peak * 0.95
 
 
 
 
 
104
  # Convert to int16 ourselves: Gradio's gr.Audio output otherwise
105
  # peak-normalises float arrays via convert_to_16_bit_wav (data /=
106
  # np.abs(data).max(); * 32767), which amplifies the cancelled-echo
@@ -132,6 +160,18 @@ EXAMPLES = [
132
  str(EXAMPLES_DIR / "dt_mic.wav"),
133
  str(EXAMPLES_DIR / "dt_ref.wav"),
134
  ],
 
 
 
 
 
 
 
 
 
 
 
 
135
  ]
136
 
137
  DESCRIPTION = """
@@ -165,6 +205,21 @@ with gr.Blocks(title="LocalVQE Demo") as demo:
165
  with gr.Row():
166
  mic_in = gr.Audio(label="Mic (microphone recording)", type="filepath")
167
  ref_in = gr.Audio(label="Far-end reference (speaker playback)", type="filepath")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  btn = gr.Button("Enhance", variant="primary")
169
  out = gr.Audio(label="Enhanced output", type="numpy")
170
 
@@ -176,11 +231,17 @@ with gr.Blocks(title="LocalVQE Demo") as demo:
176
  "pure NS), near-end + light noise (20 dB SNR, NS preserving "
177
  "clean speech), far-end single-talk (pure AEC), far-end with "
178
  "brief near-end overlap (AEC while preserving NE), double-talk "
179
- "(AEC while near-end is also talking)."
 
 
180
  ),
181
  )
182
 
183
- btn.click(enhance, inputs=[mic_in, ref_in], outputs=out)
 
 
 
 
184
 
185
  gr.Markdown(
186
  f"<sub>Loaded: <code>{MODEL_INFO['source']}</code> · "
@@ -189,4 +250,4 @@ with gr.Blocks(title="LocalVQE Demo") as demo:
189
  )
190
 
191
  if __name__ == "__main__":
192
- demo.launch()
 
74
  return wav
75
 
76
 
77
def _noise_gate(x: np.ndarray, threshold_dbfs: float) -> np.ndarray:
    """Hard-gate frames whose RMS does not exceed `threshold_dbfs` to zero.

    Operates on 10 ms frames (160 samples at 16 kHz) — short enough
    that speech bursts aren't truncated, long enough that a single
    out-of-band sample inside an active region doesn't get muted.
    Samples that don't fill a full final frame are passed through
    unchanged.

    Args:
        x: 1-D signal. Levels are computed as ``20 * log10(rms)`` with no
            extra scaling, so 0 dB corresponds to an RMS of 1.0.
        threshold_dbfs: Frames whose level is strictly greater than this
            value are kept; all other frames are silenced.

    Returns:
        ``x`` itself when it is shorter than one frame; otherwise a new
        array with the same dtype and length as ``x``. Kept frames carry
        the original sample values exactly.
    """
    frame = 160
    n = len(x) // frame
    if n == 0:
        return x
    framed = x[: n * frame].reshape(n, frame)
    # The float32 copy is used only to measure the level; the gate itself
    # is applied to the original samples so that kept frames are not
    # rounded through float32 when `x` has a wider dtype.
    f = framed.astype(np.float32)
    rms = np.sqrt((f * f).mean(axis=-1) + 1e-12)
    # The epsilons keep log10 finite for all-zero frames.
    rms_db = 20.0 * np.log10(rms + 1e-12)
    keep = rms_db > threshold_dbfs
    gated = (framed * keep[:, None]).reshape(-1)
    return np.concatenate([gated, x[n * frame:]]).astype(x.dtype)
96
+
97
+
98
+ def enhance(mic_path: str, ref_path: str,
99
+ gate_enabled: bool = False,
100
+ gate_threshold_db: float = -45.0) -> tuple[int, np.ndarray]:
101
  if mic_path is None:
102
  raise gr.Error("Upload or pick a mic recording first.")
103
 
 
124
  peak = float(np.abs(out).max())
125
  if peak > 0.95:
126
  out = out / peak * 0.95
127
+ # Optional residual-echo gate: silence frames whose RMS sits below
128
+ # `gate_threshold_db` dBFS. Off by default so listeners can A/B
129
+ # against the raw model output via the slider.
130
+ if gate_enabled:
131
+ out = _noise_gate(out, gate_threshold_db)
132
  # Convert to int16 ourselves: Gradio's gr.Audio output otherwise
133
  # peak-normalises float arrays via convert_to_16_bit_wav (data /=
134
  # np.abs(data).max(); * 32767), which amplifies the cancelled-echo
 
160
  str(EXAMPLES_DIR / "dt_mic.wav"),
161
  str(EXAMPLES_DIR / "dt_ref.wav"),
162
  ],
163
+ [
164
+ str(EXAMPLES_DIR / "dt_silenced1_mic.wav"),
165
+ str(EXAMPLES_DIR / "dt_silenced1_ref.wav"),
166
+ ],
167
+ [
168
+ str(EXAMPLES_DIR / "dt_silenced2_mic.wav"),
169
+ str(EXAMPLES_DIR / "dt_silenced2_ref.wav"),
170
+ ],
171
+ [
172
+ str(EXAMPLES_DIR / "dt_silenced3_mic.wav"),
173
+ str(EXAMPLES_DIR / "dt_silenced3_ref.wav"),
174
+ ],
175
  ]
176
 
177
  DESCRIPTION = """
 
205
  with gr.Row():
206
  mic_in = gr.Audio(label="Mic (microphone recording)", type="filepath")
207
  ref_in = gr.Audio(label="Far-end reference (speaker playback)", type="filepath")
208
+ with gr.Row():
209
+ gate_enabled = gr.Checkbox(
210
+ label="Residual-echo gate",
211
+ value=False,
212
+ info=(
213
+ "Post-process the enhanced output: silence any 10 ms frame "
214
+ "whose RMS falls below the threshold. Cleans up the quiet "
215
+ "residual you'd hear during far-end-only stretches; will "
216
+ "also mute genuinely quiet speech below the threshold."
217
+ ),
218
+ )
219
+ gate_threshold_db = gr.Slider(
220
+ label="Gate threshold (dBFS)",
221
+ minimum=-70.0, maximum=-20.0, value=-45.0, step=1.0,
222
+ )
223
  btn = gr.Button("Enhance", variant="primary")
224
  out = gr.Audio(label="Enhanced output", type="numpy")
225
 
 
231
  "pure NS), near-end + light noise (20 dB SNR, NS preserving "
232
  "clean speech), far-end single-talk (pure AEC), far-end with "
233
  "brief near-end overlap (AEC while preserving NE), double-talk "
234
+ "(AEC while near-end is also talking), then three DT clips "
235
+ "where the model currently silences the entire output "
236
+ "below the noise floor (target for ongoing training)."
237
  ),
238
  )
239
 
240
+ btn.click(
241
+ enhance,
242
+ inputs=[mic_in, ref_in, gate_enabled, gate_threshold_db],
243
+ outputs=out,
244
+ )
245
 
246
  gr.Markdown(
247
  f"<sub>Loaded: <code>{MODEL_INFO['source']}</code> · "
 
250
  )
251
 
252
  if __name__ == "__main__":
253
+ demo.launch(server_name=os.environ.get("GRADIO_SERVER_NAME", "127.0.0.1"))