Diggz10 committed on
Commit
fea7d56
·
verified ·
1 Parent(s): b67ceda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -49
app.py CHANGED
@@ -1,9 +1,15 @@
 
 
 
 
 
 
1
  import io
2
  import os
3
  import tempfile
4
  from typing import Tuple, Optional
5
 
6
- # ---- tame noisy deprecation warnings (optional but nice) ----
7
  import warnings
8
  warnings.filterwarnings(
9
  "ignore",
@@ -20,8 +26,6 @@ import numpy as np
20
  import soundfile as sf
21
  import torch
22
  import torchaudio
23
- from fastapi import FastAPI, File, UploadFile, Query
24
- from fastapi.responses import StreamingResponse
25
 
26
  # ---- SpeechBrain import: prefer new API, fall back if older version ----
27
  try:
@@ -40,6 +44,7 @@ _DEVICE = "cpu"
40
 
41
 
42
  def _get_enhancer() -> SpectralMaskEnhancement:
 
43
  global _ENHANCER
44
  if _ENHANCER is None:
45
  _ENHANCER = SpectralMaskEnhancement.from_hparams(
@@ -59,7 +64,9 @@ def _to_mono(wav: np.ndarray) -> np.ndarray:
59
  return wav.astype(np.float32)
60
  # [T, C] or [C, T]
61
  if wav.shape[0] < wav.shape[1]:
 
62
  return wav.mean(axis=1).astype(np.float32)
 
63
  return wav.mean(axis=0).astype(np.float32)
64
 
65
 
@@ -85,6 +92,7 @@ def _presence_boost(wav: torch.Tensor, sr: int, gain_db: float) -> torch.Tensor:
85
 
86
 
87
  def _limit_peak(wav: torch.Tensor, target_dbfs: float = -1.0) -> torch.Tensor:
 
88
  target_amp = 10.0 ** (target_dbfs / 20.0)
89
  peak = torch.max(torch.abs(wav)).item()
90
  if peak > 0:
@@ -100,7 +108,7 @@ def _enhance_numpy_audio(
100
  out_sr: Optional[int] = None,
101
  ) -> Tuple[int, np.ndarray]:
102
  """
103
- Core pipeline used by both Gradio UI and raw FastAPI route.
104
  Input: (sr, np.float32 [T] or [T,C])
105
  Returns: (sr_out, np.float32 [T])
106
  """
@@ -112,12 +120,11 @@ def _enhance_numpy_audio(
112
  enh = _get_enhancer()
113
  wav_16k = _resample_torch(wav_t, sr_in, 16000)
114
 
115
- # Enhance via file path API for broad compatibility
116
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
117
  sf.write(tmp_in.name, wav_16k.squeeze(0).numpy(), 16000, subtype="PCM_16")
118
  tmp_in.flush()
119
  clean = enh.enhance_file(tmp_in.name) # torch.Tensor [1, T]
120
-
121
  try:
122
  os.remove(tmp_in.name)
123
  except Exception:
@@ -137,45 +144,8 @@ def _enhance_numpy_audio(
137
  return sr_out, clean_out
138
 
139
 
140
- def _wav_bytes(sr: int, mono_f32: np.ndarray) -> bytes:
141
- """Encode mono float32 array as 16-bit PCM WAV bytes."""
142
- buf = io.BytesIO()
143
- sf.write(buf, mono_f32, sr, subtype="PCM_16", format="WAV")
144
- buf.seek(0)
145
- return buf.read()
146
-
147
-
148
- # -----------------------------
149
- # FastAPI app with raw endpoint
150
- # -----------------------------
151
- app = FastAPI(title="Voice Clarity Booster (MetricGAN+)", version="1.0.1")
152
-
153
-
154
- @app.post("/enhance")
155
- async def enhance_endpoint(
156
- file: UploadFile = File(..., description="Audio file (wav/mp3/ogg etc.)"),
157
- presence_db: float = Query(3.0, ge=-12.0, le=12.0, description="Presence EQ gain in dB"),
158
- lowcut_hz: float = Query(75.0, ge=0.0, le=200.0, description="High-pass cutoff in Hz"),
159
- output_sr: int = Query(0, ge=0, description="0=keep original, or set to e.g. 44100/48000"),
160
- ):
161
- """Raw REST endpoint. Returns enhanced audio as audio/wav bytes."""
162
- data = await file.read()
163
- wav_np, sr_in = sf.read(io.BytesIO(data), always_2d=False, dtype="float32")
164
- sr_out, enhanced = _enhance_numpy_audio(
165
- (sr_in, wav_np),
166
- presence_db=presence_db,
167
- lowcut_hz=lowcut_hz,
168
- out_sr=output_sr if output_sr > 0 else None,
169
- )
170
- wav_bytes = _wav_bytes(sr_out, enhanced)
171
- headers = {
172
- "Content-Disposition": f'attachment; filename="{os.path.splitext(file.filename or "audio")[0]}_enhanced.wav"'
173
- }
174
- return StreamingResponse(io.BytesIO(wav_bytes), media_type="audio/wav", headers=headers)
175
-
176
-
177
  # -----------------------------
178
- # Gradio UI (for quick testing)
179
  # -----------------------------
180
  def gradio_enhance(
181
  audio: Tuple[int, np.ndarray],
@@ -198,9 +168,17 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
198
  gr.Markdown("## Voice Clarity Booster (MetricGAN+)")
199
  with gr.Row():
200
  with gr.Column():
201
- in_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Input")
202
- presence = gr.Slider(-12, 12, value=3, step=0.5, label="Presence Boost (dB)")
203
- lowcut = gr.Slider(0, 200, value=75, step=5, label="Low-Cut (Hz)")
 
 
 
 
 
 
 
 
204
  out_sr = gr.Radio(
205
  choices=["Original", "44100", "48000"],
206
  value="Original",
@@ -212,5 +190,5 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
212
 
213
  btn.click(gradio_enhance, inputs=[in_audio, presence, lowcut, out_sr], outputs=[out_audio])
214
 
215
- # Mount Gradio at root path and keep FastAPI for /enhance
216
- app = gr.mount_gradio_app(app, demo, path="/")
 
1
+ # app.py — Voice Clarity Booster (MetricGAN+) for Hugging Face Spaces
2
+ # Notes:
3
+ # - Pure Gradio app with demo.launch() so Spaces initializes correctly.
4
+ # - Uses SpeechBrain MetricGAN+ for denoise/enhance at 16 kHz, plus optional
5
+ # high-pass and presence EQ polish, then resamples back to your chosen rate.
6
+
7
  import io
8
  import os
9
  import tempfile
10
  from typing import Tuple, Optional
11
 
12
+ # ---- Quiet noisy deprecation warnings (optional) ----
13
  import warnings
14
  warnings.filterwarnings(
15
  "ignore",
 
26
  import soundfile as sf
27
  import torch
28
  import torchaudio
 
 
29
 
30
  # ---- SpeechBrain import: prefer new API, fall back if older version ----
31
  try:
 
44
 
45
 
46
  def _get_enhancer() -> SpectralMaskEnhancement:
47
+ """Lazily load the enhancer and cache it."""
48
  global _ENHANCER
49
  if _ENHANCER is None:
50
  _ENHANCER = SpectralMaskEnhancement.from_hparams(
 
64
  return wav.astype(np.float32)
65
  # [T, C] or [C, T]
66
  if wav.shape[0] < wav.shape[1]:
67
+ # likely [T, C]
68
  return wav.mean(axis=1).astype(np.float32)
69
+ # likely [C, T]
70
  return wav.mean(axis=0).astype(np.float32)
71
 
72
 
 
92
 
93
 
94
  def _limit_peak(wav: torch.Tensor, target_dbfs: float = -1.0) -> torch.Tensor:
95
+ """Peak-normalize to target dBFS and hard-limit to [-1, 1]."""
96
  target_amp = 10.0 ** (target_dbfs / 20.0)
97
  peak = torch.max(torch.abs(wav)).item()
98
  if peak > 0:
 
108
  out_sr: Optional[int] = None,
109
  ) -> Tuple[int, np.ndarray]:
110
  """
111
+ Core pipeline used by the Gradio UI.
112
  Input: (sr, np.float32 [T] or [T,C])
113
  Returns: (sr_out, np.float32 [T])
114
  """
 
120
  enh = _get_enhancer()
121
  wav_16k = _resample_torch(wav_t, sr_in, 16000)
122
 
123
+ # Enhance via file path API for broad codec compatibility
124
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
125
  sf.write(tmp_in.name, wav_16k.squeeze(0).numpy(), 16000, subtype="PCM_16")
126
  tmp_in.flush()
127
  clean = enh.enhance_file(tmp_in.name) # torch.Tensor [1, T]
 
128
  try:
129
  os.remove(tmp_in.name)
130
  except Exception:
 
144
  return sr_out, clean_out
145
 
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  # -----------------------------
148
+ # Gradio UI
149
  # -----------------------------
150
  def gradio_enhance(
151
  audio: Tuple[int, np.ndarray],
 
168
  gr.Markdown("## Voice Clarity Booster (MetricGAN+)")
169
  with gr.Row():
170
  with gr.Column():
171
+ in_audio = gr.Audio(
172
+ sources=["upload", "microphone"],
173
+ type="numpy",
174
+ label="Input (noisy speech)",
175
+ )
176
+ presence = gr.Slider(
177
+ minimum=-12, maximum=12, value=3, step=0.5, label="Presence Boost (dB)"
178
+ )
179
+ lowcut = gr.Slider(
180
+ minimum=0, maximum=200, value=75, step=5, label="Low-Cut (Hz)"
181
+ )
182
  out_sr = gr.Radio(
183
  choices=["Original", "44100", "48000"],
184
  value="Original",
 
190
 
191
  btn.click(gradio_enhance, inputs=[in_audio, presence, lowcut, out_sr], outputs=[out_audio])
192
 
193
+ # IMPORTANT for Hugging Face Spaces: call launch() unguarded so the app starts.
194
+ demo.launch()