Spaces:

Diggz10
/

voiceclear

Running

App Files Community

Diggz10 committed on Aug 21, 2025

Commit

fea7d56

verified ·

1 Parent(s): b67ceda

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -49

app.py CHANGED Viewed

@@ -1,9 +1,15 @@
 import io
 import os
 import tempfile
 from typing import Tuple, Optional
-# ---- tame noisy deprecation warnings (optional but nice) ----
 import warnings
 warnings.filterwarnings(
     "ignore",
@@ -20,8 +26,6 @@ import numpy as np
 import soundfile as sf
 import torch
 import torchaudio
-from fastapi import FastAPI, File, UploadFile, Query
-from fastapi.responses import StreamingResponse
 # ---- SpeechBrain import: prefer new API, fall back if older version ----
 try:
@@ -40,6 +44,7 @@ _DEVICE = "cpu"
 def _get_enhancer() -> SpectralMaskEnhancement:
     global _ENHANCER
     if _ENHANCER is None:
         _ENHANCER = SpectralMaskEnhancement.from_hparams(
@@ -59,7 +64,9 @@ def _to_mono(wav: np.ndarray) -> np.ndarray:
         return wav.astype(np.float32)
     # [T, C] or [C, T]
     if wav.shape[0] < wav.shape[1]:
         return wav.mean(axis=1).astype(np.float32)
     return wav.mean(axis=0).astype(np.float32)
@@ -85,6 +92,7 @@ def _presence_boost(wav: torch.Tensor, sr: int, gain_db: float) -> torch.Tensor:
 def _limit_peak(wav: torch.Tensor, target_dbfs: float = -1.0) -> torch.Tensor:
     target_amp = 10.0 ** (target_dbfs / 20.0)
     peak = torch.max(torch.abs(wav)).item()
     if peak > 0:
@@ -100,7 +108,7 @@ def _enhance_numpy_audio(
     out_sr: Optional[int] = None,
 ) -> Tuple[int, np.ndarray]:
     """
-    Core pipeline used by both Gradio UI and raw FastAPI route.
     Input: (sr, np.float32 [T] or [T,C])
     Returns: (sr_out, np.float32 [T])
     """
@@ -112,12 +120,11 @@ def _enhance_numpy_audio(
     enh = _get_enhancer()
     wav_16k = _resample_torch(wav_t, sr_in, 16000)
-    # Enhance via file path API for broad compatibility
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
         sf.write(tmp_in.name, wav_16k.squeeze(0).numpy(), 16000, subtype="PCM_16")
         tmp_in.flush()
         clean = enh.enhance_file(tmp_in.name)  # torch.Tensor [1, T]
     try:
         os.remove(tmp_in.name)
     except Exception:
@@ -137,45 +144,8 @@ def _enhance_numpy_audio(
     return sr_out, clean_out
-def _wav_bytes(sr: int, mono_f32: np.ndarray) -> bytes:
-    """Encode mono float32 array as 16-bit PCM WAV bytes."""
-    buf = io.BytesIO()
-    sf.write(buf, mono_f32, sr, subtype="PCM_16", format="WAV")
-    buf.seek(0)
-    return buf.read()
-# -----------------------------
-# FastAPI app with raw endpoint
-# -----------------------------
-app = FastAPI(title="Voice Clarity Booster (MetricGAN+)", version="1.0.1")
-@app.post("/enhance")
-async def enhance_endpoint(
-    file: UploadFile = File(..., description="Audio file (wav/mp3/ogg etc.)"),
-    presence_db: float = Query(3.0, ge=-12.0, le=12.0, description="Presence EQ gain in dB"),
-    lowcut_hz: float = Query(75.0, ge=0.0, le=200.0, description="High-pass cutoff in Hz"),
-    output_sr: int = Query(0, ge=0, description="0=keep original, or set to e.g. 44100/48000"),
-):
-    """Raw REST endpoint. Returns enhanced audio as audio/wav bytes."""
-    data = await file.read()
-    wav_np, sr_in = sf.read(io.BytesIO(data), always_2d=False, dtype="float32")
-    sr_out, enhanced = _enhance_numpy_audio(
-        (sr_in, wav_np),
-        presence_db=presence_db,
-        lowcut_hz=lowcut_hz,
-        out_sr=output_sr if output_sr > 0 else None,
-    )
-    wav_bytes = _wav_bytes(sr_out, enhanced)
-    headers = {
-        "Content-Disposition": f'attachment; filename="{os.path.splitext(file.filename or "audio")[0]}_enhanced.wav"'
-    }
-    return StreamingResponse(io.BytesIO(wav_bytes), media_type="audio/wav", headers=headers)
 # -----------------------------
-# Gradio UI (for quick testing)
 # -----------------------------
 def gradio_enhance(
     audio: Tuple[int, np.ndarray],
@@ -198,9 +168,17 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("## Voice Clarity Booster (MetricGAN+)")
     with gr.Row():
         with gr.Column():
-            in_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Input")
-            presence = gr.Slider(-12, 12, value=3, step=0.5, label="Presence Boost (dB)")
-            lowcut = gr.Slider(0, 200, value=75, step=5, label="Low-Cut (Hz)")
             out_sr = gr.Radio(
                 choices=["Original", "44100", "48000"],
                 value="Original",
@@ -212,5 +190,5 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     btn.click(gradio_enhance, inputs=[in_audio, presence, lowcut, out_sr], outputs=[out_audio])
-# Mount Gradio at root path and keep FastAPI for /enhance
-app = gr.mount_gradio_app(app, demo, path="/")

+# app.py — Voice Clarity Booster (MetricGAN+) for Hugging Face Spaces
+# Notes:
+# - Pure Gradio app with demo.launch() so Spaces initializes correctly.
+# - Uses SpeechBrain MetricGAN+ for denoise/enhance at 16 kHz, plus optional
+#   high-pass and presence EQ polish, then resamples back to your chosen rate.
 import io
 import os
 import tempfile
 from typing import Tuple, Optional
+# ---- Quiet noisy deprecation warnings (optional) ----
 import warnings
 warnings.filterwarnings(
     "ignore",
 import soundfile as sf
 import torch
 import torchaudio
 # ---- SpeechBrain import: prefer new API, fall back if older version ----
 try:
 def _get_enhancer() -> SpectralMaskEnhancement:
+    """Lazily load the enhancer and cache it."""
     global _ENHANCER
     if _ENHANCER is None:
         _ENHANCER = SpectralMaskEnhancement.from_hparams(
         return wav.astype(np.float32)
     # [T, C] or [C, T]
     if wav.shape[0] < wav.shape[1]:
+        # likely [T, C]
         return wav.mean(axis=1).astype(np.float32)
+    # likely [C, T]
     return wav.mean(axis=0).astype(np.float32)
 def _limit_peak(wav: torch.Tensor, target_dbfs: float = -1.0) -> torch.Tensor:
+    """Peak-normalize to target dBFS and hard-limit to [-1, 1]."""
     target_amp = 10.0 ** (target_dbfs / 20.0)
     peak = torch.max(torch.abs(wav)).item()
     if peak > 0:
     out_sr: Optional[int] = None,
 ) -> Tuple[int, np.ndarray]:
     """
+    Core pipeline used by the Gradio UI.
     Input: (sr, np.float32 [T] or [T,C])
     Returns: (sr_out, np.float32 [T])
     """
     enh = _get_enhancer()
     wav_16k = _resample_torch(wav_t, sr_in, 16000)
+    # Enhance via file path API for broad codec compatibility
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_in:
         sf.write(tmp_in.name, wav_16k.squeeze(0).numpy(), 16000, subtype="PCM_16")
         tmp_in.flush()
         clean = enh.enhance_file(tmp_in.name)  # torch.Tensor [1, T]
     try:
         os.remove(tmp_in.name)
     except Exception:
     return sr_out, clean_out
 # -----------------------------
+# Gradio UI
 # -----------------------------
 def gradio_enhance(
     audio: Tuple[int, np.ndarray],
     gr.Markdown("## Voice Clarity Booster (MetricGAN+)")
     with gr.Row():
         with gr.Column():
+            in_audio = gr.Audio(
+                sources=["upload", "microphone"],
+                type="numpy",
+                label="Input (noisy speech)",
+            )
+            presence = gr.Slider(
+                minimum=-12, maximum=12, value=3, step=0.5, label="Presence Boost (dB)"
+            )
+            lowcut = gr.Slider(
+                minimum=0, maximum=200, value=75, step=5, label="Low-Cut (Hz)"
+            )
             out_sr = gr.Radio(
                 choices=["Original", "44100", "48000"],
                 value="Original",
     btn.click(gradio_enhance, inputs=[in_audio, presence, lowcut, out_sr], outputs=[out_audio])
+# IMPORTANT for Hugging Face Spaces: call launch() unguarded so the app starts.
+demo.launch()