# Hugging Face Space: teamup-tech/denoiser (status: Sleeping) - file size: 3,870 Bytes, revision bc9667c

import os
import tempfile
from pathlib import Path  

import gradio as gr
import numpy as np
import soundfile as sf
import librosa
import torch

from pyharp import ModelCard, build_endpoint
from df import enhance, init_df

# -----------------------------
# Model metadata for HARP
# -----------------------------
# Text shown as the model description; assembled separately so the ModelCard
# call below stays a flat list of keyword arguments.
_model_description = (
    "Background noise suppression / speech enhancement using DeepFilterNet3. "
    "Input is converted to mono 48kHz. Slider controls strength through wet/dry blend."
)

# Metadata object handed to build_endpoint() further down in this file.
model_card = ModelCard(
    name="Background Noise Remover (DeepFilterNet3)",
    description=_model_description,
    author="Derek Llanes",
    tags=["denoise", "speech enhancement", "deepfilternet", "v3"],
)

# -----------------------------
# Device & Model Initialization
# -----------------------------
# Auto-detect GPU for Hugging Face deployment
# Use CUDA whenever torch reports it as available; fall back to CPU otherwise.
if torch.cuda.is_available():
    DEVICE_STR = "cuda"
else:
    DEVICE_STR = "cpu"
DEVICE = torch.device(DEVICE_STR)

# Initialise the model and its processing state once, at import time, so that
# every request reuses the same objects. The third value returned by init_df()
# is not used by this script.
MODEL, DF_STATE, _ = init_df()
MODEL = MODEL.to(DEVICE)


def load_audio_mono_48k(path: str):
    """Load an audio file as a mono float32 waveform sampled at 48 kHz.

    The file is read with soundfile first; if that fails for any reason the
    function falls back to librosa, which supports additional formats.

    Args:
        path: Filesystem path of the audio file to load.

    Returns:
        A ``(audio, sr)`` tuple where ``audio`` is a 1-D ``np.float32`` array
        and ``sr`` is always ``48000``.

    Raises:
        Exception: Whatever librosa raises when the fallback loader cannot
            read the file either.
    """
    try:
        audio, sr = sf.read(path, always_2d=False)
        # soundfile returns multi-channel data as (frames, channels).
        channel_axis = 1
    except Exception:
        # Deliberately broad: any soundfile failure (unsupported container,
        # missing codec, ...) should trigger the librosa fallback.
        audio, sr = librosa.load(path, sr=None, mono=False)
        # librosa returns multi-channel data as (channels, frames), so the
        # axis to average over differs from the soundfile case.
        channel_axis = 0

    audio = np.asarray(audio)

    # Down-mix multi-channel audio to mono by averaging across channels.
    if audio.ndim == 2:
        audio = audio.mean(axis=channel_axis)

    audio = audio.astype(np.float32)

    # Resample to 48 kHz when the file uses a different rate.
    if sr != 48000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=48000)
        sr = 48000

    return audio, sr


def apply_attenuation_db(noisy: np.ndarray, enhanced: np.ndarray, noise_atten_db: float, max_db: float = 30.0):
    """
    Blend the original and the enhanced signal according to the slider value.

    ``noise_atten_db / max_db`` is clamped to [0, 1] and used as the weight of
    ``enhanced``; ``noisy`` receives the complementary weight. Both inputs are
    truncated to the shorter of the two lengths before mixing. The result is
    returned as float32.
    """
    # Slider value as a fraction of its maximum, limited to the [0, 1] range.
    ratio = float(noise_atten_db) / float(max_db)
    wet = max(0.0, min(1.0, ratio))
    dry = 1.0 - wet

    # Only the overlapping part of the two signals can be mixed.
    common_len = min(len(noisy), len(enhanced))

    blended = dry * noisy[:common_len] + wet * enhanced[:common_len]
    return blended.astype(np.float32)


@torch.inference_mode()
def process_fn(input_audio_path: str, noise_atten_db: float) -> str:
    """Denoise one audio file and return the path of the resulting WAV.

    The input is loaded as mono 48 kHz, run through the DeepFilterNet model,
    and mixed with the original according to ``noise_atten_db`` (0-30, see
    ``apply_attenuation_db``).

    Args:
        input_audio_path: Path of the audio file to process.
        noise_atten_db: Slider value controlling the wet/dry mix.

    Returns:
        Path of a newly written WAV file containing the processed audio.

    Raises:
        ValueError: If ``input_audio_path`` is empty or ``None``.
    """
    if not input_audio_path:
        raise ValueError("No input audio provided.")

    # Load and normalize
    noisy, sr = load_audio_mono_48k(input_audio_path)

    # numpy to torch, add channel dim [T] to [1, T].
    # The waveform intentionally stays on the CPU: DeepFilterNet's enhance()
    # converts it with Tensor.numpy() to compute its features (which fails for
    # CUDA tensors) and moves those features to the model's device itself.
    noisy_t = torch.from_numpy(noisy).float().unsqueeze(0)

    # Denoise, then remove added channel and back to numpy
    enhanced_t = enhance(MODEL, DF_STATE, noisy_t)
    enhanced = enhanced_t.squeeze(0).detach().cpu().numpy()

    # Slider strength
    out = apply_attenuation_db(noisy, enhanced, noise_atten_db, max_db=30.0)

    # Write every result to its own file. A fixed file name would let queued
    # or concurrent requests overwrite each other's output before it is read.
    out_dir = Path(tempfile.gettempdir()) / "pyharp_dfnet_outputs"
    out_dir.mkdir(parents=True, exist_ok=True)
    with tempfile.NamedTemporaryFile(
        prefix="denoised_", suffix=".wav", dir=out_dir, delete=False
    ) as tmp:
        out_path = tmp.name

    sf.write(out_path, out, sr)
    return out_path


# -----------------------------
# Gradio endpoint
# -----------------------------
with gr.Blocks() as demo:
    # Input widgets: the audio file to clean up and the strength slider.
    # NOTE(review): presumably these map positionally onto process_fn's
    # parameters (audio path, then slider value) - confirm against pyharp.
    audio_input = gr.Audio(type="filepath", label="Input Audio").harp_required(True)
    attenuation_slider = gr.Slider(
        label="Noise Attenuation (dB)",
        info="0 = no change, 30 = strongest. Implemented as wet/dry strength.",
        minimum=0,
        maximum=30,
        step=1,
        value=12,
    )
    input_components = [audio_input, attenuation_slider]

    # Single output widget receiving the path returned by process_fn.
    audio_output = gr.Audio(type="filepath", label="Output Audio").set_info(
        "Denoised audio output."
    )
    output_components = [audio_output]

    # Wire the model card, widgets and processing function together.
    app = build_endpoint(
        model_card=model_card,
        input_components=input_components,
        output_components=output_components,
        process_fn=process_fn,
    )

# Enable request queueing, then start the server.
queued_demo = demo.queue()
queued_demo.launch(show_error=False, pwa=True)