# denoiser / app.py
# Hugging Face Space page header: harp-dev — "Create app.py" (commit bc9667c, verified)
import os
import tempfile
from pathlib import Path
import gradio as gr
import numpy as np
import soundfile as sf
import librosa
import torch
from pyharp import ModelCard, build_endpoint
from df import enhance, init_df
# -----------------------------
# Model metadata for HARP
# -----------------------------
# Description shown to HARP clients for this endpoint.
_DESCRIPTION = (
    "Background noise suppression / speech enhancement using DeepFilterNet3. "
    "Input is converted to mono 48kHz. Slider controls strength through wet/dry blend."
)

# Card identifying the model to the HARP plugin host.
model_card = ModelCard(
    name="Background Noise Remover (DeepFilterNet3)",
    description=_DESCRIPTION,
    author="Derek Llanes",
    tags=["denoise", "speech enhancement", "deepfilternet", "v3"],
)
# -----------------------------
# Device & Model Initialization
# -----------------------------
# Prefer a CUDA device when one is visible (e.g. a GPU Space on
# Hugging Face); otherwise run inference on CPU.
if torch.cuda.is_available():
    DEVICE_STR = "cuda"
else:
    DEVICE_STR = "cpu"
DEVICE = torch.device(DEVICE_STR)

# Load DeepFilterNet once at import time; module globals let every
# request reuse the same weights instead of re-initializing per call.
MODEL, DF_STATE, _ = init_df()
MODEL = MODEL.to(DEVICE)
def load_audio_mono_48k(path: str):
    """Load an audio file as mono float32 at 48 kHz.

    Tries soundfile first; falls back to librosa for formats soundfile
    cannot decode.

    Args:
        path: Filesystem path to the input audio file.

    Returns:
        Tuple ``(audio, sr)`` where ``audio`` is a 1-D float32 numpy
        array and ``sr`` is always 48000.
    """
    try:
        # soundfile returns (frames,) for mono, (frames, channels) otherwise.
        audio, sr = sf.read(path, always_2d=False)
    except Exception:
        # BUGFIX: librosa's multichannel layout is (channels, frames),
        # so the shared mean(axis=1) below would average over TIME and
        # collapse the signal. Let librosa do the mono downmix itself.
        audio, sr = librosa.load(path, sr=None, mono=True)
    audio = np.asarray(audio)
    # Downmix multichannel soundfile output: channel axis is the last one.
    if audio.ndim == 2:
        audio = audio.mean(axis=1)
    audio = audio.astype(np.float32)
    # Resample to the 48 kHz rate DeepFilterNet was configured for.
    if sr != 48000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=48000)
        sr = 48000
    return audio, sr
def apply_attenuation_db(noisy: np.ndarray, enhanced: np.ndarray, noise_atten_db: float, max_db: float = 30.0):
    """
    Translate the Noise Attenuation (dB) slider into a wet/dry mix.

    The slider value is normalized against ``max_db`` to a strength in
    [0, 1] and used to linearly crossfade between the unprocessed
    (``noisy``) and denoised (``enhanced``) signals.
    """
    # Normalize the dB value to [0, 1], clamping out-of-range input.
    strength = min(1.0, max(0.0, float(noise_atten_db) / float(max_db)))
    # Trim both signals to their common length before blending.
    length = min(len(noisy), len(enhanced))
    dry = noisy[:length]
    wet = enhanced[:length]
    mix = (1.0 - strength) * dry + strength * wet
    return mix.astype(np.float32)
@torch.inference_mode()
def process_fn(input_audio_path: str, noise_atten_db: float) -> str:
    """HARP processing endpoint: denoise one uploaded audio file.

    Args:
        input_audio_path: Path to the uploaded audio file.
        noise_atten_db: Slider value; 0 leaves the audio untouched and
            30 returns the fully enhanced signal (wet/dry blend).

    Returns:
        Path to the denoised 48 kHz mono WAV in a temp directory.

    Raises:
        ValueError: If no input path was provided.
    """
    if not input_audio_path:
        raise ValueError("No input audio provided.")
    # Load and normalize to mono float32 @ 48 kHz.
    noisy, sr = load_audio_mono_48k(input_audio_path)
    # numpy -> torch, add channel dim: [T] -> [1, T].
    noisy_t = torch.from_numpy(noisy).float().unsqueeze(0).to(DEVICE)
    # Denoise, then drop the channel dim and move back to numpy.
    enhanced_t = enhance(MODEL, DF_STATE, noisy_t)
    enhanced = enhanced_t.squeeze(0).detach().cpu().numpy()
    # Apply the slider as a wet/dry blend.
    out = apply_attenuation_db(noisy, enhanced, noise_atten_db, max_db=30.0)
    # BUGFIX: write to a unique file per request. The previous fixed
    # "denoised.wav" name meant concurrent requests overwrote each
    # other's output before the client could fetch it.
    out_dir = Path(tempfile.gettempdir()) / "pyharp_dfnet_outputs"
    out_dir.mkdir(parents=True, exist_ok=True)
    with tempfile.NamedTemporaryFile(
        prefix="denoised_", suffix=".wav", dir=out_dir, delete=False
    ) as tmp:
        out_path = tmp.name
    sf.write(out_path, out, sr)
    return str(out_path)
# -----------------------------
# Gradio endpoint
# -----------------------------
with gr.Blocks() as demo:
    # HARP input widgets: the audio file plus the attenuation slider.
    audio_in = gr.Audio(type="filepath", label="Input Audio").harp_required(True)
    atten_slider = gr.Slider(
        minimum=0,
        maximum=30,
        step=1,
        value=12,
        label="Noise Attenuation (dB)",
        info="0 = no change, 30 = strongest. Implemented as wet/dry strength.",
    )
    # Output widget holding the path of the denoised WAV.
    audio_out = gr.Audio(type="filepath", label="Output Audio").set_info(
        "Denoised audio output."
    )

    # Wire the widgets to process_fn through pyharp's endpoint builder.
    app = build_endpoint(
        model_card=model_card,
        input_components=[audio_in, atten_slider],
        output_components=[audio_out],
        process_fn=process_fn,
    )

demo.queue().launch(show_error=False, pwa=True)