"""HARP / Gradio endpoint: background noise removal with DeepFilterNet3.

Audio is converted to mono 48 kHz, denoised by DeepFilterNet3, and blended
with the original via a wet/dry slider expressed in dB of attenuation.
"""

import os
import tempfile
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
from df import enhance, init_df
from pyharp import ModelCard, build_endpoint

# DeepFilterNet operates at 48 kHz; all input is resampled to this rate.
TARGET_SR = 48000

# -----------------------------
# Model metadata for HARP
# -----------------------------
model_card = ModelCard(
    name="Background Noise Remover (DeepFilterNet3)",
    description=(
        "Background noise suppression / speech enhancement using DeepFilterNet3. "
        "Input is converted to mono 48kHz. Slider controls strength through wet/dry blend."
    ),
    author="Derek Llanes",
    tags=["denoise", "speech enhancement", "deepfilternet", "v3"],
)

# -----------------------------
# Device & Model Initialization
# -----------------------------
# Auto-detect GPU for Hugging Face deployment.
DEVICE_STR = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(DEVICE_STR)

# Load the model once into global memory so every request reuses it.
MODEL, DF_STATE, _ = init_df()
MODEL = MODEL.to(DEVICE)


def load_audio_mono_48k(path: str) -> tuple[np.ndarray, int]:
    """Load audio from *path* as mono float32 at 48 kHz.

    Tries soundfile first; falls back to librosa for formats soundfile
    cannot decode.

    Returns:
        (audio, sr): 1-D float32 samples and the sample rate (always 48000).
    """
    try:
        audio, sr = sf.read(path, always_2d=False)
        audio = np.asarray(audio)
        # soundfile returns (frames, channels) for multichannel files,
        # so averaging axis=1 is the correct downmix HERE ONLY.
        if audio.ndim == 2:
            audio = audio.mean(axis=1)
    except Exception:
        # librosa returns channels-FIRST arrays, so averaging axis=1 would
        # average over time instead of channels. Let mono=True perform the
        # correct channel downmix for this decoder.
        audio, sr = librosa.load(path, sr=None, mono=True)
        audio = np.asarray(audio)

    audio = audio.astype(np.float32)

    if sr != TARGET_SR:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR
    return audio, sr


def apply_attenuation_db(
    noisy: np.ndarray,
    enhanced: np.ndarray,
    noise_atten_db: float,
    max_db: float = 30.0,
) -> np.ndarray:
    """Crossfade *noisy* and *enhanced* by a dB-slider-derived strength.

    Args:
        noisy: original mono signal.
        enhanced: denoised mono signal (may differ slightly in length).
        noise_atten_db: slider value; 0 = passthrough, max_db = fully enhanced.
        max_db: slider value that maps to full strength (1.0).

    Returns:
        float32 blend of the two signals, trimmed to the shorter length.
    """
    # Map the dB slider onto a wet/dry mix factor in [0, 1].
    # Guard max_db == 0 to avoid a ZeroDivisionError (treat as full strength).
    strength = float(noise_atten_db) / float(max_db) if max_db else 1.0
    strength = max(0.0, min(1.0, strength))

    # The enhancer can return a slightly different length; trim both signals
    # so the crossfade is sample-aligned.
    n = min(len(noisy), len(enhanced))
    mixed = (1.0 - strength) * noisy[:n] + strength * enhanced[:n]
    return mixed.astype(np.float32)


@torch.inference_mode()
def process_fn(input_audio_path: str, noise_atten_db: float) -> str:
    """Denoise the file at *input_audio_path* and return the output WAV path.

    Raises:
        ValueError: if no input path was provided.
    """
    if not input_audio_path:
        raise ValueError("No input audio provided.")

    # Load and normalize to mono 48 kHz.
    noisy, sr = load_audio_mono_48k(input_audio_path)

    # [T] -> [1, T]: enhance() expects a leading channel dimension.
    noisy_t = torch.from_numpy(noisy).float().unsqueeze(0).to(DEVICE)

    # Denoise, then drop the channel dim and move back to numpy.
    enhanced_t = enhance(MODEL, DF_STATE, noisy_t)
    enhanced = enhanced_t.squeeze(0).detach().cpu().numpy()

    # Apply slider strength as a wet/dry blend.
    out = apply_attenuation_db(noisy, enhanced, noise_atten_db, max_db=30.0)

    # Write to a UNIQUE temp file so concurrent/queued requests cannot
    # clobber each other's output (a fixed filename would race).
    out_dir = Path(tempfile.gettempdir()) / "pyharp_dfnet_outputs"
    out_dir.mkdir(parents=True, exist_ok=True)
    fd, out_path = tempfile.mkstemp(suffix=".wav", dir=out_dir)
    os.close(fd)  # sf.write reopens the file by path
    sf.write(out_path, out, sr)
    return out_path


# -----------------------------
# Gradio endpoint
# -----------------------------
with gr.Blocks() as demo:
    input_components = [
        gr.Audio(type="filepath", label="Input Audio").harp_required(True),
        gr.Slider(
            minimum=0,
            maximum=30,
            step=1,
            value=12,
            label="Noise Attenuation (dB)",
            info="0 = no change, 30 = strongest. Implemented as wet/dry strength.",
        ),
    ]
    output_components = [
        gr.Audio(type="filepath", label="Output Audio")
        .set_info("Denoised audio output."),
    ]
    app = build_endpoint(
        model_card=model_card,
        input_components=input_components,
        output_components=output_components,
        process_fn=process_fn,
    )

demo.queue().launch(show_error=False, pwa=True)