File size: 3,870 Bytes
bc9667c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import tempfile
from pathlib import Path  

import gradio as gr
import numpy as np
import soundfile as sf
import librosa
import torch

from pyharp import ModelCard, build_endpoint
from df import enhance, init_df

# -----------------------------
# Model metadata for HARP
# -----------------------------
# Card handed to pyharp's build_endpoint() to describe this model.
model_card = ModelCard(
    author="Derek Llanes",
    name="Background Noise Remover (DeepFilterNet3)",
    tags=["denoise", "speech enhancement", "deepfilternet", "v3"],
    # Two sentences joined by a single space.
    description=" ".join(
        (
            "Background noise suppression / speech enhancement using DeepFilterNet3.",
            "Input is converted to mono 48kHz. Slider controls strength through wet/dry blend.",
        )
    ),
)

# -----------------------------
# Device & Model Initialization
# -----------------------------
# Auto-detect GPU for Hugging Face deployment
# Prefer CUDA when torch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE_STR = "cuda"
else:
    DEVICE_STR = "cpu"
DEVICE = torch.device(DEVICE_STR)

# Initialise DeepFilterNet once at import time so every request reuses the
# same model and DF state; the third value returned by init_df() is unused.
MODEL, DF_STATE, _ = init_df()
MODEL = MODEL.to(DEVICE)


def load_audio_mono_48k(path: str):
    """Load an audio file as mono float32 samples at 48 kHz.

    Decoding is attempted with ``soundfile`` first; if that raises, the file
    is decoded with ``librosa`` at its native sample rate instead.

    Args:
        path: Filesystem path of the audio file to read.

    Returns:
        An ``(audio, sr)`` tuple. ``audio`` is a ``np.float32`` array with the
        channels averaged down to one, and ``sr`` is always ``48000``.
    """
    target_sr = 48000

    try:
        # soundfile returns multi-channel audio as (frames, channels).
        audio, sr = sf.read(path, always_2d=False)
    except Exception:
        # Deliberate best-effort fallback for files soundfile cannot decode.
        audio, sr = librosa.load(path, sr=None, mono=False)
        audio = np.asarray(audio)
        # librosa returns multi-channel audio as (channels, frames). Transpose
        # so the down-mix below averages over channels for both decoders
        # rather than over time.
        if audio.ndim == 2:
            audio = audio.T

    audio = np.asarray(audio)

    # Down-mix (frames, channels) to mono by averaging the channels.
    if audio.ndim == 2:
        audio = audio.mean(axis=1)

    audio = audio.astype(np.float32)

    # Resample only when the file is not already at the target rate.
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    return audio, sr


def apply_attenuation_db(noisy: np.ndarray, enhanced: np.ndarray, noise_atten_db: float, max_db: float = 30.0):
    """
    Blend the original and enhanced signals according to the slider value.

    ``noise_atten_db / max_db`` is clamped to [0, 1] and used as the wet
    (enhanced) weight; the dry (original) weight is its complement. Both
    signals are truncated to the shorter length before mixing, and the
    result is returned as float32.
    """
    # Slider position as a clamped fraction of the maximum.
    wet = max(0.0, min(1.0, float(noise_atten_db) / float(max_db)))
    dry = 1.0 - wet

    # Mix only over the span both signals cover.
    common = min(len(noisy), len(enhanced))
    mixed = dry * noisy[:common] + wet * enhanced[:common]
    return mixed.astype(np.float32)


@torch.inference_mode()
def process_fn(input_audio_path: str, noise_atten_db: float) -> str:
    """Denoise one audio file and return the path of the written WAV.

    Args:
        input_audio_path: Path of the audio file to process.
        noise_atten_db: Slider value; mapped to a wet/dry blend over a
            0-30 range by ``apply_attenuation_db``.

    Returns:
        Path of a newly created WAV file holding the blended result.

    Raises:
        ValueError: If ``input_audio_path`` is empty or ``None``.
    """
    if not input_audio_path:
        raise ValueError("No input audio provided.")

    # Load as mono float32 at 48 kHz.
    noisy, sr = load_audio_mono_48k(input_audio_path)

    # numpy to torch, add channel dim: [T] -> [1, T].
    # The tensor is left on the CPU. NOTE(review): DeepFilterNet's enhance()
    # appears to call .numpy() on its input for the STFT analysis, which
    # raises for CUDA tensors, and to move features to the model's device
    # itself - confirm against the installed df version.
    noisy_t = torch.from_numpy(noisy).float().unsqueeze(0)

    # Denoise, then drop the channel dim and convert back to numpy.
    enhanced_t = enhance(MODEL, DF_STATE, noisy_t)
    enhanced = enhanced_t.squeeze(0).detach().cpu().numpy()

    # Slider strength
    out = apply_attenuation_db(noisy, enhanced, noise_atten_db, max_db=30.0)

    # Write each result to its own file. A single fixed filename would let
    # concurrent queued requests overwrite one another's output before it is
    # served. Trade-off: files accumulate in the temp directory.
    out_dir = Path(tempfile.gettempdir()) / "pyharp_dfnet_outputs"
    out_dir.mkdir(parents=True, exist_ok=True)
    with tempfile.NamedTemporaryFile(
        prefix="denoised_", suffix=".wav", dir=str(out_dir), delete=False
    ) as tmp:
        out_path = tmp.name

    sf.write(out_path, out, sr)
    return out_path


# -----------------------------
# Gradio endpoint
# -----------------------------
with gr.Blocks() as demo:
    # Inputs are created in the order process_fn receives them:
    # audio file path first, then the strength slider.
    # NOTE(review): harp_required() and set_info() are not defined in this
    # file; presumably pyharp adds them to Gradio components - confirm.
    audio_in = gr.Audio(type="filepath", label="Input Audio").harp_required(True)
    strength_slider = gr.Slider(
        label="Noise Attenuation (dB)",
        info="0 = no change, 30 = strongest. Implemented as wet/dry strength.",
        minimum=0,
        maximum=30,
        value=12,
        step=1,
    )
    input_components = [audio_in, strength_slider]

    # Single output: the processed audio, returned as a file path.
    audio_out = gr.Audio(type="filepath", label="Output Audio").set_info(
        "Denoised audio output."
    )
    output_components = [audio_out]

    # Wire the card, components and processing function into a HARP endpoint.
    app = build_endpoint(
        process_fn=process_fn,
        model_card=model_card,
        input_components=input_components,
        output_components=output_components,
    )

# Enable request queuing, then start the server.
demo.queue().launch(show_error=False, pwa=True)