File size: 5,416 Bytes
f032a60
 
 
 
 
 
 
 
 
 
 
 
 
 
2054fa6
f032a60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2054fa6
 
 
 
f032a60
 
2054fa6
f032a60
 
 
 
 
 
 
2054fa6
f032a60
 
 
 
2054fa6
 
 
 
f032a60
 
 
2054fa6
 
f032a60
 
 
2054fa6
f032a60
 
 
 
2054fa6
f032a60
 
 
 
 
 
 
2054fa6
 
 
 
 
f032a60
 
 
 
 
2054fa6
f032a60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2054fa6
f032a60
 
 
 
 
2054fa6
 
 
 
 
 
f032a60
 
 
 
 
 
 
 
 
 
 
 
2054fa6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import torch
import librosa
import soundfile as sf
import numpy as np
import tempfile
import gradio as gr

from denoiser import pretrained
from denoiser.dsp import convert_audio
from pydub import AudioSegment, silence
from tqdm import tqdm


# -----------------------------
# Load model ONCE
# -----------------------------
# Module-level so the (expensive) weight download/load happens once at import
# time, not on every Gradio request.  Prefer GPU when available; all tensors
# built in denoise_audio() are moved to this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Facebook denoiser's pretrained DNS64 model; denoise_audio() reads its
# .sample_rate and .chin attributes and decodes inputs at 16 kHz to match.
# NOTE(review): the model is only ever used for inference here — confirm
# whether an explicit model.eval() is needed after loading.
model = pretrained.dns64().to(device)

# -----------------------------
# Silence trimming helpers
# -----------------------------
def safe_append(base, chunk, crossfade_ms=30):
    """Concatenate two pydub segments, crossfading when both are non-empty.

    The crossfade is capped at the length of the shorter operand so pydub
    never raises for a crossfade longer than either segment.  Empty
    operands (or a zero cap) fall back to plain concatenation.
    """
    if not len(base) or not len(chunk):
        return base + chunk
    fade = min(crossfade_ms, len(base), len(chunk))
    if fade <= 0:
        return base + chunk
    return base.append(chunk, crossfade=fade)


def shorten_silences(audio, silence_thresh=-50, min_silence_len=400, max_keep_silence=1500, crossfade_ms=30):
    """Compress long pauses in *audio* while keeping natural short ones.

    Silent stretches are located with pydub's detector, then each one is
    replaced by a shorter run of generated silence:

    * pauses under 500 ms pass through at their original length;
    * longer pauses keep 500 ms plus 500 ms per full second of original
      length, capped at ``max_keep_silence``.

    Voiced chunks and the synthetic silences are stitched together with
    ``safe_append`` so every join is crossfaded.
    """
    gaps = silence.detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )

    result = AudioSegment.silent(duration=0)
    cursor = 0

    for gap_start, gap_end in gaps:
        # Voiced material between the previous gap and this one.
        result = safe_append(result, audio[cursor:gap_start], crossfade_ms)

        gap_len = gap_end - gap_start
        if gap_len < 500:
            # Short, natural pause — leave it intact.
            keep = gap_len
        else:
            # 500 ms base + 500 ms per full second, never above the cap.
            keep = min(max_keep_silence, (gap_len // 1000) * 500 + 500)

        result = safe_append(
            result,
            AudioSegment.silent(duration=int(keep)),
            crossfade_ms,
        )
        cursor = gap_end

    # Tail after the last detected gap (the whole clip if none were found).
    return safe_append(result, audio[cursor:], crossfade_ms)


# -----------------------------
# Main processing function
# -----------------------------
def denoise_audio(audio_file, trim_silence, silence_thresh, min_silence_len, max_keep_silence):
    """Denoise an uploaded file and optionally compress its silences.

    Parameters mirror the Gradio controls: *audio_file* is a filesystem
    path (or None when nothing was uploaded), *trim_silence* toggles the
    pydub post-processing, and the remaining three values are forwarded
    to shorten_silences().

    Returns the path of a temporary WAV file containing the processed
    audio, or None when there is no input to work on.
    """
    if audio_file is None:
        return None

    # Decode to mono float32 at 16 kHz, the rate the DNS64 model expects.
    wav, sr = librosa.load(audio_file, sr=16000)
    if wav.size == 0:
        # Zero-length upload: previously np.concatenate([]) would raise.
        return None

    chunk_size = 16000 * 10  # 10-second windows keep (GPU) memory bounded
    denoised_chunks = []

    # Process in chunks to avoid OOM on long recordings.
    for i in range(0, len(wav), chunk_size):
        chunk = wav[i:i + chunk_size]
        # from_numpy avoids the extra copy that torch.tensor would make.
        wav_tensor = torch.from_numpy(chunk).unsqueeze(0).to(device)
        wav_tensor = convert_audio(
            wav_tensor, sr, model.sample_rate, model.chin
        )

        with torch.no_grad():
            denoised = model(wav_tensor)[0]

        denoised_chunks.append(denoised.squeeze().cpu().numpy())

    denoised_np = np.concatenate(denoised_chunks)

    # Write the denoised signal to a temp file.  Close our handle before
    # other libraries reopen the path — required on Windows (the open
    # handle locks the file) and avoids leaking a descriptor everywhere.
    tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_wav.close()
    sf.write(tmp_wav.name, denoised_np, model.sample_rate)

    if not trim_silence:
        return tmp_wav.name

    audio = AudioSegment.from_file(tmp_wav.name, format="wav")
    processed = shorten_silences(
        audio,
        silence_thresh=silence_thresh,
        min_silence_len=min_silence_len,
        max_keep_silence=max_keep_silence,
    )
    final_file = tempfile.NamedTemporaryFile(
        suffix="_final.wav", delete=False
    )
    final_file.close()
    processed.export(final_file.name, format="wav")
    return final_file.name


# -----------------------------
# Gradio UI
# -----------------------------
# -----------------------------
# Gradio UI
# -----------------------------
# Two-column layout: upload widget, tuning accordion and submit button on
# the left; the processed audio player on the right.  Slider defaults
# mirror the keyword defaults of shorten_silences().
with gr.Blocks(title="🎧 Advanced Audio Denoiser") as demo:
    gr.Markdown("# 🎧 Audio Denoiser (Demucs DNS64)")
    gr.Markdown("Upload audio to remove noise and intelligently shorten silences.")

    with gr.Row():
        with gr.Column():
            # type="filepath" makes Gradio pass denoise_audio a path string.
            input_audio = gr.Audio(type="filepath", label="Upload Audio")

            with gr.Accordion("Silence Trimming Settings", open=True):
                do_trim = gr.Checkbox(label="Enable Silence Trimming", value=True)

                threshold = gr.Slider(
                    minimum=-70, maximum=-20, value=-50, step=1,
                    label="Silence Threshold (dB)",
                    info="Lower is stricter (quieter sounds count as silence)"
                )

                min_len = gr.Slider(
                    minimum=100, maximum=2000, value=400, step=50,
                    label="Min Silence Duration (ms)",
                    info="How long a pause must be to be considered 'silence'"
                )

                max_keep = gr.Slider(
                    minimum=0, maximum=5000, value=1500, step=100,
                    label="Max Silence to Keep (ms)",
                    info="Longer silences will be shortened towards this value"
                )

            submit_btn = gr.Button("Process Audio", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(label="Denoised & Trimmed Output")

    # Wire the button to the processing function; the inputs list order
    # must match denoise_audio's positional parameter order.
    submit_btn.click(
        fn=denoise_audio,
        inputs=[input_audio, do_trim, threshold, min_len, max_keep],
        outputs=output_audio
    )

if __name__ == "__main__":
    demo.launch()