# denoiser / app.py — Hugging Face Space "denoiser" by hmdlohar
# (commit 2054fa6, "add more options")
import torch
import librosa
import soundfile as sf
import numpy as np
import tempfile
import gradio as gr
from denoiser import pretrained
from denoiser.dsp import convert_audio
from pydub import AudioSegment, silence
from tqdm import tqdm
# -----------------------------
# Load model ONCE
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = pretrained.dns64().to(device)
# -----------------------------
# Silence trimming helpers
# -----------------------------
def safe_append(base, chunk, crossfade_ms=30):
    """Concatenate *chunk* onto *base*, crossfading when both are non-empty.

    The crossfade length is clamped to the shorter of the two segments so
    pydub never raises for segments shorter than ``crossfade_ms``.
    """
    if not len(base) or not len(chunk):
        # One side is empty — nothing to blend, plain concatenation.
        return base + chunk
    fade = min(crossfade_ms, len(base), len(chunk))
    if fade <= 0:
        return base + chunk
    return base.append(chunk, crossfade=fade)
def shorten_silences(audio, silence_thresh=-50, min_silence_len=400, max_keep_silence=1500, crossfade_ms=30):
    """Compress long silences in *audio* while leaving speech untouched.

    Silences are detected with pydub and reduced with a stepped rule:
    pauses under 500 ms pass through unchanged; longer pauses keep
    500 ms plus 500 ms for every full second of length, capped at
    ``max_keep_silence`` milliseconds.
    """
    gaps = silence.detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh
    )
    result = AudioSegment.silent(duration=0)
    cursor = 0
    for gap_start, gap_end in gaps:
        # Copy over the speech that precedes this silent gap.
        result = safe_append(result, audio[cursor:gap_start], crossfade_ms)
        gap_len = gap_end - gap_start
        # Stepped reduction (see docstring for the rule).
        keep = gap_len if gap_len < 500 else min(
            max_keep_silence, (gap_len // 1000) * 500 + 500
        )
        result = safe_append(
            result,
            AudioSegment.silent(duration=int(keep)),
            crossfade_ms
        )
        cursor = gap_end
    # Trailing speech after the last detected gap.
    return safe_append(result, audio[cursor:], crossfade_ms)
# -----------------------------
# Main processing function
# -----------------------------
def denoise_audio(audio_file, trim_silence, silence_thresh, min_silence_len, max_keep_silence):
    """Denoise an uploaded audio file and optionally shorten its silences.

    Parameters
    ----------
    audio_file : str | None
        Path to the input audio (Gradio ``filepath`` component).
    trim_silence : bool
        When True, run the silence-shortening pass on the denoised audio.
    silence_thresh : int
        dBFS level below which audio counts as silence.
    min_silence_len : int
        Minimum pause length (ms) that is treated as silence.
    max_keep_silence : int
        Upper bound (ms) on how much of a long silence is kept.

    Returns
    -------
    str | None
        Path to the processed WAV file, or None when there is no usable input.
    """
    if audio_file is None:
        return None

    # Load as mono, resampled to the model's 16 kHz input rate.
    wav, sr = librosa.load(audio_file, sr=16000)
    if len(wav) == 0:
        # Empty recording: nothing to denoise. Without this guard,
        # np.concatenate([]) below would raise ValueError.
        return None

    chunk_size = 16000 * 10  # 10-second chunks to bound (GPU) memory use
    denoised_chunks = []
    for i in range(0, len(wav), chunk_size):
        chunk = wav[i:i + chunk_size]
        wav_tensor = torch.tensor(chunk).unsqueeze(0).to(device)
        # Match the model's expected sample rate and channel count.
        wav_tensor = convert_audio(
            wav_tensor, sr, model.sample_rate, model.chin
        )
        with torch.no_grad():
            denoised = model(wav_tensor)[0]
        denoised_chunks.append(denoised.squeeze().cpu().numpy())
    denoised_np = np.concatenate(denoised_chunks)

    # Persist the denoised signal. Close the handle immediately: we only
    # need the name, and an open handle leaks a descriptor per request
    # (and blocks re-opening by name on Windows).
    tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_wav.close()
    sf.write(tmp_wav.name, denoised_np, model.sample_rate)

    if not trim_silence:
        return tmp_wav.name

    audio = AudioSegment.from_file(tmp_wav.name, format="wav")
    processed = shorten_silences(
        audio,
        silence_thresh=silence_thresh,
        min_silence_len=min_silence_len,
        max_keep_silence=max_keep_silence
    )
    final_file = tempfile.NamedTemporaryFile(suffix="_final.wav", delete=False)
    final_file.close()
    processed.export(final_file.name, format="wav")
    return final_file.name
# -----------------------------
# Gradio UI
# -----------------------------
# Build the Gradio interface. `demo` is the module-level app object.
with gr.Blocks(title="🎧 Advanced Audio Denoiser") as demo:
    gr.Markdown("# 🎧 Audio Denoiser (Demucs DNS64)")
    gr.Markdown("Upload audio to remove noise and intelligently shorten silences.")

    with gr.Row():
        # Left column: input file plus tunable trimming parameters.
        with gr.Column():
            audio_in = gr.Audio(type="filepath", label="Upload Audio")
            with gr.Accordion("Silence Trimming Settings", open=True):
                trim_toggle = gr.Checkbox(label="Enable Silence Trimming", value=True)
                thresh_slider = gr.Slider(
                    minimum=-70, maximum=-20, value=-50, step=1,
                    label="Silence Threshold (dB)",
                    info="Lower is stricter (quieter sounds count as silence)"
                )
                min_len_slider = gr.Slider(
                    minimum=100, maximum=2000, value=400, step=50,
                    label="Min Silence Duration (ms)",
                    info="How long a pause must be to be considered 'silence'"
                )
                max_keep_slider = gr.Slider(
                    minimum=0, maximum=5000, value=1500, step=100,
                    label="Max Silence to Keep (ms)",
                    info="Longer silences will be shortened towards this value"
                )
            run_button = gr.Button("Process Audio", variant="primary")
        # Right column: playback of the processed result.
        with gr.Column():
            audio_out = gr.Audio(label="Denoised & Trimmed Output")

    run_button.click(
        fn=denoise_audio,
        inputs=[audio_in, trim_toggle, thresh_slider, min_len_slider, max_keep_slider],
        outputs=audio_out
    )

if __name__ == "__main__":
    demo.launch()