"""Gradio app: denoise audio with Demucs DNS64, then intelligently shorten silences."""

import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
from denoiser import pretrained
from denoiser.dsp import convert_audio
from pydub import AudioSegment, silence
from tqdm import tqdm  # NOTE(review): unused at present; kept from original import block

# -----------------------------
# Load model ONCE
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = pretrained.dns64().to(device)


# -----------------------------
# Silence trimming helpers
# -----------------------------
def safe_append(base, chunk, crossfade_ms=30):
    """Append ``chunk`` to ``base`` with a crossfade clamped to both lengths.

    pydub's ``AudioSegment.append`` raises when the crossfade is longer than
    either segment, so the fade is clamped to ``min(crossfade_ms, len(base),
    len(chunk))``; if either side is empty (or the clamp hits 0) we fall back
    to plain concatenation.

    Args:
        base: AudioSegment being built up.
        chunk: AudioSegment to attach at the end.
        crossfade_ms: Desired crossfade duration in milliseconds.

    Returns:
        A new AudioSegment of ``base`` followed by ``chunk``.
    """
    if len(base) > 0 and len(chunk) > 0:
        safe_crossfade = min(crossfade_ms, len(base), len(chunk))
        if safe_crossfade > 0:
            return base.append(chunk, crossfade=safe_crossfade)
    return base + chunk


def shorten_silences(audio, silence_thresh=-50, min_silence_len=400,
                     max_keep_silence=1500, crossfade_ms=30):
    """Detect silences and reduce them using a stepped approach.

    Non-silent stretches are copied through unchanged; each detected silence
    is replaced by a shorter one:

    - Short pauses (< 500 ms) are kept as is.
    - Longer ones are compressed to 500 ms plus 500 ms for every full second
      of original silence, capped at ``max_keep_silence``.

    Args:
        audio: Source AudioSegment.
        silence_thresh: dBFS level below which audio counts as silence.
        min_silence_len: Minimum duration (ms) for a pause to count as silence.
        max_keep_silence: Upper bound (ms) on any kept silence.
        crossfade_ms: Crossfade (ms) used when joining segments.

    Returns:
        A new AudioSegment with silences shortened.
    """
    silent_ranges = silence.detect_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh
    )

    output = AudioSegment.silent(duration=0)
    prev_end = 0

    for start, end in silent_ranges:
        # Add the non-silent chunk preceding this silence.
        output = safe_append(output, audio[prev_end:start], crossfade_ms)

        silence_len = end - start

        # Smart stepped reduction logic:
        # - Short pauses (< 500ms) are kept as is.
        # - Longer ones are compressed in steps of 500ms for every 1000ms extra.
        if silence_len < 500:
            keep = silence_len
        else:
            # Starts at 500ms, adds 500ms for every full second of silence.
            keep = min(max_keep_silence, (silence_len // 1000) * 500 + 500)

        output = safe_append(
            output,
            AudioSegment.silent(duration=int(keep)),
            crossfade_ms
        )
        prev_end = end

    # Add the final non-silent chunk after the last detected silence.
    output = safe_append(output, audio[prev_end:], crossfade_ms)
    return output


# -----------------------------
# Main processing function
# -----------------------------
def denoise_audio(audio_file, trim_silence, silence_thresh,
                  min_silence_len, max_keep_silence):
    """Denoise an audio file and optionally shorten its silences.

    Args:
        audio_file: Path to the uploaded audio file, or None.
        trim_silence: Whether to run silence shortening after denoising.
        silence_thresh: dBFS silence threshold forwarded to shorten_silences.
        min_silence_len: Minimum silence duration (ms) forwarded on.
        max_keep_silence: Maximum kept silence (ms) forwarded on.

    Returns:
        Path to the processed WAV file, or None if no file was given.
    """
    if audio_file is None:
        return None

    # Load audio resampled to 16 kHz (the DNS64 model's rate).
    wav, sr = librosa.load(audio_file, sr=16000)

    chunk_size = 16000 * 10  # 10-second chunks
    denoised_chunks = []

    # Process in chunks to avoid OOM on long recordings.
    for i in range(0, len(wav), chunk_size):
        chunk = wav[i:i + chunk_size]
        wav_tensor = torch.tensor(chunk).unsqueeze(0).to(device)
        wav_tensor = convert_audio(
            wav_tensor, sr, model.sample_rate, model.chin
        )
        with torch.no_grad():
            denoised = model(wav_tensor)[0]
        denoised_chunks.append(denoised.squeeze().cpu().numpy())

    denoised_np = np.concatenate(denoised_chunks)

    # Save denoised audio to a temp file. Close the handle immediately so
    # soundfile/pydub can reopen the path (required on Windows, and avoids
    # leaking the file descriptor).
    tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_wav.close()
    sf.write(tmp_wav.name, denoised_np, model.sample_rate)

    if trim_silence:
        audio = AudioSegment.from_file(tmp_wav.name, format="wav")
        processed = shorten_silences(
            audio,
            silence_thresh=silence_thresh,
            min_silence_len=min_silence_len,
            max_keep_silence=max_keep_silence
        )
        final_file = tempfile.NamedTemporaryFile(
            suffix="_final.wav", delete=False
        )
        final_file.close()  # same Windows/file-descriptor concern as above
        processed.export(final_file.name, format="wav")
        return final_file.name

    return tmp_wav.name


# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="🎧 Advanced Audio Denoiser") as demo:
    gr.Markdown("# 🎧 Audio Denoiser (Demucs DNS64)")
    gr.Markdown("Upload audio to remove noise and intelligently shorten silences.")

    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(type="filepath", label="Upload Audio")

            with gr.Accordion("Silence Trimming Settings", open=True):
                do_trim = gr.Checkbox(label="Enable Silence Trimming", value=True)
                threshold = gr.Slider(
                    minimum=-70, maximum=-20, value=-50, step=1,
                    label="Silence Threshold (dB)",
                    info="Lower is stricter (quieter sounds count as silence)"
                )
                min_len = gr.Slider(
                    minimum=100, maximum=2000, value=400, step=50,
                    label="Min Silence Duration (ms)",
                    info="How long a pause must be to be considered 'silence'"
                )
                max_keep = gr.Slider(
                    minimum=0, maximum=5000, value=1500, step=100,
                    label="Max Silence to Keep (ms)",
                    info="Longer silences will be shortened towards this value"
                )

            submit_btn = gr.Button("Process Audio", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(label="Denoised & Trimmed Output")

    submit_btn.click(
        fn=denoise_audio,
        inputs=[input_audio, do_trim, threshold, min_len, max_keep],
        outputs=output_audio
    )

if __name__ == "__main__":
    demo.launch()