import os
import io
import tempfile
import zipfile

import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import soundfile as sf
import gradio as gr
from scipy.signal import medfilt
from noisereduce import reduce_noise
import webrtcvad
from pesq import pesq
from pystoi import stoi


def load_audio(file_obj):
    """Load an uploaded file from its temp path and resample to 16 kHz mono."""
    y, sr = librosa.load(file_obj.name, sr=16000)
    return y, sr


def save_audio(y, sr, path):
    sf.write(path, y, sr)


def plot_waveform(y, sr, title):
    """Render a waveform plot and return it as an in-memory PNG buffer."""
    plt.figure(figsize=(10, 2))
    librosa.display.waveshow(y, sr=sr)
    plt.title(title)
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)
    return buf


def plot_spectrogram(y, sr, title):
    """Render a log-frequency spectrogram and return it as an in-memory PNG buffer."""
    plt.figure(figsize=(10, 3))
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)
    return buf


def vad_plot(y, sr, title):
    """Plot per-frame voice-activity decisions from WebRTC VAD (aggressiveness 2)."""
    vad = webrtcvad.Vad(2)
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        sr = 16000
    frame_duration_ms = 30
    frame_size = int(sr * frame_duration_ms / 1000)
    # Pad so the signal splits evenly into 30 ms frames.
    if len(y) % frame_size != 0:
        y = np.pad(y, (0, frame_size - len(y) % frame_size))
    frames = np.split(y, len(y) // frame_size)
    voiced = []
    for frame in frames:
        pcm = (frame * 32767).astype(np.int16).tobytes()
        try:
            voiced.append(vad.is_speech(pcm, sr))
        except Exception:
            voiced.append(False)
    plt.figure(figsize=(10, 1.5))
    plt.plot(voiced, drawstyle='steps-mid')
    plt.title(title)
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)
    return buf


def compute_pesq_mfcc_stoi(original_path, enhanced_path):
    """Compute wideband PESQ, STOI, and mean absolute MFCC difference for a file pair."""
    sr = 16000
    original, _ = librosa.load(original_path, sr=sr)
    enhanced, _ = librosa.load(enhanced_path, sr=sr)
    pesq_score = pesq(sr, original, enhanced, 'wb')
    stoi_score = stoi(original, enhanced, sr, extended=False)
    mfcc_diff = np.mean(np.abs(
        librosa.feature.mfcc(y=original, sr=sr, n_mfcc=13)
        - librosa.feature.mfcc(y=enhanced, sr=sr, n_mfcc=13)
    ))
    return pesq_score, stoi_score, mfcc_diff


def compute_snr(original, enhanced):
    """SNR in dB, treating (original - enhanced) as the removed noise."""
    noise = original - enhanced
    snr = 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + 1e-9))
    return snr


def noise_reduction(y, sr):
    return reduce_noise(y=y, sr=sr)


def voice_isolation(y, sr):
    return y  # Placeholder: no source separation applied yet


def reverb_cleanup(y, sr):
    return medfilt(y, kernel_size=5)


def volume_normalize(y):
    peak = np.max(np.abs(y))
    return y / peak if peak > 0 else y


def language_aware_tuning(y, sr):
    return librosa.effects.preemphasis(y)


def process_files(files, nr, vi, reverb, vol, lang, skip_metrics=False, progress=gr.Progress()):
    metrics = []
    temp_dir = tempfile.mkdtemp()
    zip_path = os.path.join(temp_dir, "enhanced_output.zip")
    zipf = zipfile.ZipFile(zip_path, 'w')
    total = len(files)

    for i, file_obj in enumerate(files):
        progress((i + 1) / total, desc=f"Processing {os.path.basename(file_obj.name)}")
        y, sr = load_audio(file_obj)
        original_y = y.copy()

        # Apply the selected enhancement stages in order.
        if nr:
            y = noise_reduction(y, sr)
        if vi:
            y = voice_isolation(y, sr)
        if reverb:
            y = reverb_cleanup(y, sr)
        if vol:
            y = volume_normalize(y)
        if lang:
            y = language_aware_tuning(y, sr)

        # Use the basename so the output paths stay inside temp_dir.
        name = os.path.splitext(os.path.basename(file_obj.name))[0]
        orig_path = os.path.join(temp_dir, f"{name}_original.wav")
        enh_path = os.path.join(temp_dir, f"{name}_enhanced.wav")
        save_audio(original_y, sr, orig_path)
        save_audio(y, sr, enh_path)

        # Save waveform, spectrogram, and VAD plots for both the original and enhanced signals.
        for plot_func, label in [(plot_waveform, "waveform"),
                                 (plot_spectrogram, "spectrogram"),
                                 (vad_plot, "vad")]:
            for typ, signal in [("original", original_y), ("enhanced", y)]:
                buf = plot_func(signal, sr, f"{typ.title()} {label.title()}")
                img_path = os.path.join(temp_dir, f"{name}_{label}_{typ}.png")
                with open(img_path, "wb") as f:
                    f.write(buf.read())
                zipf.write(img_path, arcname=os.path.basename(img_path))

        if skip_metrics:
            pesq_score = stoi_score = mfcc_diff = None
        else:
            try:
                pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(orig_path, enh_path)
            except Exception:
                pesq_score, stoi_score, mfcc_diff = None, None, None
        snr = compute_snr(original_y, y)
        metrics.append({
            "file": os.path.basename(file_obj.name),
            "SNR": snr,
            "PESQ": pesq_score,
            "STOI": stoi_score,
            "MFCC Diff": mfcc_diff
        })
        zipf.write(orig_path, arcname=os.path.basename(orig_path))
        zipf.write(enh_path, arcname=os.path.basename(enh_path))

    df = pd.DataFrame(metrics)
    metrics_path = os.path.join(temp_dir, "metrics.csv")
    df.to_csv(metrics_path, index=False)
    zipf.write(metrics_path, arcname="metrics.csv")
    zipf.close()

    enhanced_files = [f for f in os.listdir(temp_dir) if f.endswith("_enhanced.wav")]
    preview_path = os.path.join(temp_dir, enhanced_files[0]) if enhanced_files else None
    return zip_path, preview_path


def run_enhancement(files, nr, vi, reverb, vol, lang, skip_metrics):
    if not files:
        return None, None, "Upload audio files.", gr.update(visible=False)
    if not any([nr, vi, reverb, vol, lang]):
        return None, None, "Select at least one enhancement.", gr.update(visible=True, value="No enhancements selected.")
    zip_path, preview = process_files(files, nr, vi, reverb, vol, lang, skip_metrics)
    return zip_path, preview, "Done!", gr.update(visible=False)


with gr.Blocks() as demo:
    gr.Markdown("## 🎧 AudioVoiceEnhancer.AI")
    files = gr.File(label="Upload Audio", file_types=[".wav", ".mp3"], file_count="multiple")
    with gr.Row():
        nr = gr.Checkbox(label="Noise Reduction", value=True)
        vi = gr.Checkbox(label="Voice Isolation", value=True)
        reverb = gr.Checkbox(label="Reverb Cleanup", value=True)
        vol = gr.Checkbox(label="Volume Normalize", value=True)
        lang = gr.Checkbox(label="Language-Aware Tuning", value=True)
        skip_metrics = gr.Checkbox(label="🚀 Skip PESQ/STOI for Speed", value=True)
    run_btn = gr.Button("Enhance Audio")
    warning = gr.Textbox(visible=False, label="Warning")
    output_zip = gr.File(label="Download ZIP")
    output_audio = gr.Audio(label="Preview Enhanced", type="filepath")
    label = gr.Label("Status")

    run_btn.click(
        fn=run_enhancement,
        inputs=[files, nr, vi, reverb, vol, lang, skip_metrics],
        outputs=[output_zip, output_audio, label, warning],
        show_progress=True
    )

demo.queue()
demo.launch()