mfrng committed on
Commit
40b42fc
·
verified ·
1 Parent(s): 9beb94a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +260 -0
  2. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import tempfile
4
+ import zipfile
5
+ import numpy as np
6
+ import pandas as pd
7
+ import librosa
8
+ import librosa.display
9
+ import matplotlib.pyplot as plt
10
+ import soundfile as sf
11
+ import gradio as gr
12
+
13
+ from scipy.signal import medfilt
14
+ from noisereduce import reduce_noise
15
+ import webrtcvad
16
+ from pesq import pesq
17
+ from pystoi import stoi
18
+
19
+ # Models placeholder imports
20
+ # from demucs import DemucsModel # For voice isolation
21
+ # from voicefixer import VoiceFixer # For audio restoration
22
+
23
+ # -- Helper functions --
24
+
25
def load_audio(file_obj):
    """Decode an uploaded audio file, resampled to a fixed 16 kHz rate.

    Args:
        file_obj: path or file-like object accepted by ``librosa.load``.

    Returns:
        (samples, rate): float waveform array and the 16000 Hz rate.
    """
    target_rate = 16000
    samples, rate = librosa.load(file_obj, sr=target_rate)
    return samples, rate
28
+
29
def save_audio(y, sr, path):
    """Persist waveform ``y`` at sample rate ``sr`` to ``path`` via soundfile."""
    sf.write(path, y, sr)
31
+
32
def plot_waveform(y, sr, title):
    """Render the waveform of ``y`` to a PNG held in an in-memory buffer.

    Args:
        y: waveform samples.
        sr: sample rate in Hz.
        title: figure title.

    Returns:
        ``io.BytesIO`` positioned at 0, containing the PNG bytes.
    """
    plt.figure(figsize=(10, 2))
    librosa.display.waveshow(y, sr=sr)
    plt.title(title)
    plt.tight_layout()
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close()
    png_buffer.seek(0)
    return png_buffer
42
+
43
def plot_spectrogram(y, sr, title):
    """Render a log-frequency dB spectrogram of ``y`` as an in-memory PNG.

    Args:
        y: waveform samples.
        sr: sample rate in Hz.
        title: figure title.

    Returns:
        ``io.BytesIO`` positioned at 0, containing the PNG bytes.
    """
    plt.figure(figsize=(10, 4))
    # STFT magnitude converted to dB relative to the peak.
    db_spec = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    librosa.display.specshow(db_spec, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.tight_layout()
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close()
    png_buffer.seek(0)
    return png_buffer
55
+
56
def compute_snr(original, enhanced):
    """Signal-to-noise ratio in dB, treating ``original - enhanced`` as noise.

    Args:
        original: reference waveform (numpy array).
        enhanced: processed waveform, same shape as ``original``.

    Returns:
        SNR in dB; large when the two signals are nearly identical.
    """
    noise = original - enhanced
    # The epsilon must guard the DENOMINATOR: the original code added it to
    # the ratio (a/b + eps), which still divided by zero when the enhanced
    # signal equals the original.
    snr = 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + 1e-10))
    return snr
60
+
61
def vad_plot(y, sr, title):
    """Plot a frame-level voiced/unvoiced track computed with WebRTC VAD.

    Args:
        y: float waveform in [-1, 1].
        sr: sample rate in Hz. NOTE(review): webrtcvad only accepts
            8/16/32/48 kHz — true for the 16 kHz loader in this app,
            but verify if callers change.
        title: figure title.

    Returns:
        ``io.BytesIO`` positioned at 0, containing the PNG bytes.
    """
    vad = webrtcvad.Vad(2)  # Aggressiveness 0 (permissive) - 3 (strict)
    frame_duration = 30  # ms; webrtcvad accepts only 10/20/30 ms frames
    frame_length = int(sr * frame_duration / 1000)
    # Keep only complete frames: webrtcvad raises on a trailing partial
    # frame, which the original code produced whenever len(y) was not an
    # exact multiple of frame_length.
    n_frames = len(y) // frame_length
    frames = [y[i * frame_length:(i + 1) * frame_length] for i in range(n_frames)]
    # Convert each float frame to 16-bit PCM bytes as webrtcvad expects.
    voiced = [
        vad.is_speech((frame * 32767).astype(np.int16).tobytes(), sr)
        for frame in frames
    ]
    times = np.arange(len(voiced)) * frame_duration / 1000

    plt.figure(figsize=(10, 2))
    plt.plot(times, voiced, drawstyle='steps-pre')
    plt.ylim(-0.1, 1.1)
    plt.title(title)
    plt.xlabel('Time (s)')
    plt.ylabel('Voiced (1) / Unvoiced (0)')
    plt.tight_layout()
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)
    return buf
81
+
82
def compute_pesq_mfcc_stoi(original_path, enhanced_path):
    """Score enhancement quality by comparing two audio files at 16 kHz.

    Args:
        original_path: path to the reference (unprocessed) WAV.
        enhanced_path: path to the processed WAV.

    Returns:
        (pesq_score, stoi_score, mfcc_diff): wideband PESQ, non-extended
        STOI, and the mean absolute difference of 13 MFCC coefficients.
    """
    target_sr = 16000
    reference, _ = librosa.load(original_path, sr=target_sr)
    degraded, _ = librosa.load(enhanced_path, sr=target_sr)

    pesq_score = pesq(target_sr, reference, degraded, 'wb')
    stoi_score = stoi(reference, degraded, target_sr, extended=False)

    # Mean absolute MFCC difference as a cheap spectral-envelope distance.
    mfcc_ref = librosa.feature.mfcc(y=reference, sr=target_sr, n_mfcc=13)
    mfcc_deg = librosa.feature.mfcc(y=degraded, sr=target_sr, n_mfcc=13)
    mfcc_diff = np.mean(np.abs(mfcc_ref - mfcc_deg))

    return pesq_score, stoi_score, mfcc_diff
97
+
98
+ # Enhancement functions
99
+
100
def noise_reduction(y, sr):
    """Suppress background noise in ``y`` using the noisereduce library."""
    denoised = reduce_noise(y=y, sr=sr)
    return denoised
102
+
103
def voice_isolation(y, sr):
    """Isolate the voice track from background audio.

    NOTE: currently a pass-through placeholder — a source-separation
    model (e.g. Demucs) would go here. Returns ``y`` unchanged.
    """
    return y
107
+
108
def reverb_cleanup(y, sr):
    """Crude dereverberation placeholder: a 5-tap median filter.

    A real dereverberation algorithm would replace this; the median
    filter merely smooths short transients. ``sr`` is unused but kept
    for pipeline-stage signature consistency.
    """
    return medfilt(y, kernel_size=5)
112
+
113
def volume_normalize(y):
    """Peak-normalize ``y`` into [-1, 1]; all-zero input is returned as-is."""
    peak = np.max(np.abs(y))
    return y / peak if peak > 0 else y
118
+
119
def language_aware_tuning(y, sr):
    """Language-aware clarity tuning (placeholder).

    Intended home for per-language EQ adjustments; for now it applies
    pre-emphasis, a mild high-frequency boost. ``sr`` is unused but kept
    for pipeline-stage signature consistency.
    """
    return librosa.effects.preemphasis(y)
124
+
125
+ # Main processing function
126
+
127
def process_files(
    files,
    noise_reduc,
    voice_iso,
    reverb_clean,
    vol_norm,
    lang_tune,
    progress=gr.Progress()
):
    """Run the selected enhancement pipeline over the uploaded files.

    For each file: saves original and enhanced WAVs, renders comparison
    plots (waveform, spectrogram, VAD), computes quality metrics, and
    bundles everything plus a metrics CSV into one ZIP archive.

    Args:
        files: list of uploaded file objects, each exposing a ``.name`` path.
        noise_reduc, voice_iso, reverb_clean, vol_norm, lang_tune:
            booleans toggling the corresponding pipeline stage.
        progress: gradio progress reporter.

    Returns:
        Filesystem path of the resulting ZIP archive.
    """
    metrics = []
    temp_dir = tempfile.mkdtemp()
    zip_path = os.path.join(temp_dir, "enhanced_results.zip")

    # Context manager guarantees the archive is finalized/closed even if a
    # file fails mid-loop (the original leaked the handle on error).
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        total = len(files)
        for i, file_obj in enumerate(files):
            progress((i + 1) / total, desc=f"Processing {file_obj.name}")

            y, sr = load_audio(file_obj)
            original_y = y.copy()

            # Enhancement pipeline; each stage is independently optional.
            if noise_reduc:
                y = noise_reduction(y, sr)
            if voice_iso:
                y = voice_isolation(y, sr)
            if reverb_clean:
                y = reverb_cleanup(y, sr)
            if vol_norm:
                y = volume_normalize(y)
            if lang_tune:
                y = language_aware_tuning(y, sr)

            # basename() guards against file_obj.name being an absolute temp
            # path (gradio uploads are): splitext on an absolute path made
            # os.path.join ignore temp_dir entirely in the original code.
            base = os.path.splitext(os.path.basename(file_obj.name))[0]

            # Save enhanced and original audio for side-by-side comparison.
            enhanced_path = os.path.join(temp_dir, base + "_enhanced.wav")
            save_audio(y, sr, enhanced_path)

            original_path = os.path.join(temp_dir, base + "_original.wav")
            save_audio(original_y, sr, original_path)

            # Render before/after plots and stream them into the archive.
            plots = [
                (plot_waveform(original_y, sr, "Original Waveform"), "waveform_original.png"),
                (plot_waveform(y, sr, "Enhanced Waveform"), "waveform_enhanced.png"),
                (plot_spectrogram(original_y, sr, "Original Spectrogram"), "spectrogram_original.png"),
                (plot_spectrogram(y, sr, "Enhanced Spectrogram"), "spectrogram_enhanced.png"),
                (vad_plot(original_y, sr, "Original VAD"), "vad_original.png"),
                (vad_plot(y, sr, "Enhanced VAD"), "vad_enhanced.png"),
            ]
            for img_buf, name in plots:
                path = os.path.join(temp_dir, f"{base}_{name}")
                with open(path, "wb") as f:
                    f.write(img_buf.read())
                zipf.write(path, arcname=os.path.basename(path))

            # Objective quality metrics. PESQ/STOI may fail (e.g. on very
            # short or silent clips), so record None instead of aborting
            # the whole batch.
            try:
                pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(
                    original_path, enhanced_path)
            except Exception:
                pesq_score, stoi_score, mfcc_diff = None, None, None

            snr = compute_snr(original_y, y)

            metrics.append({
                "file": file_obj.name,
                "SNR (dB)": snr,
                "PESQ": pesq_score,
                "STOI": stoi_score,
                "MFCC Diff": mfcc_diff,
            })

            # Include both audio versions in the archive.
            zipf.write(original_path, arcname=os.path.basename(original_path))
            zipf.write(enhanced_path, arcname=os.path.basename(enhanced_path))

        # Per-file metrics table, included in the archive as CSV.
        metrics_df = pd.DataFrame(metrics)
        csv_path = os.path.join(temp_dir, "metrics.csv")
        metrics_df.to_csv(csv_path, index=False)
        zipf.write(csv_path, arcname="metrics.csv")

    return zip_path
226
+
227
+ # Gradio UI
228
+
229
# Gradio Blocks UI: file upload, per-stage toggles, and a ZIP download.
with gr.Blocks() as demo:
    gr.Markdown("# AudioVoiceEnhancer.AI - Audio Enhancement for Transcription & Translation")

    with gr.Row():
        # Multiple uploads allowed; restricted to common audio formats.
        audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple", interactive=True)
    with gr.Row():
        # One checkbox per optional pipeline stage in process_files().
        noise_checkbox = gr.Checkbox(label="Noise Reduction", info="Reduce background noise")
        voice_iso_checkbox = gr.Checkbox(label="Voice Isolation", info="Isolate voice from background")
        reverb_checkbox = gr.Checkbox(label="Reverberation Cleanup", info="Reduce echo/reverb effects")
        volume_checkbox = gr.Checkbox(label="Volume Normalization", info="Normalize audio volume")
        lang_checkbox = gr.Checkbox(label="Language-aware Tuning", info="Tune audio clarity based on language")

    enhance_btn = gr.Button("Enhance Audio")

    output_zip = gr.File(label="Download ZIP of Enhanced Audio and Reports")

    # NOTE(review): despite the name this is a status text Label, not a
    # progress bar; actual progress comes from gr.Progress in process_files.
    progress_bar = gr.Label(value="Upload files and select enhancement options.")

    def run_enhancement(files, nr, vi, reverb, vol, lang):
        # Returns (zip_path_or_None, status_message) matching the two outputs.
        if not files or len(files) == 0:
            return None, "Please upload at least one audio file."
        path = process_files(files, nr, vi, reverb, vol, lang)
        return path, "Processing complete. Download your ZIP file below."

    # Wire the button to the pipeline: checkbox states map positionally to
    # run_enhancement's stage flags.
    enhance_btn.click(
        fn=run_enhancement,
        inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox],
        outputs=[output_zip, progress_bar],
        show_progress=True,
    )

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==3.37.0
2
+ librosa==0.9.2
3
+ matplotlib==3.7.1
4
+ soundfile==0.12.1
5
+ numpy==1.24.3
6
+ pandas==1.5.3
7
+ scipy==1.10.1
8
+ noisereduce==2.0.1
9
+ webrtcvad==2.0.10
10
+ pesq==0.0.3
11
+ pystoi==0.3.5