Spaces:

mfrng
/

AudioVoiceEnhancerAI

Running

App Files Files Community

mfrng committed on Jun 14, 2025

Commit

2ed41fc

verified ·

1 Parent(s): a2b60a5

Update app.py

Browse files

Files changed (1) hide show

app.py +272 -260

app.py CHANGED Viewed

@@ -1,260 +1,272 @@
-import os
-import io
-import tempfile
-import zipfile
-import numpy as np
-import pandas as pd
-import librosa
-import librosa.display
-import matplotlib.pyplot as plt
-import soundfile as sf
-import gradio as gr
-from scipy.signal import medfilt
-from noisereduce import reduce_noise
-import webrtcvad
-from pesq import pesq
-from pystoi import stoi
-# Models placeholder imports
-# from demucs import DemucsModel  # For voice isolation
-# from voicefixer import VoiceFixer  # For audio restoration
-# -- Helper functions --
-def load_audio(file_obj):
-    y, sr = librosa.load(file_obj, sr=16000)
-    return y, sr
-def save_audio(y, sr, path):
-    sf.write(path, y, sr)
-def plot_waveform(y, sr, title):
-    plt.figure(figsize=(10, 2))
-    librosa.display.waveshow(y, sr=sr)
-    plt.title(title)
-    plt.tight_layout()
-    buf = io.BytesIO()
-    plt.savefig(buf, format='png')
-    plt.close()
-    buf.seek(0)
-    return buf
-def plot_spectrogram(y, sr, title):
-    plt.figure(figsize=(10, 4))
-    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
-    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
-    plt.colorbar(format='%+2.0f dB')
-    plt.title(title)
-    plt.tight_layout()
-    buf = io.BytesIO()
-    plt.savefig(buf, format='png')
-    plt.close()
-    buf.seek(0)
-    return buf
-def compute_snr(original, enhanced):
-    noise = original - enhanced
-    snr = 10 * np.log10(np.sum(original ** 2) / np.sum(noise ** 2) + 1e-10)
-    return snr
-def vad_plot(y, sr, title):
-    vad = webrtcvad.Vad(2)  # Aggressiveness 0-3
-    frame_duration = 30  # ms
-    frame_length = int(sr * frame_duration / 1000)
-    frames = [y[i:i+frame_length] for i in range(0, len(y), frame_length)]
-    voiced = [vad.is_speech((frame * 32767).astype(np.int16).tobytes(), sr) for frame in frames]
-    times = np.arange(len(voiced)) * frame_duration / 1000
-    plt.figure(figsize=(10, 2))
-    plt.plot(times, voiced, drawstyle='steps-pre')
-    plt.ylim(-0.1, 1.1)
-    plt.title(title)
-    plt.xlabel('Time (s)')
-    plt.ylabel('Voiced (1) / Unvoiced (0)')
-    plt.tight_layout()
-    buf = io.BytesIO()
-    plt.savefig(buf, format='png')
-    plt.close()
-    buf.seek(0)
-    return buf
-def compute_pesq_mfcc_stoi(original_path, enhanced_path):
-    sr = 16000
-    original, _ = librosa.load(original_path, sr=sr)
-    enhanced, _ = librosa.load(enhanced_path, sr=sr)
-    pesq_score = pesq(sr, original, enhanced, 'wb')
-    stoi_score = stoi(original, enhanced, sr, extended=False)
-    mfcc_orig = librosa.feature.mfcc(y=original, sr=sr, n_mfcc=13)
-    mfcc_enh = librosa.feature.mfcc(y=enhanced, sr=sr, n_mfcc=13)
-    # Compute MFCC distance (mean absolute difference)
-    mfcc_diff = np.mean(np.abs(mfcc_orig - mfcc_enh))
-    return pesq_score, stoi_score, mfcc_diff
-# Enhancement functions
-def noise_reduction(y, sr):
-    return reduce_noise(y=y, sr=sr)
-def voice_isolation(y, sr):
-    # Placeholder: Implement with Demucs or similar
-    # For demo, return input
-    return y
-def reverb_cleanup(y, sr):
-    # Simple dereverberation placeholder: median filtering
-    y_dereverb = medfilt(y, kernel_size=5)
-    return y_dereverb
-def volume_normalize(y):
-    peak = np.max(np.abs(y))
-    if peak > 0:
-        y = y / peak
-    return y
-def language_aware_tuning(y, sr):
-    # Placeholder for EQ adjustments by language
-    # For demo, apply slight high-pass filter
-    y_hp = librosa.effects.preemphasis(y)
-    return y_hp
-# Main processing function
-def process_files(
-    files,
-    noise_reduc,
-    voice_iso,
-    reverb_clean,
-    vol_norm,
-    lang_tune,
-    progress=gr.Progress()
-):
-    results = []
-    metrics = []
-    temp_dir = tempfile.mkdtemp()
-    zip_path = os.path.join(temp_dir, "enhanced_results.zip")
-    zipf = zipfile.ZipFile(zip_path, 'w')
-    total = len(files)
-    for i, file_obj in enumerate(files):
-        progress((i + 1) / total, desc=f"Processing {file_obj.name}")
-        y, sr = load_audio(file_obj)
-        original_y = y.copy()
-        # Enhancement pipeline
-        if noise_reduc:
-            y = noise_reduction(y, sr)
-        if voice_iso:
-            y = voice_isolation(y, sr)
-        if reverb_clean:
-            y = reverb_cleanup(y, sr)
-        if vol_norm:
-            y = volume_normalize(y)
-        if lang_tune:
-            y = language_aware_tuning(y, sr)
-        # Save enhanced audio
-        enhanced_filename = os.path.splitext(file_obj.name)[0] + "_enhanced.wav"
-        enhanced_path = os.path.join(temp_dir, enhanced_filename)
-        save_audio(y, sr, enhanced_path)
-        # Save original audio for comparison
-        original_filename = os.path.splitext(file_obj.name)[0] + "_original.wav"
-        original_path = os.path.join(temp_dir, original_filename)
-        save_audio(original_y, sr, original_path)
-        # Generate plots
-        waveform_orig = plot_waveform(original_y, sr, "Original Waveform")
-        waveform_enh = plot_waveform(y, sr, "Enhanced Waveform")
-        spectrogram_orig = plot_spectrogram(original_y, sr, "Original Spectrogram")
-        spectrogram_enh = plot_spectrogram(y, sr, "Enhanced Spectrogram")
-        vad_orig = vad_plot(original_y, sr, "Original VAD")
-        vad_enh = vad_plot(y, sr, "Enhanced VAD")
-        # Save plots to files and add to zip
-        plot_files = []
-        for img_buf, name in [
-            (waveform_orig, "waveform_original.png"),
-            (waveform_enh, "waveform_enhanced.png"),
-            (spectrogram_orig, "spectrogram_original.png"),
-            (spectrogram_enh, "spectrogram_enhanced.png"),
-            (vad_orig, "vad_original.png"),
-            (vad_enh, "vad_enhanced.png"),
-        ]:
-            path = os.path.join(temp_dir, f"{os.path.splitext(file_obj.name)[0]}_{name}")
-            with open(path, "wb") as f:
-                f.write(img_buf.read())
-            zipf.write(path, arcname=os.path.basename(path))
-            plot_files.append(path)
-        # Compute audio quality metrics
-        try:
-            pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(original_path, enhanced_path)
-        except Exception as e:
-            pesq_score, stoi_score, mfcc_diff = None, None, None
-        snr = compute_snr(original_y, y)
-        # Collect metrics
-        metrics.append({
-            "file": file_obj.name,
-            "SNR (dB)": snr,
-            "PESQ": pesq_score,
-            "STOI": stoi_score,
-            "MFCC Diff": mfcc_diff
-        })
-        # Add original and enhanced audio to zip
-        zipf.write(original_path, arcname=os.path.basename(original_path))
-        zipf.write(enhanced_path, arcname=os.path.basename(enhanced_path))
-    # Save metrics CSV
-    metrics_df = pd.DataFrame(metrics)
-    csv_path = os.path.join(temp_dir, "metrics.csv")
-    metrics_df.to_csv(csv_path, index=False)
-    zipf.write(csv_path, arcname="metrics.csv")
-    zipf.close()
-    return zip_path
-# Gradio UI
-with gr.Blocks() as demo:
-    gr.Markdown("# AudioVoiceEnhancer.AI - Audio Enhancement for Transcription & Translation")
-    with gr.Row():
-        audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple", interactive=True)
-    with gr.Row():
-        noise_checkbox = gr.Checkbox(label="Noise Reduction", info="Reduce background noise")
-        voice_iso_checkbox = gr.Checkbox(label="Voice Isolation", info="Isolate voice from background")
-        reverb_checkbox = gr.Checkbox(label="Reverberation Cleanup", info="Reduce echo/reverb effects")
-        volume_checkbox = gr.Checkbox(label="Volume Normalization", info="Normalize audio volume")
-        lang_checkbox = gr.Checkbox(label="Language-aware Tuning", info="Tune audio clarity based on language")
-    enhance_btn = gr.Button("Enhance Audio")
-    output_zip = gr.File(label="Download ZIP of Enhanced Audio and Reports")
-    progress_bar = gr.Label(value="Upload files and select enhancement options.")
-    def run_enhancement(files, nr, vi, reverb, vol, lang):
-        if not files or len(files) == 0:
-            return None, "Please upload at least one audio file."
-        path = process_files(files, nr, vi, reverb, vol, lang)
-        return path, "Processing complete. Download your ZIP file below."
-    enhance_btn.click(
-        fn=run_enhancement,
-        inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox],
-        outputs=[output_zip, progress_bar],
-        show_progress=True,
-    )
-demo.launch()

+import os
+import io
+import tempfile
+import zipfile
+import numpy as np
+import pandas as pd
+import librosa
+import librosa.display
+import matplotlib.pyplot as plt
+import soundfile as sf
+import gradio as gr
+from scipy.signal import medfilt
+from noisereduce import reduce_noise
+import webrtcvad
+from pesq import pesq
+from pystoi import stoi
+# Models placeholder imports
+# from demucs import DemucsModel  # For voice isolation
+# from voicefixer import VoiceFixer  # For audio restoration
+# -- Helper functions --
+def load_audio(file_obj):
+    """Decode an audio source with librosa, requesting a 16 kHz sample rate.
+
+    Returns the ``(samples, sample_rate)`` pair produced by ``librosa.load``.
+    """
+    samples, rate = librosa.load(file_obj, sr=16000)
+    return samples, rate
+def save_audio(y, sr, path):
+    """Write the samples ``y`` to ``path`` with soundfile at sample rate ``sr``."""
+    sf.write(path, y, samplerate=sr)
+def plot_waveform(y, sr, title):
+    """Draw the waveform of ``y`` and return the PNG image as a rewound BytesIO."""
+    fig = plt.figure(figsize=(10, 2))
+    librosa.display.waveshow(y, sr=sr)
+    plt.title(title)
+    fig.tight_layout()
+    png = io.BytesIO()
+    fig.savefig(png, format='png')
+    plt.close(fig)
+    # Rewind so the caller can read the image from the first byte.
+    png.seek(0)
+    return png
+def plot_spectrogram(y, sr, title):
+    """Draw a log-frequency dB spectrogram of ``y`` and return it as a rewound PNG BytesIO."""
+    fig = plt.figure(figsize=(10, 4))
+    # STFT magnitude converted to dB relative to the loudest bin.
+    magnitude = np.abs(librosa.stft(y))
+    db = librosa.amplitude_to_db(magnitude, ref=np.max)
+    librosa.display.specshow(db, sr=sr, x_axis='time', y_axis='log')
+    plt.colorbar(format='%+2.0f dB')
+    plt.title(title)
+    fig.tight_layout()
+    png = io.BytesIO()
+    fig.savefig(png, format='png')
+    plt.close(fig)
+    png.seek(0)
+    return png
+def compute_snr(original, enhanced):
+    """Return the signal-to-noise ratio in dB, treating ``original - enhanced`` as noise.
+
+    Args:
+        original: Reference samples.
+        enhanced: Processed samples, same shape as ``original``.
+
+    Returns:
+        ``10 * log10(signal_power / noise_power)`` with a small epsilon added
+        to both powers, so identical or silent inputs give a finite value
+        instead of dividing by zero.
+    """
+    eps = 1e-10
+    noise = original - enhanced
+    signal_power = np.sum(original ** 2)
+    noise_power = np.sum(noise ** 2)
+    # The epsilon has to guard the denominator and the log argument; adding it
+    # only to the finished ratio would still divide by zero when noise is 0.
+    return 10 * np.log10((signal_power + eps) / (noise_power + eps))
+def vad_plot(y, sr, title):
+    """Plot WebRTC voice-activity decisions for ``y`` and return the PNG as a BytesIO.
+
+    Args:
+        y: Mono float samples, expected in [-1, 1] (values outside are clipped).
+        sr: Sample rate of ``y``; anything other than 16000 is resampled to 16 kHz.
+        title: Title drawn on the plot.
+
+    Returns:
+        A rewound ``io.BytesIO`` holding the PNG image, which is what
+        ``process_files`` reads and writes to disk.
+    """
+    # webrtcvad requires 16-bit mono PCM and 10/20/30 ms frames.
+    vad = webrtcvad.Vad(2)  # Aggressiveness 0-3
+    if sr != 16000:
+        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
+        sr = 16000
+    frame_duration_ms = 30  # Can be 10, 20, or 30
+    frame_size = int(sr * frame_duration_ms / 1000)  # samples per frame
+    # Zero-pad so the signal splits into whole frames.
+    remainder = len(y) % frame_size
+    if remainder:
+        y = np.pad(y, (0, frame_size - remainder))
+    # Clip before scaling: samples beyond +/-1 would wrap around in int16.
+    pcm = (np.clip(y, -1.0, 1.0) * 32767).astype(np.int16)
+    voiced = []
+    # reshape(-1, frame_size) also copes with an empty signal (zero frames).
+    for frame in pcm.reshape(-1, frame_size):
+        try:
+            voiced.append(vad.is_speech(frame.tobytes(), sr))
+        except Exception as e:
+            # Best effort: a frame the VAD rejects is drawn as unvoiced.
+            print("VAD error:", e)
+            voiced.append(False)
+    times = np.arange(len(voiced)) * frame_duration_ms / 1000
+    plt.figure(figsize=(10, 2))
+    plt.plot(times, np.asarray(voiced, dtype=int), drawstyle='steps-pre')
+    plt.ylim(-0.1, 1.1)
+    plt.title(title)
+    plt.xlabel('Time (s)')
+    plt.ylabel('Voiced (1) / Unvoiced (0)')
+    plt.tight_layout()
+    buf = io.BytesIO()
+    plt.savefig(buf, format='png')
+    plt.close()
+    buf.seek(0)
+    return buf
+def compute_pesq_mfcc_stoi(original_path, enhanced_path):
+    """Score an enhanced file against its original.
+
+    Both files are loaded at 16 kHz. Returns ``(pesq_score, stoi_score,
+    mfcc_diff)``: PESQ in ``'wb'`` mode, non-extended STOI, and the mean
+    absolute difference between the two 13-coefficient MFCC matrices.
+    """
+    rate = 16000
+    reference = librosa.load(original_path, sr=rate)[0]
+    degraded = librosa.load(enhanced_path, sr=rate)[0]
+    pesq_score = pesq(rate, reference, degraded, 'wb')
+    stoi_score = stoi(reference, degraded, rate, extended=False)
+    reference_mfcc = librosa.feature.mfcc(y=reference, sr=rate, n_mfcc=13)
+    degraded_mfcc = librosa.feature.mfcc(y=degraded, sr=rate, n_mfcc=13)
+    mfcc_diff = np.abs(reference_mfcc - degraded_mfcc).mean()
+    return pesq_score, stoi_score, mfcc_diff
+# Enhancement functions
+def noise_reduction(y, sr):
+    """Run noisereduce's ``reduce_noise`` on ``y`` with its default settings."""
+    denoised = reduce_noise(y=y, sr=sr)
+    return denoised
+def voice_isolation(y, sr):
+    """Placeholder for voice isolation; currently hands ``y`` back unchanged.
+
+    A source-separation model (Demucs or similar) is meant to be plugged in here.
+    """
+    return y
+def reverb_cleanup(y, sr):
+    """Stand-in dereverberation: median-filter ``y`` with a 5-sample kernel."""
+    return medfilt(y, kernel_size=5)
+def volume_normalize(y):
+    """Scale ``y`` so its largest absolute sample is 1; all-zero input is returned as is."""
+    loudest = np.abs(y).max()
+    return y / loudest if loudest > 0 else y
+def language_aware_tuning(y, sr):
+    """Placeholder for per-language EQ; for now applies librosa's pre-emphasis to ``y``."""
+    return librosa.effects.preemphasis(y)
+# Main processing function
+def process_files(
+    files,
+    noise_reduc,
+    voice_iso,
+    reverb_clean,
+    vol_norm,
+    lang_tune,
+    progress=gr.Progress()
+):
+    """Enhance each uploaded file and bundle audio, plots and metrics into a ZIP.
+
+    Args:
+        files: Uploaded files. Each item is passed to ``load_audio``; its
+            ``name`` attribute (or the item itself when it has none) is
+            used as the source path for naming outputs.
+        noise_reduc: Apply ``noise_reduction``.
+        voice_iso: Apply ``voice_isolation``.
+        reverb_clean: Apply ``reverb_cleanup``.
+        vol_norm: Apply ``volume_normalize``.
+        lang_tune: Apply ``language_aware_tuning``.
+        progress: Gradio progress tracker, updated once per file.
+
+    Returns:
+        Path of ``enhanced_results.zip`` inside a fresh temporary directory.
+    """
+    metrics = []
+    temp_dir = tempfile.mkdtemp()
+    zip_path = os.path.join(temp_dir, "enhanced_results.zip")
+    total = len(files)
+    # The context manager closes the archive even when a file fails mid-way.
+    with zipfile.ZipFile(zip_path, 'w') as zipf:
+        for i, file_obj in enumerate(files):
+            source_name = getattr(file_obj, "name", file_obj)
+            # Keep only the base name: os.path.join(temp_dir, <absolute path>)
+            # would discard temp_dir and write next to the uploaded file.
+            stem = os.path.splitext(os.path.basename(source_name))[0]
+            progress((i + 1) / total, desc=f"Processing {source_name}")
+            y, sr = load_audio(file_obj)
+            original_y = y.copy()
+            # Enhancement pipeline (steps run in this fixed order)
+            if noise_reduc:
+                y = noise_reduction(y, sr)
+            if voice_iso:
+                y = voice_isolation(y, sr)
+            if reverb_clean:
+                y = reverb_cleanup(y, sr)
+            if vol_norm:
+                y = volume_normalize(y)
+            if lang_tune:
+                y = language_aware_tuning(y, sr)
+            # Save enhanced audio and the original for comparison
+            enhanced_path = os.path.join(temp_dir, f"{stem}_enhanced.wav")
+            save_audio(y, sr, enhanced_path)
+            original_path = os.path.join(temp_dir, f"{stem}_original.wav")
+            save_audio(original_y, sr, original_path)
+            # Generate plots; each helper is expected to return a readable PNG buffer
+            plots = [
+                (plot_waveform(original_y, sr, "Original Waveform"), "waveform_original.png"),
+                (plot_waveform(y, sr, "Enhanced Waveform"), "waveform_enhanced.png"),
+                (plot_spectrogram(original_y, sr, "Original Spectrogram"), "spectrogram_original.png"),
+                (plot_spectrogram(y, sr, "Enhanced Spectrogram"), "spectrogram_enhanced.png"),
+                (vad_plot(original_y, sr, "Original VAD"), "vad_original.png"),
+                (vad_plot(y, sr, "Enhanced VAD"), "vad_enhanced.png"),
+            ]
+            # Save plots to files and add to zip
+            for img_buf, name in plots:
+                path = os.path.join(temp_dir, f"{stem}_{name}")
+                with open(path, "wb") as f:
+                    f.write(img_buf.read())
+                zipf.write(path, arcname=os.path.basename(path))
+            # Compute audio quality metrics
+            try:
+                pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(original_path, enhanced_path)
+            except Exception as e:
+                # Best effort: report the failure and leave these metrics blank
+                # rather than aborting the whole batch.
+                print(f"Metric computation failed for {source_name}:", e)
+                pesq_score, stoi_score, mfcc_diff = None, None, None
+            snr = compute_snr(original_y, y)
+            # Collect metrics
+            metrics.append({
+                "file": source_name,
+                "SNR (dB)": snr,
+                "PESQ": pesq_score,
+                "STOI": stoi_score,
+                "MFCC Diff": mfcc_diff
+            })
+            # Add original and enhanced audio to zip
+            zipf.write(original_path, arcname=os.path.basename(original_path))
+            zipf.write(enhanced_path, arcname=os.path.basename(enhanced_path))
+        # Save metrics CSV
+        metrics_df = pd.DataFrame(metrics)
+        csv_path = os.path.join(temp_dir, "metrics.csv")
+        metrics_df.to_csv(csv_path, index=False)
+        zipf.write(csv_path, arcname="metrics.csv")
+    return zip_path
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("# AudioVoiceEnhancer.AI - Audio Enhancement for Transcription & Translation")
+    # Upload area: one or more audio files.
+    with gr.Row():
+        audio_files = gr.File(
+            label="Upload Audio Files",
+            file_types=['.wav', '.mp3', '.flac'],
+            file_count="multiple",
+            interactive=True,
+        )
+    # One checkbox per optional enhancement step.
+    with gr.Row():
+        noise_checkbox = gr.Checkbox(label="Noise Reduction", info="Reduce background noise")
+        voice_iso_checkbox = gr.Checkbox(label="Voice Isolation", info="Isolate voice from background")
+        reverb_checkbox = gr.Checkbox(label="Reverberation Cleanup", info="Reduce echo/reverb effects")
+        volume_checkbox = gr.Checkbox(label="Volume Normalization", info="Normalize audio volume")
+        lang_checkbox = gr.Checkbox(label="Language-aware Tuning", info="Tune audio clarity based on language")
+    enhance_btn = gr.Button("Enhance Audio")
+    output_zip = gr.File(label="Download ZIP of Enhanced Audio and Reports")
+    progress_bar = gr.Label(value="Upload files and select enhancement options.")
+    def run_enhancement(files, nr, vi, reverb, vol, lang):
+        """Run ``process_files`` on the uploads; return (zip path or None, status message)."""
+        if not files:
+            return None, "Please upload at least one audio file."
+        archive = process_files(files, nr, vi, reverb, vol, lang)
+        return archive, "Processing complete. Download your ZIP file below."
+    enhance_btn.click(
+        fn=run_enhancement,
+        inputs=[
+            audio_files,
+            noise_checkbox,
+            voice_iso_checkbox,
+            reverb_checkbox,
+            volume_checkbox,
+            lang_checkbox,
+        ],
+        outputs=[output_zip, progress_bar],
+        show_progress=True,
+    )
+demo.launch()