Spaces:

mfrng
/

AudioVoiceEnhancerAI

Build error

App Files Files Community

mfrng committed on Jun 16, 2025

Commit

798cb1c

verified ·

1 Parent(s): 2c7a0a6

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -119

app.py CHANGED Viewed

@@ -16,8 +16,6 @@ import webrtcvad
 from pesq import pesq
 from pystoi import stoi
-# --- Helper Functions ---
 def load_audio(file_obj):
     y, sr = librosa.load(file_obj, sr=16000)
     return y, sr
@@ -29,59 +27,47 @@ def plot_waveform(y, sr, title):
     plt.figure(figsize=(10, 2))
     librosa.display.waveshow(y, sr=sr)
     plt.title(title)
-    plt.tight_layout()
     buf = io.BytesIO()
     plt.savefig(buf, format='png')
     plt.close()
     buf.seek(0)
     return buf
 def plot_spectrogram(y, sr, title):
-    plt.figure(figsize=(10, 4))
     D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
     librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
     plt.colorbar(format='%+2.0f dB')
     plt.title(title)
-    plt.tight_layout()
     buf = io.BytesIO()
     plt.savefig(buf, format='png')
     plt.close()
     buf.seek(0)
     return buf
-def compute_snr(original, enhanced):
-    noise = original - enhanced
-    snr = 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + 1e-10))
-    return snr
 def vad_plot(y, sr, title):
     vad = webrtcvad.Vad(2)
     if sr != 16000:
         y = librosa.resample(y, orig_sr=sr, target_sr=16000)
         sr = 16000
     frame_duration_ms = 30
     frame_size = int(sr * frame_duration_ms / 1000)
-    if len(y) % frame_size != 0:
-        pad_len = frame_size - (len(y) % frame_size)
-        y = np.pad(y, (0, pad_len))
     frames = np.split(y, len(y) // frame_size)
     voiced = []
     for frame in frames:
         pcm = (frame * 32767).astype(np.int16).tobytes()
         try:
             voiced.append(vad.is_speech(pcm, sr))
-        except Exception:
             voiced.append(False)
     plt.figure(figsize=(10, 1.5))
     plt.plot(voiced, drawstyle='steps-mid')
     plt.title(title)
-    plt.xlabel("Frame Index")
-    plt.ylabel("Speech")
-    plt.tight_layout()
     buf = io.BytesIO()
     plt.savefig(buf, format='png')
     plt.close()
     buf.seek(0)
@@ -93,141 +79,117 @@ def compute_pesq_mfcc_stoi(original_path, enhanced_path):
     enhanced, _ = librosa.load(enhanced_path, sr=sr)
     pesq_score = pesq(sr, original, enhanced, 'wb')
     stoi_score = stoi(original, enhanced, sr, extended=False)
-    mfcc_orig = librosa.feature.mfcc(y=original, sr=sr, n_mfcc=13)
-    mfcc_enh = librosa.feature.mfcc(y=enhanced, sr=sr, n_mfcc=13)
-    mfcc_diff = np.mean(np.abs(mfcc_orig - mfcc_enh))
     return pesq_score, stoi_score, mfcc_diff
-# --- Enhancement Functions ---
-def noise_reduction(y, sr):
-    return reduce_noise(y=y, sr=sr)
-def voice_isolation(y, sr):
-    return y
-def reverb_cleanup(y, sr):
-    return medfilt(y, kernel_size=5)
-def volume_normalize(y):
-    peak = np.max(np.abs(y))
-    if peak > 0:
-        y = y / peak
-    return y
-def language_aware_tuning(y, sr):
-    return librosa.effects.preemphasis(y)
-def amplify(y, factor=1.5):
-    y = y * factor
-    y = np.clip(y, -1.0, 1.0)
-    return y
-# --- Processing Function ---
-def process_files(files, noise_reduc, voice_iso, reverb_clean, vol_norm, lang_tune, amplify_audio, progress=gr.Progress()):
-    results = []
-    metrics = []
     temp_dir = tempfile.mkdtemp()
-    zip_path = os.path.join(temp_dir, "enhanced_results.zip")
     zipf = zipfile.ZipFile(zip_path, 'w')
     total = len(files)
     for i, file_obj in enumerate(files):
         progress((i + 1) / total, desc=f"Processing {file_obj.name}")
         y, sr = load_audio(file_obj)
         original_y = y.copy()
-        if noise_reduc: y = noise_reduction(y, sr)
-        if voice_iso: y = voice_isolation(y, sr)
-        if reverb_clean: y = reverb_cleanup(y, sr)
-        if vol_norm: y = volume_normalize(y)
-        if lang_tune: y = language_aware_tuning(y, sr)
-        if amplify_audio: y = amplify(y)
-        base_name = os.path.splitext(file_obj.name)[0]
-        original_path = os.path.join(temp_dir, f"{base_name}_original.wav")
-        enhanced_path = os.path.join(temp_dir, f"{base_name}_enhanced.wav")
-        save_audio(original_y, sr, original_path)
-        save_audio(y, sr, enhanced_path)
-        for func, suffix in [
-            (plot_waveform, "waveform"),
-            (plot_spectrogram, "spectrogram"),
-            (vad_plot, "vad")
-        ]:
-            for label, data in [("original", original_y), ("enhanced", y)]:
-                img = func(data, sr, f"{label.title()} {suffix.title()}")
-                img_path = os.path.join(temp_dir, f"{base_name}_{suffix}_{label}.png")
                 with open(img_path, "wb") as f:
-                    f.write(img.read())
                 zipf.write(img_path, arcname=os.path.basename(img_path))
-        try:
-            pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(original_path, enhanced_path)
-        except Exception:
-            pesq_score, stoi_score, mfcc_diff = None, None, None
         snr = compute_snr(original_y, y)
         metrics.append({
             "file": file_obj.name,
-            "SNR (dB)": snr,
             "PESQ": pesq_score,
             "STOI": stoi_score,
             "MFCC Diff": mfcc_diff
         })
-        zipf.write(original_path, arcname=os.path.basename(original_path))
-        zipf.write(enhanced_path, arcname=os.path.basename(enhanced_path))
-    metrics_df = pd.DataFrame(metrics)
-    csv_path = os.path.join(temp_dir, "metrics.csv")
-    metrics_df.to_csv(csv_path, index=False)
-    zipf.write(csv_path, arcname="metrics.csv")
     zipf.close()
-    return zip_path
-# --- Gradio UI ---
-def run_enhancement(files, nr, vi, reverb, vol, lang, amp):
     if not files:
-        return None, None, "Please upload at least one audio file.", gr.update(visible=False)
-    if not (nr or vi or reverb or vol or lang or amp):
-        return None, None, "Enable at least one enhancement option.", gr.update(visible=True, value="No enhancements selected!")
-    zip_path = process_files(files, nr, vi, reverb, vol, lang, amp)
-    wav_files = [f for f in os.listdir(os.path.dirname(zip_path)) if f.endswith("_enhanced.wav")]
-    first_output_wav = os.path.join(os.path.dirname(zip_path), wav_files[0]) if wav_files else None
-    return zip_path, first_output_wav, "Enhancement complete.", gr.update(visible=False)
 with gr.Blocks() as demo:
-    gr.Markdown("## AudioVoiceEnhancer.AI - Upload, Enhance, and Analyze Voice Files")
     with gr.Row():
-        audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple")
-    with gr.Row():
-        noise_checkbox = gr.Checkbox(value=True, label="Noise Reduction")
-        voice_iso_checkbox = gr.Checkbox(value=True, label="Voice Isolation")
-        reverb_checkbox = gr.Checkbox(value=True, label="Reverb Cleanup")
-        volume_checkbox = gr.Checkbox(value=True, label="Volume Normalize")
-        lang_checkbox = gr.Checkbox(value=True, label="Language-Aware Tuning")
-        amplify_checkbox = gr.Checkbox(value=False, label="Amplify (Boost Volume)")
-    enhance_btn = gr.Button("Enhance Audio")
-    warning_text = gr.Textbox(visible=False, label="Warning", interactive=False)
     output_zip = gr.File(label="Download ZIP")
-    playback = gr.Audio(label="Preview Enhanced Audio", type="filepath")
-    progress_label = gr.Label("Status")
-    enhance_btn.click(
         fn=run_enhancement,
-        inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox, amplify_checkbox],
-        outputs=[output_zip, playback, progress_label, warning_text],
         show_progress=True
     )
-if __name__ == "__main__":
-    demo.launch()

 from pesq import pesq
 from pystoi import stoi
 def load_audio(file_obj):
     y, sr = librosa.load(file_obj, sr=16000)
     return y, sr
     plt.figure(figsize=(10, 2))
     librosa.display.waveshow(y, sr=sr)
     plt.title(title)
     buf = io.BytesIO()
+    plt.tight_layout()
     plt.savefig(buf, format='png')
     plt.close()
     buf.seek(0)
     return buf
 def plot_spectrogram(y, sr, title):
+    plt.figure(figsize=(10, 3))
     D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
     librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
     plt.colorbar(format='%+2.0f dB')
     plt.title(title)
     buf = io.BytesIO()
+    plt.tight_layout()
     plt.savefig(buf, format='png')
     plt.close()
     buf.seek(0)
     return buf
 def vad_plot(y, sr, title):
     vad = webrtcvad.Vad(2)
     if sr != 16000:
         y = librosa.resample(y, orig_sr=sr, target_sr=16000)
         sr = 16000
     frame_duration_ms = 30
     frame_size = int(sr * frame_duration_ms / 1000)
+    y = np.pad(y, (0, frame_size - len(y) % frame_size)) if len(y) % frame_size != 0 else y
     frames = np.split(y, len(y) // frame_size)
     voiced = []
     for frame in frames:
         pcm = (frame * 32767).astype(np.int16).tobytes()
         try:
             voiced.append(vad.is_speech(pcm, sr))
+        except:
             voiced.append(False)
     plt.figure(figsize=(10, 1.5))
     plt.plot(voiced, drawstyle='steps-mid')
     plt.title(title)
     buf = io.BytesIO()
+    plt.tight_layout()
     plt.savefig(buf, format='png')
     plt.close()
     buf.seek(0)
     enhanced, _ = librosa.load(enhanced_path, sr=sr)
     pesq_score = pesq(sr, original, enhanced, 'wb')
     stoi_score = stoi(original, enhanced, sr, extended=False)
+    mfcc_diff = np.mean(np.abs(
+        librosa.feature.mfcc(original, sr, n_mfcc=13) -
+        librosa.feature.mfcc(enhanced, sr, n_mfcc=13)
+    ))
     return pesq_score, stoi_score, mfcc_diff
+def compute_snr(original, enhanced):
+    noise = original - enhanced
+    snr = 10 * np.log10(np.sum(original ** 2) / (np.sum(noise ** 2) + 1e-9))
+    return snr
+def noise_reduction(y, sr): return reduce_noise(y=y, sr=sr)
+def voice_isolation(y, sr): return y  # Placeholder
+def reverb_cleanup(y, sr): return medfilt(y, kernel_size=5)
+def volume_normalize(y): return y / np.max(np.abs(y)) if np.max(np.abs(y)) > 0 else y
+def language_aware_tuning(y, sr): return librosa.effects.preemphasis(y)
+def process_files(files, nr, vi, reverb, vol, lang, skip_metrics=False, progress=gr.Progress()):
+    results, metrics = [], []
     temp_dir = tempfile.mkdtemp()
+    zip_path = os.path.join(temp_dir, "enhanced_output.zip")
     zipf = zipfile.ZipFile(zip_path, 'w')
     total = len(files)
     for i, file_obj in enumerate(files):
         progress((i + 1) / total, desc=f"Processing {file_obj.name}")
         y, sr = load_audio(file_obj)
         original_y = y.copy()
+        if nr: y = noise_reduction(y, sr)
+        if vi: y = voice_isolation(y, sr)
+        if reverb: y = reverb_cleanup(y, sr)
+        if vol: y = volume_normalize(y)
+        if lang: y = language_aware_tuning(y, sr)
+        name = os.path.splitext(file_obj.name)[0]
+        orig_path = os.path.join(temp_dir, f"{name}_original.wav")
+        enh_path = os.path.join(temp_dir, f"{name}_enhanced.wav")
+        save_audio(original_y, sr, orig_path)
+        save_audio(y, sr, enh_path)
+        for plot_func, label in [(plot_waveform, "waveform"), (plot_spectrogram, "spectrogram"), (vad_plot, "vad")]:
+            for typ, signal in [("original", original_y), ("enhanced", y)]:
+                buf = plot_func(signal, sr, f"{typ.title()} {label.title()}")
+                img_path = os.path.join(temp_dir, f"{name}_{label}_{typ}.png")
                 with open(img_path, "wb") as f:
+                    f.write(buf.read())
                 zipf.write(img_path, arcname=os.path.basename(img_path))
+        if skip_metrics:
+            pesq_score = stoi_score = mfcc_diff = None
+        else:
+            try:
+                pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(orig_path, enh_path)
+            except:
+                pesq_score, stoi_score, mfcc_diff = None, None, None
         snr = compute_snr(original_y, y)
         metrics.append({
             "file": file_obj.name,
+            "SNR": snr,
             "PESQ": pesq_score,
             "STOI": stoi_score,
             "MFCC Diff": mfcc_diff
         })
+        zipf.write(orig_path, arcname=os.path.basename(orig_path))
+        zipf.write(enh_path, arcname=os.path.basename(enh_path))
+    df = pd.DataFrame(metrics)
+    metrics_path = os.path.join(temp_dir, "metrics.csv")
+    df.to_csv(metrics_path, index=False)
+    zipf.write(metrics_path, arcname="metrics.csv")
     zipf.close()
+    enhanced_files = [f for f in os.listdir(temp_dir) if f.endswith("_enhanced.wav")]
+    preview_path = os.path.join(temp_dir, enhanced_files[0]) if enhanced_files else None
+    return zip_path, preview_path
+def run_enhancement(files, nr, vi, reverb, vol, lang, skip_metrics):
     if not files:
+        return None, None, "Upload audio files.", gr.update(visible=False)
+    if not any([nr, vi, reverb, vol, lang]):
+        return None, None, "Select at least one enhancement.", gr.update(visible=True, value="No enhancements selected.")
+    zip_path, preview = process_files(files, nr, vi, reverb, vol, lang, skip_metrics)
+    return zip_path, preview, "Done!", gr.update(visible=False)
 with gr.Blocks() as demo:
+    gr.Markdown("## 🎧 AudioVoiceEnhancer.AI")
+    files = gr.File(label="Upload Audio", file_types=[".wav", ".mp3"], file_count="multiple")
     with gr.Row():
+        nr = gr.Checkbox(label="Noise Reduction", value=True)
+        vi = gr.Checkbox(label="Voice Isolation", value=True)
+        reverb = gr.Checkbox(label="Reverb Cleanup", value=True)
+        vol = gr.Checkbox(label="Volume Normalize", value=True)
+        lang = gr.Checkbox(label="Language-Aware Tuning", value=True)
+    skip_metrics = gr.Checkbox(label="🚀 Skip PESQ/STOI for Speed", value=True)
+    run_btn = gr.Button("Enhance Audio")
+    warning = gr.Textbox(visible=False, label="Warning")
     output_zip = gr.File(label="Download ZIP")
+    output_audio = gr.Audio(label="Preview Enhanced", type="filepath")
+    label = gr.Label("Status")
+    run_btn.click(
         fn=run_enhancement,
+        inputs=[files, nr, vi, reverb, vol, lang, skip_metrics],
+        outputs=[output_zip, output_audio, label, warning],
         show_progress=True
     )
+demo.queue()
+demo.launch()