Spaces:

mfrng
/

AudioVoiceEnhancerAI

Build error

App Files Files Community

mfrng committed on Jun 16, 2025

Commit

2c7a0a6

verified ·

1 Parent(s): fb7b885

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -198

app.py CHANGED Viewed

@@ -16,11 +16,7 @@ import webrtcvad
 from pesq import pesq
 from pystoi import stoi
-# Models placeholder imports
-# from demucs import DemucsModel  # For voice isolation
-# from voicefixer import VoiceFixer  # For audio restoration
-# -- Helper functions --
 def load_audio(file_obj):
     y, sr = librosa.load(file_obj, sr=16000)
@@ -55,116 +51,63 @@ def plot_spectrogram(y, sr, title):
 def compute_snr(original, enhanced):
     noise = original - enhanced
-    snr = 10 * np.log10(np.sum(original ** 2) / np.sum(noise ** 2) + 1e-10)
     return snr
 def vad_plot(y, sr, title):
-    # Parameters
     frame_duration_ms = 30
-    hop_length = int(sr * frame_duration_ms / 1000)
-    n_fft = 2048
-    # Compute STFT
-    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length)) ** 2
-    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
-    # Focus on 80–3000 Hz (speech band)
-    speech_band = np.where((freqs >= 80) & (freqs <= 3000))[0]
-    speech_energy = S[speech_band, :].mean(axis=0)
-    # Normalize energy
-    speech_energy /= np.max(speech_energy) + 1e-6
-    # Threshold for voice activity (tune as needed)
-    voice_mask = speech_energy > 0.1
-    # Time axis for plotting
-    times = librosa.frames_to_time(np.arange(len(voice_mask)), sr=sr, hop_length=hop_length)
-    # Merge voiced intervals
-    intervals = []
-    start = None
-    for i, voiced in enumerate(voice_mask):
-        if voiced and start is None:
-            start = times[i]
-        elif not voiced and start is not None:
-            intervals.append((start, times[i]))
-            start = None
-    if start is not None:
-        intervals.append((start, times[-1]))
-    # Plot waveform + shaded voice regions
-    plt.figure(figsize=(10, 2))
-    librosa.display.waveshow(y, sr=sr, alpha=0.6)
-    for (start_t, end_t) in intervals:
-        plt.axvspan(start_t, end_t, color='green', alpha=0.3)
-    plt.title(title + " (Voice Regions: 80–3000Hz energy)")
-    plt.tight_layout()
     buf = io.BytesIO()
     plt.savefig(buf, format='png')
     plt.close()
     buf.seek(0)
     return buf
-def amplify_voice_fft(y, sr, gain_db=10):
-    # Short-Time Fourier Transform
-    hop_length = 512
-    D = librosa.stft(y, n_fft=2048, hop_length=hop_length)
-    mag, phase = np.abs(D), np.angle(D)
-    freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
-    voice_band = np.where((freqs >= 80) & (freqs <= 3000))[0]
-    # Convert gain from dB to amplitude
-    gain_amp = 10 ** (gain_db / 20.0)
-    # Amplify only the voice frequency band
-    mag[voice_band, :] *= gain_amp
-    # Reconstruct
-    D_new = mag * np.exp(1j * phase)
-    y_out = librosa.istft(D_new, hop_length=hop_length)
-    return y_out
-def amplify_voice(y, target_db=-20):
-    rms = np.sqrt(np.mean(y**2))
-    if rms > 0:
-        current_db = 20 * np.log10(rms)
-        gain = 10 ** ((target_db - current_db) / 20)
-        y = y * gain
-    return y
 def compute_pesq_mfcc_stoi(original_path, enhanced_path):
     sr = 16000
     original, _ = librosa.load(original_path, sr=sr)
     enhanced, _ = librosa.load(enhanced_path, sr=sr)
     pesq_score = pesq(sr, original, enhanced, 'wb')
     stoi_score = stoi(original, enhanced, sr, extended=False)
     mfcc_orig = librosa.feature.mfcc(y=original, sr=sr, n_mfcc=13)
     mfcc_enh = librosa.feature.mfcc(y=enhanced, sr=sr, n_mfcc=13)
-    # Compute MFCC distance (mean absolute difference)
     mfcc_diff = np.mean(np.abs(mfcc_orig - mfcc_enh))
     return pesq_score, stoi_score, mfcc_diff
-# Enhancement functions
 def noise_reduction(y, sr):
     return reduce_noise(y=y, sr=sr)
 def voice_isolation(y, sr):
-    # Placeholder: Implement with Demucs or similar
-    # For demo, return input
     return y
 def reverb_cleanup(y, sr):
-    # Simple dereverberation placeholder: median filtering
-    y_dereverb = medfilt(y, kernel_size=5)
-    return y_dereverb
 def volume_normalize(y):
     peak = np.max(np.abs(y))
@@ -173,22 +116,16 @@ def volume_normalize(y):
     return y
 def language_aware_tuning(y, sr):
-    # Placeholder for EQ adjustments by language
-    # For demo, apply slight high-pass filter
-    y_hp = librosa.effects.preemphasis(y)
-    return y_hp
-# Main processing function
-def process_files(
-    files,
-    noise_reduc,
-    voice_iso,
-    reverb_clean,
-    vol_norm,
-    lang_tune,
-    progress=gr.Progress()
-):
     results = []
     metrics = []
     temp_dir = tempfile.mkdtemp()
@@ -202,73 +139,32 @@ def process_files(
         y, sr = load_audio(file_obj)
         original_y = y.copy()
-        # Enhancement pipeline
-        if noise_reduc:
-            y = noise_reduction(y, sr)
-        if voice_iso:
-            y = voice_isolation(y, sr)
-        if reverb_clean:
-            y = reverb_cleanup(y, sr)
-        if vol_norm:
-            y = amplify_voice_fft(y, sr, gain_db=8)
-            y = volume_normalize(y)
-        if lang_tune:
-            y = language_aware_tuning(y, sr)
-        # Amplify voice as final step
-        y = amplify_voice(y)
-        # Extract extension and construct filenames
-        base_name, ext = os.path.splitext(file_obj.name)
-        ext = ext.lower()
-        ext_format = ext[1:].upper() if ext.startswith('.') else ext.upper()
-        enhanced_filename = f"{base_name}_enhanced{ext}"
-        enhanced_path = os.path.join(temp_dir, enhanced_filename)
-        try:
-            sf.write(enhanced_path, y, sr, format=ext_format)
-        except Exception:
-            # fallback to WAV
-            enhanced_filename = f"{base_name}_enhanced.wav"
-            enhanced_path = os.path.join(temp_dir, enhanced_filename)
-            sf.write(enhanced_path, y, sr)
-        original_filename = f"{base_name}_original{ext}"
-        original_path = os.path.join(temp_dir, original_filename)
-        try:
-            sf.write(original_path, original_y, sr, format=ext_format)
-        except Exception:
-            original_filename = f"{base_name}_original.wav"
-            original_path = os.path.join(temp_dir, original_filename)
-            sf.write(original_path, original_y, sr)
-        # Generate plots
-        waveform_orig = plot_waveform(original_y, sr, "Original Waveform")
-        waveform_enh = plot_waveform(y, sr, "Enhanced Waveform")
-        spectrogram_orig = plot_spectrogram(original_y, sr, "Original Spectrogram")
-        spectrogram_enh = plot_spectrogram(y, sr, "Enhanced Spectrogram")
-        vad_orig = vad_plot(original_y, sr, "Original VAD")
-        vad_enh = vad_plot(y, sr, "Enhanced VAD")
-        # Save plots and add to zip
-        for img_buf, name in [
-            (waveform_orig, "waveform_original.png"),
-            (waveform_enh, "waveform_enhanced.png"),
-            (spectrogram_orig, "spectrogram_original.png"),
-            (spectrogram_enh, "spectrogram_enhanced.png"),
-            (vad_orig, "vad_original.png"),
-            (vad_enh, "vad_enhanced.png"),
         ]:
-            plot_path = os.path.join(temp_dir, f"{base_name}_{name}")
-            with open(plot_path, "wb") as f:
-                f.write(img_buf.read())
-            zipf.write(plot_path, arcname=os.path.basename(plot_path))
-        # Compute metrics
         try:
             pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(original_path, enhanced_path)
         except Exception:
@@ -293,51 +189,45 @@ def process_files(
     zipf.write(csv_path, arcname="metrics.csv")
     zipf.close()
-    first_enhanced = os.path.join(temp_dir, os.path.splitext(files[0].name)[0] + "_enhanced.wav")
-    return zip_path, first_enhanced
-# Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("# AudioVoiceEnhancer.AI - Audio Enhancement for Transcription & Translation")
     with gr.Row():
-        audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple", interactive=True)
     with gr.Row():
-        noise_checkbox = gr.Checkbox(label="Noise Reduction", info="Reduce background noise", value=True)
-        voice_iso_checkbox = gr.Checkbox(label="Voice Isolation", info="Isolate voice from background", value=True)
-        reverb_checkbox = gr.Checkbox(label="Reverberation Cleanup", info="Reduce echo/reverb effects", value=True)
-        volume_checkbox = gr.Checkbox(label="Volume Normalization", info="Normalize audio volume", value=True)
-        lang_checkbox = gr.Checkbox(label="Language-aware Tuning", info="Tune audio clarity based on language", value=True)
     enhance_btn = gr.Button("Enhance Audio")
-    output_zip = gr.File(label="Download ZIP of Enhanced Audio and Reports")
-    #enhanced_audio_preview = gr.Audio(label="Preview First Enhanced Audio", interactive=False)
-    progress_bar = gr.Label(value="Upload files and select enhancement options.")
-    def run_enhancement(files, nr, vi, reverb, vol, lang):
-        if not files or len(files) == 0:
-            return None, "❌ Please upload at least one audio file."
-        if not (nr or vi or reverb or vol or lang):
-            return None, "⚠️ Please enable at least one enhancement option."
-        try:
-            zip_path, first_enhanced_audio = process_files(files, nr, vi, reverb, vol, lang)
-            return zip_path, first_enhanced_audio, "Processing complete. Download your ZIP file below."
-        except Exception as e:
-            import traceback
-            traceback.print_exc()
-            return None, None, f"Error during enhancement: {str(e)}"
     enhance_btn.click(
         fn=run_enhancement,
-        inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox],
-        outputs=[output_zip, progress_bar],
-        show_progress=True,
     )
-demo.launch()

 from pesq import pesq
 from pystoi import stoi
+# --- Helper Functions ---
 def load_audio(file_obj):
     y, sr = librosa.load(file_obj, sr=16000)
 def compute_snr(original, enhanced):
     """Return the signal-to-noise ratio, in dB, of ``enhanced`` against ``original``.

     The element-wise difference ``original - enhanced`` is treated as noise.

     Args:
         original: Reference samples (array-like).
         enhanced: Processed samples, broadcastable against ``original``.

     Returns:
         ``10 * log10(signal_power / noise_power)``, with a small epsilon added
         to both powers so the result is always finite.
     """
     eps = 1e-10
     # Work in float64 so integer PCM input cannot overflow when squared.
     original = np.asarray(original, dtype=np.float64)
     enhanced = np.asarray(enhanced, dtype=np.float64)
     noise = original - enhanced
     signal_power = np.sum(original ** 2)
     noise_power = np.sum(noise ** 2)
     # Guarding the numerator as well keeps an all-zero (silent) reference from
     # producing log10(0) = -inf and a runtime warning.
     snr = 10 * np.log10((signal_power + eps) / (noise_power + eps))
     return snr
 def vad_plot(y, sr, title):
     """Plot per-frame WebRTC voice-activity decisions and return the PNG.

     The signal is resampled to 16 kHz when ``sr`` differs, zero-padded to a
     whole number of 30 ms frames, converted to 16-bit PCM and classified
     frame by frame with ``webrtcvad.Vad(2)``.

     Args:
         y: Audio samples. They are scaled by 32767 into int16, so they are
             presumably floats in [-1.0, 1.0] -- TODO confirm with callers.
         sr: Sample rate of ``y`` in Hz.
         title: Title drawn on the figure.

     Returns:
         An ``io.BytesIO`` rewound to offset 0 that holds the PNG image.
     """
     vad = webrtcvad.Vad(2)
     if sr != 16000:
         y = librosa.resample(y, orig_sr=sr, target_sr=16000)
         sr = 16000
     frame_duration_ms = 30
     frame_size = int(sr * frame_duration_ms / 1000)
     remainder = len(y) % frame_size
     if remainder:
         y = np.pad(y, (0, frame_size - remainder))
     # Clip before scaling: a sample outside [-1, 1] would overflow the int16
     # cast and hand corrupted PCM to the detector.
     pcm = (np.clip(y, -1.0, 1.0) * 32767).astype(np.int16)
     voiced = []
     # reshape yields zero frames for empty input, where np.split(y, 0) raises.
     for frame in pcm.reshape(-1, frame_size):
         try:
             voiced.append(vad.is_speech(frame.tobytes(), sr))
         except Exception:
             # Best effort: a frame the detector rejects is shown as non-speech
             # so that one bad frame does not abort the whole plot.
             voiced.append(False)
     plt.figure(figsize=(10, 1.5))
     plt.plot(voiced, drawstyle='steps-mid')
     plt.title(title)
     plt.xlabel("Frame Index")
     plt.ylabel("Speech")
     plt.tight_layout()
     buf = io.BytesIO()
     plt.savefig(buf, format='png')
     plt.close()
     buf.seek(0)
     return buf
 def compute_pesq_mfcc_stoi(original_path, enhanced_path):
     """Compute PESQ, STOI and an MFCC distance between two audio files.

     Both files are loaded with librosa at 16 kHz.

     Args:
         original_path: Path of the reference recording.
         enhanced_path: Path of the processed recording.

     Returns:
         A tuple ``(pesq_score, stoi_score, mfcc_diff)``: PESQ in ``'wb'``
         mode, STOI with ``extended=False``, and the mean absolute difference
         between the two 13-coefficient MFCC matrices.
     """
     sample_rate = 16000
     reference = librosa.load(original_path, sr=sample_rate)[0]
     processed = librosa.load(enhanced_path, sr=sample_rate)[0]

     pesq_score = pesq(sample_rate, reference, processed, 'wb')
     stoi_score = stoi(reference, processed, sample_rate, extended=False)

     mfcc_reference = librosa.feature.mfcc(y=reference, sr=sample_rate, n_mfcc=13)
     mfcc_processed = librosa.feature.mfcc(y=processed, sr=sample_rate, n_mfcc=13)
     mfcc_diff = np.mean(np.abs(mfcc_reference - mfcc_processed))

     return pesq_score, stoi_score, mfcc_diff
+# --- Enhancement Functions ---
 def noise_reduction(y, sr):
     """Denoise ``y`` by delegating to ``reduce_noise`` with sample rate ``sr``."""
     denoised = reduce_noise(y=y, sr=sr)
     return denoised
 def voice_isolation(y, sr):
     """Return ``y`` untouched.

     No isolation is performed yet: this is a pass-through stage and ``sr``
     is accepted only to match the other enhancement functions.
     """
     return y
 def reverb_cleanup(y, sr):
     """Smooth ``y`` with a 5-sample median filter; ``sr`` is not used."""
     filtered = medfilt(y, kernel_size=5)
     return filtered
 def volume_normalize(y):
     peak = np.max(np.abs(y))
     return y
 def language_aware_tuning(y, sr):
     """Apply librosa's pre-emphasis filter to ``y``; ``sr`` is not used.

     No language-specific processing happens here: the same filter is applied
     to every input.
     """
     emphasized = librosa.effects.preemphasis(y)
     return emphasized
def amplify(y, factor=1.5):
    """Scale ``y`` by ``factor`` and hard-limit the result to [-1.0, 1.0].

    Args:
        y: Audio samples supporting multiplication by a scalar.
        factor: Linear gain applied before limiting.

    Returns:
        The boosted samples, clipped so no value leaves [-1.0, 1.0].
    """
    boosted = y * factor
    return np.clip(boosted, -1.0, 1.0)
+# --- Processing Function ---
+def process_files(files, noise_reduc, voice_iso, reverb_clean, vol_norm, lang_tune, amplify_audio, progress=gr.Progress()):
     results = []
     metrics = []
     temp_dir = tempfile.mkdtemp()
         y, sr = load_audio(file_obj)
         original_y = y.copy()
+        if noise_reduc: y = noise_reduction(y, sr)
+        if voice_iso: y = voice_isolation(y, sr)
+        if reverb_clean: y = reverb_cleanup(y, sr)
+        if vol_norm: y = volume_normalize(y)
+        if lang_tune: y = language_aware_tuning(y, sr)
+        if amplify_audio: y = amplify(y)
+        base_name = os.path.splitext(file_obj.name)[0]
+        original_path = os.path.join(temp_dir, f"{base_name}_original.wav")
+        enhanced_path = os.path.join(temp_dir, f"{base_name}_enhanced.wav")
+        save_audio(original_y, sr, original_path)
+        save_audio(y, sr, enhanced_path)
+        for func, suffix in [
+            (plot_waveform, "waveform"),
+            (plot_spectrogram, "spectrogram"),
+            (vad_plot, "vad")
         ]:
+            for label, data in [("original", original_y), ("enhanced", y)]:
+                img = func(data, sr, f"{label.title()} {suffix.title()}")
+                img_path = os.path.join(temp_dir, f"{base_name}_{suffix}_{label}.png")
+                with open(img_path, "wb") as f:
+                    f.write(img.read())
+                zipf.write(img_path, arcname=os.path.basename(img_path))
         try:
             pesq_score, stoi_score, mfcc_diff = compute_pesq_mfcc_stoi(original_path, enhanced_path)
         except Exception:
     zipf.write(csv_path, arcname="metrics.csv")
     zipf.close()
+    return zip_path
+# --- Gradio UI ---
def run_enhancement(files, nr, vi, reverb, vol, lang, amp):
    """Gradio click handler: validate the request, run the pipeline, pick a preview.

    Args:
        files: Uploaded file objects; falsy when nothing was uploaded.
        nr, vi, reverb, vol, lang, amp: Flags for noise reduction, voice
            isolation, reverb cleanup, volume normalisation, language-aware
            tuning and amplification, forwarded to ``process_files``.

    Returns:
        A 4-tuple ``(zip_path, preview_wav, status_message, warning_update)``.
        ``zip_path`` and ``preview_wav`` are ``None`` when nothing was
        processed or when processing failed.
    """
    if not files:
        return None, None, "Please upload at least one audio file.", gr.update(visible=False)
    if not (nr or vi or reverb or vol or lang or amp):
        return None, None, "Enable at least one enhancement option.", gr.update(visible=True, value="No enhancements selected!")

    try:
        zip_path = process_files(files, nr, vi, reverb, vol, lang, amp)
    except Exception as exc:
        # UI boundary: log the full traceback and report the failure in the
        # status widgets instead of letting the handler crash.
        import traceback
        traceback.print_exc()
        return None, None, f"Error during enhancement: {exc}", gr.update(visible=True, value=str(exc))

    out_dir = os.path.dirname(zip_path)
    # os.listdir order is arbitrary; sort so the fallback choice is stable.
    wav_files = sorted(f for f in os.listdir(out_dir) if f.endswith("_enhanced.wav"))

    # Prefer the output that belongs to the first uploaded file.
    first_name = getattr(files[0], "name", str(files[0]))
    preferred = os.path.basename(os.path.splitext(first_name)[0]) + "_enhanced.wav"
    if preferred in wav_files:
        chosen = preferred
    elif wav_files:
        chosen = wav_files[0]
    else:
        chosen = None

    first_output_wav = os.path.join(out_dir, chosen) if chosen else None
    return zip_path, first_output_wav, "Enhancement complete.", gr.update(visible=False)
 # Gradio UI: upload widget, one checkbox per enhancement stage, and the
 # output widgets that run_enhancement fills in.
 with gr.Blocks() as demo:
+    gr.Markdown("## AudioVoiceEnhancer.AI - Upload, Enhance, and Analyze Voice Files")
     with gr.Row():
+        audio_files = gr.File(label="Upload Audio Files", file_types=['.wav', '.mp3', '.flac'], file_count="multiple")
     # Every stage is on by default except amplification.
     with gr.Row():
+        noise_checkbox = gr.Checkbox(value=True, label="Noise Reduction")
+        voice_iso_checkbox = gr.Checkbox(value=True, label="Voice Isolation")
+        reverb_checkbox = gr.Checkbox(value=True, label="Reverb Cleanup")
+        volume_checkbox = gr.Checkbox(value=True, label="Volume Normalize")
+        lang_checkbox = gr.Checkbox(value=True, label="Language-Aware Tuning")
+        amplify_checkbox = gr.Checkbox(value=False, label="Amplify (Boost Volume)")
     enhance_btn = gr.Button("Enhance Audio")
     # Starts hidden; run_enhancement toggles it through the gr.update it returns.
+    warning_text = gr.Textbox(visible=False, label="Warning", interactive=False)
+    output_zip = gr.File(label="Download ZIP")
+    playback = gr.Audio(label="Preview Enhanced Audio", type="filepath")
+    progress_label = gr.Label("Status")
     # inputs follow run_enhancement's parameter order; outputs follow the
     # order of the 4-tuple it returns.
     enhance_btn.click(
         fn=run_enhancement,
+        inputs=[audio_files, noise_checkbox, voice_iso_checkbox, reverb_checkbox, volume_checkbox, lang_checkbox, amplify_checkbox],
+        outputs=[output_zip, playback, progress_label, warning_text],
+        show_progress=True
     )
 # Launch only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()