import gradio as gr
from pydub import AudioSegment, effects
import numpy as np
import librosa
import soundfile as sf
import matplotlib.pyplot as plt
import io
from PIL import Image
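
# Build a human-readable, FL Studio-style vocal chain suggestion from the measured
# features and return it together with the make-up gain (in dB) to apply.
# Note: the spectral centroid is accepted but not currently used in the rules below.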
def generate_preset_text(rms, zcr, centroid, hp_cutoff, notch_freq):
    preset = "\U0001F39B️ AI Vocal Chain Preset (FL Studio Style)\n\n"
    if rms < 0.01:
        preset += "- Gain: +6dB\n"
        gain_db = 6
    else:
        preset += "- Gain: OK\n"
        gain_db = 0
    if zcr > 0.08:
        preset += "- De-esser: Target 5kHz–8kHz\n"
    else:
        preset += "- De-esser: Not needed\n"
    preset += f"- High-pass filter at {hp_cutoff}Hz (remove mud)\n"
    preset += f"- Notch EQ: -5dB around {int(notch_freq)}Hz (reduce harshness)\n"
    preset += "- High-shelf boost: +4dB above 10kHz (add air)\n"
    preset += "- Fruity Compressor: Ratio 3:1, Threshold -20dB, Fast Attack\n"
    preset += "- Limiter: Output ceiling -1dB (prevent clipping)\n"
    return preset, gain_db
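
# Find the frequency with the highest average power inside freq_range
# (default 2–5 kHz); used as the centre of the "harshness" notch cut.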
def find_harsh_frequency(y, sr, freq_range=(2000, 5000)):
    S = np.abs(librosa.stft(y)) ** 2
    freqs = librosa.fft_frequencies(sr=sr)
    power = np.mean(S, axis=1)
    mask = (freqs >= freq_range[0]) & (freqs <= freq_range[1])
    freqs_in_range = freqs[mask]
    power_in_range = power[mask]
    if len(freqs_in_range) == 0:
        return 3000
    peak_idx = np.argmax(power_in_range)
    return freqs_in_range[peak_idx]
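
# Approximate notch EQ with pydub: band-pass a copy of the signal around
# center_freq, attenuate it, and overlay it back onto the original segment.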
def apply_notch(audio, center_freq, width=200, reduction_db=-5):
    notch_band = audio.high_pass_filter(center_freq - width).low_pass_filter(center_freq + width)
    notch_cut = notch_band.apply_gain(reduction_db)
    return audio.overlay(notch_cut, gain_during_overlay=reduction_db)
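
# "Smart" EQ: derive the high-pass cutoff from the detected pitch floor (YIN),
# notch the harshest band, and overlay a +4 dB high-passed "air" layer above 10 kHz.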
def apply_smart_eq(audio, sr, y):
    try:
        pitch = librosa.yin(y, fmin=50, fmax=300, sr=sr)
        min_pitch = np.nanmin(pitch)
        hp_cutoff = max(70, int(min_pitch * 0.8)) if not np.isnan(min_pitch) else 80
    except Exception:
        hp_cutoff = 80
    notch_freq = find_harsh_frequency(y, sr)
    hp_filtered = audio.high_pass_filter(hp_cutoff)
    notched = apply_notch(hp_filtered, center_freq=notch_freq, width=200, reduction_db=-5)
    air = audio.high_pass_filter(10000).apply_gain(+4)
    eq_result = notched.overlay(air)
    return eq_result, hp_cutoff, notch_freq
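
# Simple limiter: normalise the segment, then pull its peak down to the ceiling if it exceeds it.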
def apply_limiter(audio, threshold_db=-1.0):
    normalized = effects.normalize(audio)
    peak_db = normalized.max_dBFS
    if peak_db > threshold_db:
        reduction = threshold_db - peak_db
        limited = normalized.apply_gain(reduction)
    else:
        limited = normalized
    return limited
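
# Plot the processed waveform on a dark background and return it as a PIL image.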
def generate_waveform_plot(file_path):
    y, sr = librosa.load(file_path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    times = np.linspace(0, duration, num=len(y))
    fig, ax = plt.subplots(figsize=(10, 2), facecolor="#1e1e1e")
    ax.plot(times, y, color="violet", linewidth=0.8)
    ax.set_xlim([0, duration])
    ax.set_ylim([-1, 1])
    ax.set_xlabel("Time (s)", color="white")
    ax.set_ylabel("Amplitude", color="white")
    ax.set_title("Processed Vocal Waveform", color="white")
    ax.tick_params(colors="white")
    fig.patch.set_facecolor("#1e1e1e")
    ax.set_facecolor("#1e1e1e")
    plt.tight_layout()
    buf = io.BytesIO()
    plt.savefig(buf, format="png", facecolor=fig.get_facecolor())
    buf.seek(0)
    plt.close(fig)
    return Image.open(buf)
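
# Main Gradio callback: convert the upload to WAV, measure RMS, zero-crossing rate
# and spectral centroid, run the EQ -> gain -> compressor -> limiter chain, and
# return the processed audio, preset text, preset file and waveform plot.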
def process_vocal(file):
    if file is None:
        raise ValueError("No file was uploaded.")
    audio = AudioSegment.from_file(file)
    audio.export("input.wav", format="wav")
    y, sr = librosa.load("input.wav", sr=None)
    rms = librosa.feature.rms(y=y)[0]
    zcr = librosa.feature.zero_crossing_rate(y=y)[0]
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    eq_audio, hp_cutoff, notch_freq = apply_smart_eq(audio, sr, y)
    preset_text, gain_db = generate_preset_text(np.mean(rms), np.mean(zcr), np.mean(centroid), hp_cutoff, notch_freq)
    with open("vocal_chain_preset.txt", "w") as f:
        f.write(preset_text)
    processed = eq_audio.apply_gain(gain_db)
    processed = effects.compress_dynamic_range(processed, threshold=-20.0, ratio=3.0, attack=5.0, release=50.0)
    processed = apply_limiter(processed, threshold_db=-1.0)
    processed.export("processed_output.wav", format="wav")
    waveform_plot = generate_waveform_plot("processed_output.wav")
    return "processed_output.wav", preset_text, "vocal_chain_preset.txt", waveform_plot
with gr.Blocks(css="""
body {
    background-color: #121212;
    color: white;
    font-family: 'Segoe UI', sans-serif;
}
.gr-button {
    background-color: #4f46e5;
    color: white;
    border-radius: 12px;
    border: none;
    padding: 8px 16px;
    font-weight: bold;
    transition: 0.3s;
}
.gr-button:hover {
    background-color: #6366f1;
}
.gr-textbox, .gr-audio, .gr-file, .gr-image {
    border-radius: 10px;
    background-color: #1e1e1e;
    color: white;
    border: 1px solid #6b21a8;
    width: 100%;
}
.gr-row {
    display: flex;
    justify-content: space-between;
    gap: 10px;
}
""") as interface:
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Your Vocal")
        submit_btn = gr.Button("Submit")
    with gr.Row():
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Processed Vocal")
        with gr.Column(scale=1):
            preset_box = gr.Textbox(label="Suggested Vocal Chain / Preset")
    with gr.Row():
        preset_file = gr.File(label="Download Preset (.txt)")
    with gr.Row():
        waveform_plot = gr.Image(label="Waveform Preview")
    submit_btn.click(
        fn=process_vocal,
        inputs=[audio_input],
        outputs=[audio_output, preset_box, preset_file, waveform_plot]
    )

if __name__ == "__main__":
    interface.launch()