import io

import gradio as gr
import librosa
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from pydub import AudioSegment, effects


def generate_preset_text(rms, zcr, centroid, hp_cutoff, notch_freq):
    """Build the FL Studio-style preset text from the analysis results.

    `centroid` is accepted for future use but not currently consulted.
    """
    preset = "\U0001F39B️ AI Vocal Chain Preset (FL Studio Style)\n\n"

    # Low average RMS suggests a quiet take that needs make-up gain.
    if rms < 0.01:
        preset += "- Gain: +6dB\n"
        gain_db = 6
    else:
        preset += "- Gain: OK\n"
        gain_db = 0

    # A high zero-crossing rate hints at sibilance / harsh consonants.
    if zcr > 0.08:
        preset += "- De-esser: Target 5kHz–8kHz\n"
    else:
        preset += "- De-esser: Not needed\n"

    preset += f"- High-pass filter at {hp_cutoff}Hz (remove mud)\n"
    preset += f"- Notch EQ: -5dB around {int(notch_freq)}Hz (reduce harshness)\n"
    preset += "- High-shelf boost: +4dB above 10kHz (add air)\n"
    preset += "- Fruity Compressor: Ratio 3:1, Threshold -20dB, Fast Attack\n"
    preset += "- Limiter: Output ceiling -1dB (prevent clipping)\n"
    return preset, gain_db


def find_harsh_frequency(y, sr, freq_range=(2000, 5000)):
    """Return the frequency with the most average energy inside freq_range."""
    S = np.abs(librosa.stft(y)) ** 2
    freqs = librosa.fft_frequencies(sr=sr)
    power = np.mean(S, axis=1)  # mean power per frequency bin across all frames
    mask = (freqs >= freq_range[0]) & (freqs <= freq_range[1])
    freqs_in_range = freqs[mask]
    power_in_range = power[mask]
    if len(freqs_in_range) == 0:
        return 3000  # sensible default if no bins fall inside the range
    peak_idx = np.argmax(power_in_range)
    return freqs_in_range[peak_idx]


def apply_notch(audio, center_freq, width=200, reduction_db=-5):
    """Approximate a notch with pydub only: split the signal into the bands
    below, inside, and above the notch, attenuate the middle band, and mix
    the three back together. pydub's filters are first-order, so the cut is
    broad rather than surgical."""
    low = audio.low_pass_filter(center_freq - width)
    high = audio.high_pass_filter(center_freq + width)
    band = (
        audio.high_pass_filter(center_freq - width)
        .low_pass_filter(center_freq + width)
        .apply_gain(reduction_db)
    )
    return low.overlay(high).overlay(band)


def apply_smart_eq(audio, sr, y):
    # Track the vocal's lowest pitch so the high-pass cutoff sits just
    # below it; fall back to 80 Hz if pitch tracking fails.
    try:
        pitch = librosa.yin(y, fmin=50, fmax=300, sr=sr)
        min_pitch = np.nanmin(pitch)
        hp_cutoff = 80 if np.isnan(min_pitch) else max(70, int(min_pitch * 0.8))
    except Exception:
        hp_cutoff = 80

    notch_freq = find_harsh_frequency(y, sr)
    hp_filtered = audio.high_pass_filter(hp_cutoff)
    notched = apply_notch(hp_filtered, center_freq=notch_freq, width=200, reduction_db=-5)

    # Crude "air" shelf: mix a boosted copy of the content above 10 kHz back in.
    air = audio.high_pass_filter(10000).apply_gain(+4)
    eq_result = notched.overlay(air)
    return eq_result, hp_cutoff, notch_freq


def apply_limiter(audio, threshold_db=-1.0):
    """Peak-normalize, then pull the peak down to the ceiling. This is a
    static gain adjustment, not a true look-ahead limiter."""
    normalized = effects.normalize(audio)
    peak_db = normalized.max_dBFS
    if peak_db > threshold_db:
        limited = normalized.apply_gain(threshold_db - peak_db)
    else:
        limited = normalized
    return limited


def generate_waveform_plot(file_path):
    y, sr = librosa.load(file_path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    times = np.linspace(0, duration, num=len(y))

    fig, ax = plt.subplots(figsize=(10, 2), facecolor="#1e1e1e")
    ax.plot(times, y, color="violet", linewidth=0.8)
    ax.set_xlim([0, duration])
    ax.set_ylim([-1, 1])
    ax.set_xlabel("Time (s)", color="white")
    ax.set_ylabel("Amplitude", color="white")
    ax.set_title("Processed Vocal Waveform", color="white")
    ax.tick_params(colors="white")
    fig.patch.set_facecolor("#1e1e1e")
    ax.set_facecolor("#1e1e1e")
    plt.tight_layout()

    buf = io.BytesIO()
    plt.savefig(buf, format="png", facecolor=fig.get_facecolor())
    buf.seek(0)
    plt.close(fig)
    return Image.open(buf)


def process_vocal(file):
    if file is None:
        raise ValueError("No file was uploaded.")

    # Normalize the upload to WAV so librosa can analyze it reliably.
    audio = AudioSegment.from_file(file)
    audio.export("input.wav", format="wav")
    y, sr = librosa.load("input.wav", sr=None)

    rms = librosa.feature.rms(y=y)[0]
    zcr = librosa.feature.zero_crossing_rate(y=y)[0]
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]

    eq_audio, hp_cutoff, notch_freq = apply_smart_eq(audio, sr, y)
    preset_text, gain_db = generate_preset_text(
        np.mean(rms), np.mean(zcr), np.mean(centroid), hp_cutoff, notch_freq
    )
open("vocal_chain_preset.txt", "w") as f: f.write(preset_text) processed = eq_audio.apply_gain(gain_db) processed = effects.compress_dynamic_range(processed, threshold=-20.0, ratio=3.0, attack=5.0, release=50.0) processed = apply_limiter(processed, threshold_db=-1.0) processed.export("processed_output.wav", format="wav") waveform_plot = generate_waveform_plot("processed_output.wav") return "processed_output.wav", preset_text, "vocal_chain_preset.txt", waveform_plot with gr.Blocks(css=""" body { background-color: #121212; color: white; font-family: 'Segoe UI', sans-serif; } .gr-button { background-color: #4f46e5; color: white; border-radius: 12px; border: none; padding: 8px 16px; font-weight: bold; transition: 0.3s; } .gr-button:hover { background-color: #6366f1; } .gr-textbox, .gr-audio, .gr-file, .gr-image { border-radius: 10px; background-color: #1e1e1e; color: white; border: 1px solid #6b21a8; width: 100%; } .gr-row { display: flex; justify-content: space-between; gap: 10px; } """) as interface: with gr.Row(): audio_input = gr.Audio(type="filepath", label="Upload Your Vocal") submit_btn = gr.Button("Submit") with gr.Row(): with gr.Column(scale=1): audio_output = gr.Audio(label="Processed Vocal") with gr.Column(scale=1): preset_box = gr.Textbox(label="Suggested Vocal Chain / Preset") with gr.Row(): preset_file = gr.File(label="Download Preset (.txt)") with gr.Row(): waveform_plot = gr.Image(label="Waveform Preview") submit_btn.click( fn=process_vocal, inputs=[audio_input], outputs=[audio_output, preset_box, preset_file, waveform_plot] ) if __name__ == "__main__": interface.launch()