# AI_Vocal_Chain / app.py
import gradio as gr
from pydub import AudioSegment, effects
import numpy as np
import librosa
import matplotlib
matplotlib.use("Agg")  # render plots off-screen; the Space has no display server
import matplotlib.pyplot as plt
import io
from PIL import Image
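
# Analysis happens in librosa (RMS, zero-crossing rate, spectral centroid,
# pitch); processing happens in pydub (EQ, compression, limiting). The
# spectral centroid is measured but not yet used by any preset rule.
# Note: pydub needs ffmpeg on the PATH to decode non-WAV uploads.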
def generate_preset_text(rms, zcr, centroid, hp_cutoff, notch_freq):
    preset = "\U0001F39B️ AI Vocal Chain Preset (FL Studio Style)\n\n"
    if rms < 0.01:
        preset += "- Gain: +6dB\n"
        gain_db = 6
    else:
        preset += "- Gain: OK\n"
        gain_db = 0
    if zcr > 0.08:
        preset += "- De-esser: Target 5kHz–8kHz\n"
    else:
        preset += "- De-esser: Not needed\n"
    preset += f"- High-pass filter at {hp_cutoff}Hz (remove mud)\n"
    preset += f"- Notch EQ: -5dB around {int(notch_freq)}Hz (reduce harshness)\n"
    preset += "- High-shelf boost: +4dB above 10kHz (add air)\n"
    preset += "- Fruity Compressor: Ratio 3:1, Threshold -20dB, Fast Attack\n"
    preset += "- Limiter: Output ceiling -1dB (prevent clipping)\n"
    return preset, gain_db
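
# Estimate the "harshest" frequency: average the STFT power spectrum over
# time and return the strongest bin inside freq_range (2-5 kHz by default).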
def find_harsh_frequency(y, sr, freq_range=(2000, 5000)):
    S = np.abs(librosa.stft(y))**2
    freqs = librosa.fft_frequencies(sr=sr)
    power = np.mean(S, axis=1)
    mask = (freqs >= freq_range[0]) & (freqs <= freq_range[1])
    freqs_in_range = freqs[mask]
    power_in_range = power[mask]
    if len(freqs_in_range) == 0:
        return 3000
    peak_idx = np.argmax(power_in_range)
    return freqs_in_range[peak_idx]
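
# pydub exposes no true notch filter, so apply_notch approximates one:
# isolate the band with pydub's first-order high/low-pass filters,
# phase-invert it, and mix it back at a level chosen to leave roughly
# reduction_db of residual gain in the band. A rough sketch of a notch,
# not a surgical EQ cut.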
def apply_notch(audio, center_freq, width=200, reduction_db=-5):
    notch_band = audio.high_pass_filter(center_freq - width).low_pass_filter(center_freq + width)
    # Mixing the inverted band at linear gain a leaves in-band amplitude
    # (1 - a), so pick a = 1 - 10**(reduction_db / 20).
    target_linear = 10 ** (reduction_db / 20.0)
    mix_gain_db = 20 * np.log10(max(1.0 - target_linear, 1e-6))
    notch_cut = notch_band.invert_phase().apply_gain(mix_gain_db)
    return audio.overlay(notch_cut)
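
# "Smart EQ": place the high-pass cutoff just below the lowest pitch
# librosa.yin detects, notch the harshest 2-5 kHz peak, and mix in a
# +4 dB copy of the >10 kHz content as a crude high-shelf "air" boost.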
def apply_smart_eq(audio, sr, y):
    try:
        pitch = librosa.yin(y, fmin=50, fmax=300, sr=sr)
        min_pitch = np.nanmin(pitch)
        hp_cutoff = max(70, int(min_pitch * 0.8)) if not np.isnan(min_pitch) else 80
    except Exception:
        hp_cutoff = 80  # safe default if pitch tracking fails
    notch_freq = find_harsh_frequency(y, sr)
    hp_filtered = audio.high_pass_filter(hp_cutoff)
    notched = apply_notch(hp_filtered, center_freq=notch_freq, width=200, reduction_db=-5)
    air = audio.high_pass_filter(10000).apply_gain(+4)
    eq_result = notched.overlay(air)
    return eq_result, hp_cutoff, notch_freq
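
# Static ceiling rather than a true lookahead limiter: normalize to full
# scale, then apply one fixed gain reduction so the peak sits at
# threshold_db.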
def apply_limiter(audio, threshold_db=-1.0):
    normalized = effects.normalize(audio)
    peak_db = normalized.max_dBFS
    if peak_db > threshold_db:
        reduction = threshold_db - peak_db
        limited = normalized.apply_gain(reduction)
    else:
        limited = normalized
    return limited
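
# Plot the processed waveform on the app's dark background and return it
# as a PIL image for the gr.Image component.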
def generate_waveform_plot(file_path):
    y, sr = librosa.load(file_path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    times = np.linspace(0, duration, num=len(y))
    fig, ax = plt.subplots(figsize=(10, 2), facecolor="#1e1e1e")
    ax.plot(times, y, color="violet", linewidth=0.8)
    ax.set_xlim([0, duration])
    ax.set_ylim([-1, 1])
    ax.set_xlabel("Time (s)", color="white")
    ax.set_ylabel("Amplitude", color="white")
    ax.set_title("Processed Vocal Waveform", color="white")
    ax.tick_params(colors="white")
    fig.patch.set_facecolor("#1e1e1e")
    ax.set_facecolor("#1e1e1e")
    fig.tight_layout()
    buf = io.BytesIO()
    fig.savefig(buf, format="png", facecolor=fig.get_facecolor())
    buf.seek(0)
    plt.close(fig)  # close this figure explicitly so repeated calls don't leak
    return Image.open(buf)
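
# Full pipeline for one upload: decode -> analyze -> smart EQ -> gain ->
# compress -> limit -> export the WAV, the preset text file, and the
# waveform preview.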
def process_vocal(file):
    if file is None:
        raise ValueError("No file was uploaded.")
    audio = AudioSegment.from_file(file)
    audio.export("input.wav", format="wav")
    y, sr = librosa.load("input.wav", sr=None)
    rms = librosa.feature.rms(y=y)[0]
    zcr = librosa.feature.zero_crossing_rate(y=y)[0]
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    eq_audio, hp_cutoff, notch_freq = apply_smart_eq(audio, sr, y)
    preset_text, gain_db = generate_preset_text(
        np.mean(rms), np.mean(zcr), np.mean(centroid), hp_cutoff, notch_freq
    )
    with open("vocal_chain_preset.txt", "w") as f:
        f.write(preset_text)
    processed = eq_audio.apply_gain(gain_db)
    processed = effects.compress_dynamic_range(
        processed, threshold=-20.0, ratio=3.0, attack=5.0, release=50.0
    )
    processed = apply_limiter(processed, threshold_db=-1.0)
    processed.export("processed_output.wav", format="wav")
    waveform_plot = generate_waveform_plot("processed_output.wav")
    return "processed_output.wav", preset_text, "vocal_chain_preset.txt", waveform_plot
with gr.Blocks(css="""
body {
    background-color: #121212;
    color: white;
    font-family: 'Segoe UI', sans-serif;
}
.gr-button {
    background-color: #4f46e5;
    color: white;
    border-radius: 12px;
    border: none;
    padding: 8px 16px;
    font-weight: bold;
    transition: 0.3s;
}
.gr-button:hover {
    background-color: #6366f1;
}
.gr-textbox, .gr-audio, .gr-file, .gr-image {
    border-radius: 10px;
    background-color: #1e1e1e;
    color: white;
    border: 1px solid #6b21a8;
    width: 100%;
}
.gr-row {
    display: flex;
    justify-content: space-between;
    gap: 10px;
}
""") as interface:
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Your Vocal")
        submit_btn = gr.Button("Submit")
    with gr.Row():
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Processed Vocal")
        with gr.Column(scale=1):
            preset_box = gr.Textbox(label="Suggested Vocal Chain / Preset")
    with gr.Row():
        preset_file = gr.File(label="Download Preset (.txt)")
    with gr.Row():
        waveform_plot = gr.Image(label="Waveform Preview")
    submit_btn.click(
        fn=process_vocal,
        inputs=[audio_input],
        outputs=[audio_output, preset_box, preset_file, waveform_plot],
    )

if __name__ == "__main__":
    interface.launch()