# AI_Vocal_Chain / app.py
import gradio as gr
from pydub import AudioSegment, effects
import numpy as np
import librosa
import matplotlib
matplotlib.use("Agg")  # render plots off-screen; the Space has no display server
import matplotlib.pyplot as plt
import io
from PIL import Image
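
# Analysis happens in librosa (RMS, zero-crossing rate, spectral centroid,
# pitch); processing happens in pydub (EQ, compression, limiting). The
# spectral centroid is measured but not yet used by any preset rule.
# Note: pydub needs ffmpeg on the PATH to decode non-WAV uploads.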
def generate_preset_text(rms, zcr, centroid, hp_cutoff, notch_freq):
    preset = "\U0001F39B️ AI Vocal Chain Preset (FL Studio Style)\n\n"
    if rms < 0.01:
        preset += "- Gain: +6dB\n"
        gain_db = 6
    else:
        preset += "- Gain: OK\n"
        gain_db = 0
    if zcr > 0.08:
        preset += "- De-esser: Target 5kHz–8kHz\n"
    else:
        preset += "- De-esser: Not needed\n"
    preset += f"- High-pass filter at {hp_cutoff}Hz (remove mud)\n"
    preset += f"- Notch EQ: -5dB around {int(notch_freq)}Hz (reduce harshness)\n"
    preset += "- High-shelf boost: +4dB above 10kHz (add air)\n"
    preset += "- Fruity Compressor: Ratio 3:1, Threshold -20dB, Fast Attack\n"
    preset += "- Limiter: Output ceiling -1dB (prevent clipping)\n"
    return preset, gain_db
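
# Estimate the "harshest" frequency: average the STFT power spectrum over
# time and return the strongest bin inside freq_range (2-5 kHz by default).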
def find_harsh_frequency(y, sr, freq_range=(2000, 5000)):
    S = np.abs(librosa.stft(y))**2
    freqs = librosa.fft_frequencies(sr=sr)
    power = np.mean(S, axis=1)
    mask = (freqs >= freq_range[0]) & (freqs <= freq_range[1])
    freqs_in_range = freqs[mask]
    power_in_range = power[mask]
    if len(freqs_in_range) == 0:
        return 3000
    peak_idx = np.argmax(power_in_range)
    return freqs_in_range[peak_idx]
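
# pydub exposes no true notch filter, so apply_notch approximates one:
# isolate the band with pydub's first-order high/low-pass filters,
# phase-invert it, and mix it back at a level chosen to leave roughly
# reduction_db of residual gain in the band. A rough sketch of a notch,
# not a surgical EQ cut.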
def apply_notch(audio, center_freq, width=200, reduction_db=-5):
    notch_band = audio.high_pass_filter(center_freq - width).low_pass_filter(center_freq + width)
    # Mixing the inverted band at linear gain a leaves in-band amplitude
    # (1 - a), so pick a = 1 - 10**(reduction_db / 20).
    target_linear = 10 ** (reduction_db / 20.0)
    mix_gain_db = 20 * np.log10(max(1.0 - target_linear, 1e-6))
    notch_cut = notch_band.invert_phase().apply_gain(mix_gain_db)
    return audio.overlay(notch_cut)
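
# "Smart EQ": place the high-pass cutoff just below the lowest pitch
# librosa.yin detects, notch the harshest 2-5 kHz peak, and mix in a
# +4 dB copy of the >10 kHz content as a crude high-shelf "air" boost.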
def apply_smart_eq(audio, sr, y):
    try:
        pitch = librosa.yin(y, fmin=50, fmax=300, sr=sr)
        min_pitch = np.nanmin(pitch)
        hp_cutoff = max(70, int(min_pitch * 0.8)) if not np.isnan(min_pitch) else 80
    except Exception:
        hp_cutoff = 80  # safe default if pitch tracking fails
    notch_freq = find_harsh_frequency(y, sr)
    hp_filtered = audio.high_pass_filter(hp_cutoff)
    notched = apply_notch(hp_filtered, center_freq=notch_freq, width=200, reduction_db=-5)
    air = audio.high_pass_filter(10000).apply_gain(+4)
    eq_result = notched.overlay(air)
    return eq_result, hp_cutoff, notch_freq
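
# Static ceiling rather than a true lookahead limiter: normalize to full
# scale, then apply one fixed gain reduction so the peak sits at
# threshold_db.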
def apply_limiter(audio, threshold_db=-1.0):
    normalized = effects.normalize(audio)
    peak_db = normalized.max_dBFS
    if peak_db > threshold_db:
        reduction = threshold_db - peak_db
        limited = normalized.apply_gain(reduction)
    else:
        limited = normalized
    return limited
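
# Plot the processed waveform on the app's dark background and return it
# as a PIL image for the gr.Image component.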
def generate_waveform_plot(file_path):
    y, sr = librosa.load(file_path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    times = np.linspace(0, duration, num=len(y))
    fig, ax = plt.subplots(figsize=(10, 2), facecolor="#1e1e1e")
    ax.plot(times, y, color="violet", linewidth=0.8)
    ax.set_xlim([0, duration])
    ax.set_ylim([-1, 1])
    ax.set_xlabel("Time (s)", color="white")
    ax.set_ylabel("Amplitude", color="white")
    ax.set_title("Processed Vocal Waveform", color="white")
    ax.tick_params(colors="white")
    fig.patch.set_facecolor("#1e1e1e")
    ax.set_facecolor("#1e1e1e")
    fig.tight_layout()
    buf = io.BytesIO()
    fig.savefig(buf, format="png", facecolor=fig.get_facecolor())
    buf.seek(0)
    plt.close(fig)  # close this figure explicitly so repeated calls don't leak
    return Image.open(buf)
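
# Full pipeline for one upload: decode -> analyze -> smart EQ -> gain ->
# compress -> limit -> export the WAV, the preset text file, and the
# waveform preview.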
def process_vocal(file):
    if file is None:
        raise ValueError("No file was uploaded.")
    audio = AudioSegment.from_file(file)
    audio.export("input.wav", format="wav")
    y, sr = librosa.load("input.wav", sr=None)
    rms = librosa.feature.rms(y=y)[0]
    zcr = librosa.feature.zero_crossing_rate(y=y)[0]
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    eq_audio, hp_cutoff, notch_freq = apply_smart_eq(audio, sr, y)
    preset_text, gain_db = generate_preset_text(
        np.mean(rms), np.mean(zcr), np.mean(centroid), hp_cutoff, notch_freq
    )
    with open("vocal_chain_preset.txt", "w") as f:
        f.write(preset_text)
    processed = eq_audio.apply_gain(gain_db)
    processed = effects.compress_dynamic_range(
        processed, threshold=-20.0, ratio=3.0, attack=5.0, release=50.0
    )
    processed = apply_limiter(processed, threshold_db=-1.0)
    processed.export("processed_output.wav", format="wav")
    waveform_plot = generate_waveform_plot("processed_output.wav")
    return "processed_output.wav", preset_text, "vocal_chain_preset.txt", waveform_plot
with gr.Blocks(css="""
body {
    background-color: #121212;
    color: white;
    font-family: 'Segoe UI', sans-serif;
}
.gr-button {
    background-color: #4f46e5;
    color: white;
    border-radius: 12px;
    border: none;
    padding: 8px 16px;
    font-weight: bold;
    transition: 0.3s;
}
.gr-button:hover {
    background-color: #6366f1;
}
.gr-textbox, .gr-audio, .gr-file, .gr-image {
    border-radius: 10px;
    background-color: #1e1e1e;
    color: white;
    border: 1px solid #6b21a8;
    width: 100%;
}
.gr-row {
    display: flex;
    justify-content: space-between;
    gap: 10px;
}
""") as interface:
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Your Vocal")
        submit_btn = gr.Button("Submit")
    with gr.Row():
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Processed Vocal")
        with gr.Column(scale=1):
            preset_box = gr.Textbox(label="Suggested Vocal Chain / Preset")
    with gr.Row():
        preset_file = gr.File(label="Download Preset (.txt)")
    with gr.Row():
        waveform_plot = gr.Image(label="Waveform Preview")
    submit_btn.click(
        fn=process_vocal,
        inputs=[audio_input],
        outputs=[audio_output, preset_box, preset_file, waveform_plot],
    )

if __name__ == "__main__":
    interface.launch()