Spaces:

TeszenAI
/

Audiomix

Sleeping

App Files Files Community

Audiomix / app.py

teszenofficial

Upload app.py

f5ef918 verified 2 months ago

raw

history blame contribute delete

20 kB

	import gradio as gr
	import librosa
	import numpy as np
	import soundfile as sf
	import scipy.signal
	import tempfile
	import os

	# ─────────────────────────────────────────────
	# AUDIO I/O & BASIC PROCESSING
	# ─────────────────────────────────────────────

	def load_audio(path, target_sr=44100):
	    """Decode the audio file at ``path`` as mono float32 samples.

	    The file is resampled to ``target_sr`` by ``librosa.load``.

	    Returns:
	        ``(samples, sample_rate)`` — the float32 sample array and the rate
	        reported by librosa.
	    """
	    samples, rate = librosa.load(path, sr=target_sr, mono=True)
	    mono32 = samples.astype(np.float32)
	    return mono32, rate


	def normalize(y, headroom_db=-1.0):
	    """Scale ``y`` so that its absolute peak sits at ``headroom_db`` dBFS.

	    Args:
	        y: sample array.
	        headroom_db: target peak level in decibels relative to full scale.

	    Returns:
	        The rescaled array. Empty input and near-silent input (peak below
	        1e-9) are returned unchanged because there is no peak to scale from.
	    """
	    # np.max raises on a zero-size array, so bail out before reducing.
	    if np.size(y) == 0:
	        return y
	    peak = np.max(np.abs(y))
	    if peak < 1e-9:
	        return y
	    target = 10 ** (headroom_db / 20.0)
	    return y * (target / peak)


	def anti_click_window(y, ramp_samples=256):
	    """Return a copy of ``y`` with a linear fade-in and fade-out applied.

	    Args:
	        y: sample array (not modified).
	        ramp_samples: length of each ramp in samples.

	    Returns:
	        A copy of ``y`` whose first and last ``ramp_samples`` samples are
	        multiplied by a 0→1 and a 1→0 ramp respectively. The ramp is
	        shortened to the signal length for short inputs, and the copy is
	        returned untouched when there is nothing to ramp.
	    """
	    out = y.copy()
	    # A ramp longer than the signal cannot be broadcast onto the slice, and
	    # a zero-length ramp would make out[-0:] address the whole array.
	    n = min(int(ramp_samples), len(out))
	    if n <= 0:
	        return out
	    ramp = np.linspace(0, 1, n).astype(np.float32)
	    out[:n] *= ramp
	    out[-n:] *= ramp[::-1]
	    return out


	# ─────────────────────────────────────────────
	# BPM DETECTION
	# ─────────────────────────────────────────────

	def detect_bpm(y, sr):
	    """Estimate the tempo of ``y`` in beats per minute with librosa's beat tracker.

	    The tempo value is converted to an array, flattened, and its first
	    element returned as a Python float, so scalar and array results are
	    both handled.
	    """
	    tempo = librosa.beat.beat_track(y=y, sr=sr)[0]
	    first_estimate = np.asarray(tempo).reshape(-1)[0]
	    return float(first_estimate)


	# ─────────────────────────────────────────────
	# KEY DETECTION
	# ─────────────────────────────────────────────

	def detect_key(y, sr):
	    """Guess the key of ``y`` by matching its mean chroma against scale masks.

	    The time-averaged CQT chromagram is dotted with binary major and
	    natural-minor scale masks rotated to each of the 12 pitch classes.

	    Returns:
	        ``(pitch_class, mode)`` where ``pitch_class`` is the 0-11 rotation
	        with the highest score and ``mode`` is ``"major"`` or ``"minor"``.
	        Major wins when the best major and best minor scores are equal.

	    NOTE(review): the minor mask is the major mask rotated by 9 steps, so
	    both score lists hold the same values and the tie rule always selects
	    "major" — confirm whether real mode detection is wanted here.
	    """
	    profile = librosa.feature.chroma_cqt(y=y, sr=sr).mean(axis=1)
	    major_mask = np.array([1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1], dtype=float)
	    minor_mask = np.array([1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0], dtype=float)

	    def rotation_scores(mask):
	        # Score of the mask transposed to each of the 12 possible tonics.
	        return [np.dot(np.roll(mask, step), profile) for step in range(12)]

	    major_scores = rotation_scores(major_mask)
	    minor_scores = rotation_scores(minor_mask)

	    major_wins = max(major_scores) >= max(minor_scores)
	    if not major_wins:
	        return int(np.argmax(minor_scores)), "minor"
	    return int(np.argmax(major_scores)), "major"


	def semitones_to_shift(key_a, key_b):
	    """Return the signed semitone distance from ``key_a`` to ``key_b``.

	    The distance is wrapped into the range -5..6 so the shorter way around
	    the 12-step circle is chosen; a six-step distance is reported as +6.
	    """
	    steps_up = (key_b - key_a) % 12
	    return steps_up - 12 if steps_up > 6 else steps_up


	def pitch_shift(y, sr, n_steps):
	    """Transpose ``y`` by ``n_steps`` semitones using librosa.

	    A shift of zero skips processing and returns ``y`` itself.
	    """
	    if n_steps != 0:
	        return librosa.effects.pitch_shift(y=y, sr=sr, n_steps=float(n_steps))
	    return y


	# ─────────────────────────────────────────────
	# SMART TRANSITION POINT (downbeat + breakdown)
	# ─────────────────────────────────────────────

	def find_best_transition_point(y_a, sr, transition_sec, hop=512):
	    """
	    Score every frame of track A and return the best mix-out point in samples.

	    Criteria:
	    • Low smoothed RMS energy → quieter passage, easier to mix out
	    • Low onset strength → not in the middle of a busy passage
	    • Beat-aligned (every 4th tracked beat) → rhythmic correctness
	    • Position: from 35 % of the track up to 4 s before the last frame at
	      which a full transition still fits, weighted towards 55 %

	    If that position window is empty (short track or long transition), the
	    search falls back to the last 20 frames in which the transition still
	    fits.

	    Args:
	        y_a: samples of the outgoing track.
	        sr: sample rate of ``y_a``.
	        transition_sec: transition length in seconds.
	        hop: analysis hop length in samples.

	    Returns:
	        The chosen frame index multiplied by ``hop`` (a sample offset);
	        0 when the track yields no analysis frames.
	    """
	    rms = librosa.feature.rms(y=y_a, hop_length=hop)[0]
	    onset = librosa.onset.onset_strength(y=y_a, sr=sr, hop_length=hop)
	    total_f = len(rms)
	    if total_f == 0:
	        # Nothing to score; also avoids dividing by sigma == 0 below.
	        return 0
	    trans_f = int(transition_sec * sr / hop)

	    # Smooth energy over 30 frames (~0.35 s at hop=512, sr=44100).
	    # np.convolve(mode='same') returns max(len(rms), len(kernel)) samples,
	    # so the kernel is capped to keep the result aligned with `onset`.
	    kernel_len = min(30, total_f)
	    smooth_rms = np.convolve(rms, np.ones(kernel_len) / kernel_len, mode='same')

	    # Score: favour low energy AND low onset density
	    score = (1.0 / (smooth_rms + 1e-6)) * (1.0 / (onset + 1e-6))

	    # Position window. w_end is computed WITHOUT a lower clamp first so the
	    # emptiness check below can actually trigger the fallback window.
	    w_start = int(total_f * 0.35)
	    w_end = total_f - trans_f - int(4 * sr / hop)
	    if w_end <= w_start:
	        w_start = max(0, total_f - trans_f - 20)
	        w_end = max(w_start + 1, total_f - trans_f)

	    # Gaussian position weight centred at 55 % of track
	    centre = int(total_f * 0.55)
	    sigma = total_f * 0.12
	    pos_w = np.exp(-0.5 * ((np.arange(total_f) - centre) / sigma) ** 2)
	    score = score * pos_w
	    score[:w_start] = 0
	    score[w_end:] = 0

	    # Restrict to every 4th tracked beat inside the window
	    _, beat_frames = librosa.beat.beat_track(y=y_a, sr=sr, hop_length=hop)
	    candidates = [int(f) for f in beat_frames[::4] if w_start <= f < w_end]

	    if candidates:
	        best_frame = max(candidates, key=lambda f: score[min(f, total_f - 1)])
	    else:
	        # No usable beat in the window: take the best-scoring frame instead.
	        best_frame = int(np.argmax(score))

	    return int(best_frame) * hop


	# ─────────────────────────────────────────────
	# GRADUAL BPM MORPH ENGINE
	# ─────────────────────────────────────────────

	def _fit_to_length(y, n):
	"""Trim or zero-pad array to exactly n samples."""
	n = int(n)
	if len(y) >= n:
	return y[:n].astype(np.float32)
	return np.pad(y, (0, n - len(y))).astype(np.float32)


	def build_gradual_stretch(y, sr, bpm_start, bpm_end, n_samples_target):
	    """
	    Produce a segment of exactly n_samples_target samples from ``y``,
	    processed with a stretch rate that follows a linear
	    bpm_start → bpm_end ramp.

	    Method: overlap-add of short (80 ms) Hann-windowed chunks, each
	    independently time-stretched by the instantaneous rate
	    ``bpm_start / bpm_now`` (clipped to 0.5–2.0).

	    Special cases:
	    • Empty input, or either tempo below 1 BPM (e.g. a failed detection):
	      no stretching is attempted; the input is only trimmed / zero-padded.
	    • Tempos within 0.5 BPM of each other: one constant-rate stretch.

	    Args:
	        y: source samples.
	        sr: sample rate of ``y``.
	        bpm_start: tempo at the start of the ramp.
	        bpm_end: tempo at the end of the ramp.
	        n_samples_target: exact length of the returned segment.

	    Returns:
	        float32 array of length ``n_samples_target``.

	    NOTE(review): the write position advances by the fixed HOP for every
	    chunk regardless of its rate, so the output timeline tracks the input
	    timeline and only the grain length changes. Also, with librosa's
	    convention (rate > 1 is faster) ``bpm_start / bpm_now`` slows chunks
	    down when bpm_end > bpm_start. Confirm both are intended before
	    relying on this for an audible tempo change.
	    """
	    # An unusable tempo previously pushed the rate to its clip limit, which
	    # left a large part of the zone as zero padding; pass the audio through.
	    if len(y) == 0 or bpm_start < 1 or bpm_end < 1:
	        return _fit_to_length(y, n_samples_target)

	    if abs(bpm_start - bpm_end) < 0.5:
	        rate = float(np.clip(bpm_start / bpm_end, 0.5, 2.0))
	        return _fit_to_length(librosa.effects.time_stretch(y, rate=rate),
	                              n_samples_target)

	    WIN = int(sr * 0.08)  # 80 ms analysis window
	    HOP = WIN // 2
	    N = max(1, (len(y) - WIN) // HOP + 1)

	    # Output buffer sized generously; it is grown on demand below.
	    out_buf = np.zeros(int(n_samples_target * 2 + sr), dtype=np.float32)
	    write_pos = 0

	    for i in range(N):
	        t = i / max(N - 1, 1)  # 0 → 1 across the chunks
	        bpm_now = bpm_start + (bpm_end - bpm_start) * t
	        rate = float(np.clip(bpm_start / max(bpm_now, 1), 0.5, 2.0))

	        r_start = i * HOP
	        r_end = min(r_start + WIN, len(y))
	        chunk = y[r_start:r_end]
	        if len(chunk) < 32:
	            # Too short to be worth stretching; skip it.
	            continue

	        stretched = librosa.effects.time_stretch(chunk, rate=rate)

	        # Hann envelope for smooth overlap-add (avoids clicks between chunks)
	        env = np.hanning(len(stretched)).astype(np.float32)
	        stretched = (stretched * env).astype(np.float32)

	        w_end = write_pos + len(stretched)
	        if w_end > len(out_buf):
	            extra = w_end - len(out_buf) + sr
	            out_buf = np.concatenate([out_buf, np.zeros(extra, dtype=np.float32)])
	        out_buf[write_pos:w_end] += stretched
	        write_pos += HOP  # advance by HOP (overlap-add)

	    result = out_buf[:write_pos + WIN]
	    return _fit_to_length(result, n_samples_target)


	# ─────────────────────────────────────────────
	# EQ / FILTERS
	# ─────────────────────────────────────────────

	def lowpass_filter(y, sr, cutoff_hz=300, order=4):
	    """Zero-phase Butterworth low-pass of ``y`` at ``cutoff_hz``.

	    Args:
	        y: sample array.
	        sr: sample rate of ``y`` in Hz.
	        cutoff_hz: cutoff frequency in Hz.
	        order: Butterworth order (applied forward and backward by filtfilt).

	    Returns:
	        The filtered signal as float32. ``y`` itself is returned when it is
	        empty or when the cutoff is at or above Nyquist.
	    """
	    nyq = sr / 2.0
	    if cutoff_hz >= nyq or len(y) == 0:
	        return y
	    b, a = scipy.signal.butter(order, cutoff_hz / nyq, btype='low')
	    # filtfilt requires len(y) > padlen. Its default padlen is
	    # 3 * max(len(a), len(b)); shrink it for short inputs instead of raising.
	    padlen = min(3 * max(len(a), len(b)), len(y) - 1)
	    return scipy.signal.filtfilt(b, a, y, padlen=padlen).astype(np.float32)


	def highpass_filter(y, sr, cutoff_hz=300, order=4):
	    """Zero-phase Butterworth high-pass of ``y`` at ``cutoff_hz``.

	    Args:
	        y: sample array.
	        sr: sample rate of ``y`` in Hz.
	        cutoff_hz: cutoff frequency in Hz.
	        order: Butterworth order (applied forward and backward by filtfilt).

	    Returns:
	        The filtered signal as float32. Empty input is returned as-is. A
	        cutoff at or above Nyquist yields silence, since no representable
	        frequency lies above it.
	    """
	    nyq = sr / 2.0
	    if len(y) == 0:
	        return y
	    if cutoff_hz >= nyq:
	        # Returning y here would make low band + high band count the
	        # signal twice when both filters hit their Nyquist guard.
	        return np.zeros_like(y, dtype=np.float32)
	    b, a = scipy.signal.butter(order, cutoff_hz / nyq, btype='high')
	    # filtfilt requires len(y) > padlen. Its default padlen is
	    # 3 * max(len(a), len(b)); shrink it for short inputs instead of raising.
	    padlen = min(3 * max(len(a), len(b)), len(y) - 1)
	    return scipy.signal.filtfilt(b, a, y, padlen=padlen).astype(np.float32)


	def make_crossfade_curve(length, style="smooth"):
	    """Build complementary fade-out / fade-in gain curves.

	    Args:
	        length: number of samples in each curve; 0 gives two empty curves.
	        style: ``"smooth"`` for a raised-cosine fade; any other value gives
	            an exponential fade that drops quickly at the start.

	    Returns:
	        ``(fade_out, fade_in)`` as float32 arrays. In both styles the
	        curves sum to 1 at every sample, ``fade_out`` runs from 1 to 0 and
	        ``fade_in`` from 0 to 1.
	    """
	    length = max(int(length), 0)
	    if style == "smooth":
	        t = np.linspace(0, np.pi, length)
	        fade_out = 0.5 * (1 + np.cos(t))
	        fade_in = 0.5 * (1 - np.cos(t))
	    else:
	        x = np.linspace(0, 1, length)
	        # exp(-4x) ends at exp(-4) ≈ 0.018 rather than 0, which left a gain
	        # step at the end of the transition; rescale so the curve spans
	        # exactly 1 → 0.
	        floor = np.exp(-4.0)
	        fade_out = (np.exp(-4 * x) - floor) / (1.0 - floor)
	        fade_in = 1.0 - fade_out
	    return fade_out.astype(np.float32), fade_in.astype(np.float32)


	def eq_crossfade(seg_a, seg_b, sr, fade_out, fade_in):
	    """Crossfade two segments after splitting each into a low and a high band.

	    Each segment is split with ``lowpass_filter`` and ``highpass_filter``
	    at their default cutoff. ``fade_out`` scales both bands of ``seg_a``
	    and ``fade_in`` scales both bands of ``seg_b``; the four weighted
	    bands are then summed.

	    Returns:
	        The blended segment as float32.
	    """
	    low_a, high_a = lowpass_filter(seg_a, sr), highpass_filter(seg_a, sr)
	    low_b, high_b = lowpass_filter(seg_b, sr), highpass_filter(seg_b, sr)

	    low_mix = low_a * fade_out + low_b * fade_in
	    high_mix = high_a * fade_out + high_b * fade_in
	    return (low_mix + high_mix).astype(np.float32)


	# ─────────────────────────────────────────────
	# MAIN MIX ENGINE
	# ─────────────────────────────────────────────

	def automix(file_a_path, file_b_path, transition_sec=10, mix_style="Smooth Mix"):
	    """Mix track A into track B and write the result to a 16-bit WAV file.

	    Steps: load and normalize both files at 44.1 kHz, detect tempo and
	    key, pitch-shift track B so its detected key matches track A, choose a
	    mix-out point in A, crossfade ``transition_sec`` seconds of A with the
	    opening of B, then append the remainder of B.

	    Args:
	        file_a_path: path of the outgoing track.
	        file_b_path: path of the incoming track.
	        transition_sec: crossfade length in seconds.
	        mix_style: a style name containing "Aggressive" selects the
	            exponential fade; anything else uses the smooth fade.

	    Returns:
	        ``(out_path, bpm_a, bpm_b, key_a, key_b, shift, trans_time_sec)``.
	        ``shift`` is ``semitones_to_shift(key_a, key_b)``; track B is
	        transposed by ``-shift``. ``trans_time_sec`` is where the
	        transition starts in track A. ``out_path`` is a newly created file
	        in the system temp directory; it is not deleted here.
	    """
	    SR = 44100

	    # 1 ── Load & normalize ───────────────────────────────────────────────
	    y_a, _ = load_audio(file_a_path, SR)
	    y_b, _ = load_audio(file_b_path, SR)
	    y_a = normalize(y_a)
	    y_b = normalize(y_b)

	    # 2 ── BPM detection ──────────────────────────────────────────────────
	    bpm_a = detect_bpm(y_a, SR)
	    bpm_b = detect_bpm(y_b, SR)

	    # 3 ── Key / pitch correction on track B ──────────────────────────────
	    key_a, _ = detect_key(y_a, SR)
	    key_b, _ = detect_key(y_b, SR)
	    shift = semitones_to_shift(key_a, key_b)
	    if abs(shift) > 0:
	        # shift is key_b - key_a, so moving B onto A's key needs -shift.
	        y_b = pitch_shift(y_b, SR, -shift)
	        y_b = normalize(y_b)

	    # 4 ── Smart transition point ─────────────────────────────────────────
	    trans_samples = int(transition_sec * SR)
	    trans_start = find_best_transition_point(y_a, SR, transition_sec)
	    # Keep the whole transition inside track A whenever A is long enough.
	    trans_start = int(np.clip(trans_start, 0,
	                              max(0, len(y_a) - trans_samples - 1)))
	    trans_end = trans_start + trans_samples

	    # 5 ── Gradual BPM morph segments ─────────────────────────────────────
	    #
	    # zone_a : Track A's outro, processed with a bpm_a → bpm_b ramp
	    # zone_b : Track B's intro, processed with the mirrored bpm_b → bpm_a ramp
	    # After the transition, track B continues unmodified (tail_b).
	    #
	    seg_a_raw = y_a[trans_start:trans_end]
	    zone_a = build_gradual_stretch(seg_a_raw, SR,
	                                   bpm_start=bpm_a,
	                                   bpm_end=bpm_b,
	                                   n_samples_target=trans_samples)

	    if len(y_b) < trans_samples:
	        # Track B is shorter than the transition: pad its intro with silence.
	        seg_b_raw = np.pad(y_b, (0, trans_samples - len(y_b)))
	    else:
	        seg_b_raw = y_b[:trans_samples]

	    zone_b = build_gradual_stretch(seg_b_raw, SR,
	                                   bpm_start=bpm_b,
	                                   bpm_end=bpm_a,
	                                   n_samples_target=trans_samples)

	    # Track B tail plays at its ORIGINAL natural tempo (no stretching)
	    tail_b = y_b[trans_samples:] if len(y_b) > trans_samples else np.array([], dtype=np.float32)

	    # 6 ── EQ crossfade blend ─────────────────────────────────────────────
	    style_key = "aggressive" if "Aggressive" in mix_style else "smooth"
	    fade_out, fade_in = make_crossfade_curve(trans_samples, style_key)
	    blend = eq_crossfade(zone_a, zone_b, SR, fade_out, fade_in)

	    # 7 ── Assemble final mix ─────────────────────────────────────────────
	    pre_a = y_a[:trans_start]
	    mix = np.concatenate([pre_a, blend, tail_b])

	    # 8 ── Polish ─────────────────────────────────────────────────────────
	    mix = anti_click_window(mix)
	    mix = normalize(mix, headroom_db=-0.5)

	    # A fixed file name in the shared temp directory let concurrent requests
	    # overwrite each other's result, so every call gets its own file.
	    fd, out_path = tempfile.mkstemp(prefix="automix_", suffix=".wav")
	    os.close(fd)
	    sf.write(out_path, mix, SR, subtype='PCM_16')

	    trans_time_sec = trans_start / SR
	    return out_path, bpm_a, bpm_b, key_a, key_b, shift, trans_time_sec


	# ─────────────────────────────────────────────
	# GRADIO UI
	# ─────────────────────────────────────────────

	# Pitch-class names indexed 0-11; run_mix uses them to label the detected keys.
	KEY_NAMES = ["C","C#","D","D#","E","F","F#","G","G#","A","A#","B"]

	# Stylesheet for the UI. It is injected into the page inside a <style> tag
	# by the Blocks layout further down in this file.
	CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=DM+Mono:wght@300;400;500&display=swap');

	:root {
	--bg: #0a0a0f;
	--surface: #111118;
	--border: #1f1f2e;
	--accent1: #ff3cac;
	--accent2: #00f0ff;
	--accent3: #7928ca;
	--text: #e8e8f0;
	--muted: #6b6b80;
	--radius: 12px;
	}

	body, .gradio-container {
	background: var(--bg) !important;
	font-family: 'DM Mono', monospace !important;
	color: var(--text) !important;
	}

	.app-header {
	text-align: center;
	padding: 48px 20px 32px;
	position: relative;
	}
	.app-header::before {
	content: '';
	position: absolute;
	inset: 0;
	background: radial-gradient(ellipse 80% 60% at 50% 0%, rgba(121,40,202,.18) 0%, transparent 70%);
	pointer-events: none;
	}
	.app-title {
	font-family: 'Syne', sans-serif !important;
	font-size: clamp(2.2rem, 5vw, 3.8rem) !important;
	font-weight: 800 !important;
	letter-spacing: -1px;
	background: linear-gradient(135deg, var(--accent1) 0%, var(--accent2) 55%, var(--accent3) 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	background-clip: text;
	margin: 0 0 8px;
	}
	.app-sub {
	color: var(--muted);
	font-size: .85rem;
	letter-spacing: .12em;
	text-transform: uppercase;
	}
	.panel {
	background: var(--surface);
	border: 1px solid var(--border);
	border-radius: var(--radius);
	padding: 24px;
	}
	.mix-btn {
	background: linear-gradient(135deg, var(--accent1), var(--accent3)) !important;
	border: none !important;
	color: #fff !important;
	font-family: 'Syne', sans-serif !important;
	font-weight: 700 !important;
	font-size: 1rem !important;
	letter-spacing: .08em !important;
	padding: 14px 36px !important;
	border-radius: 8px !important;
	cursor: pointer !important;
	transition: opacity .2s, transform .15s !important;
	}
	.mix-btn:hover { opacity: .88; transform: translateY(-1px); }
	.stats-box {
	background: #0d0d14 !important;
	border: 1px solid var(--border) !important;
	border-radius: 8px !important;
	padding: 16px 20px !important;
	font-size: .78rem !important;
	color: var(--muted) !important;
	line-height: 1.8 !important;
	white-space: pre-wrap !important;
	}
	.gr-block, .gr-box { background: var(--surface) !important; border-color: var(--border) !important; }
	label { color: var(--muted) !important; font-size: .78rem !important; letter-spacing: .1em !important; text-transform: uppercase !important; }
	input[type=range] { accent-color: var(--accent1); }
	select, .gr-dropdown { background: #0d0d14 !important; color: var(--text) !important; border-color: var(--border) !important; }
	audio { width: 100%; border-radius: 8px; }
	.divider { border: none; border-top: 1px solid var(--border); margin: 24px 0; }
	"""


	def run_mix(file_a, file_b, transition_sec, mix_style):
	    """Gradio callback: run ``automix`` and format a text report.

	    Args:
	        file_a: path of the outgoing track, or None if nothing was uploaded.
	        file_b: path of the incoming track, or None if nothing was uploaded.
	        transition_sec: transition length from the slider (cast to int).
	        mix_style: selected mix style name.

	    Returns:
	        ``(output_path, report_text)``. ``output_path`` is None when a track
	        is missing or when mixing fails; in the failure case the report
	        holds the error message and traceback instead of the statistics.
	    """
	    if file_a is None or file_b is None:
	        return None, "⚠ Please upload both tracks."
	    try:
	        out_path, bpm_a, bpm_b, key_a, key_b, shift, trans_sec = automix(
	            file_a, file_b,
	            transition_sec=int(transition_sec),
	            mix_style=mix_style
	        )
	        # automix reports shift as (key_b - key_a) but transposes track B by
	        # -shift, so negate it to display the correction actually applied.
	        applied_shift = -shift
	        mm = int(trans_sec // 60)
	        ss = int(trans_sec % 60)
	        stats = (
	            f"TRACK A → BPM: {bpm_a:.1f} KEY: {KEY_NAMES[key_a]}\n"
	            f"TRACK B → BPM: {bpm_b:.1f} KEY: {KEY_NAMES[key_b]}\n"
	            f"─────────────────────────────────────────\n"
	            f"TRANSITION START : {mm:02d}:{ss:02d} (auto-detected downbeat)\n"
	            f"TRANSITION LENGTH : {int(transition_sec)} seconds\n"
	            f"MIX STYLE : {mix_style.upper()}\n"
	            f"BPM MORPH : {bpm_a:.1f} ↔ {bpm_b:.1f} (gradual during transition)\n"
	            f" Track B reverts to {bpm_b:.1f} BPM after mix\n"
	            f"PITCH CORRECTION : {'+' if applied_shift >= 0 else ''}{applied_shift} semitones\n"
	            f"OUTPUT : 44100 Hz · 16-bit PCM WAV"
	        )
	        return out_path, stats
	    except Exception as e:
	        # UI boundary: report the failure in the stats box instead of raising.
	        import traceback
	        return None, f"Error: {str(e)}\n\n{traceback.format_exc()}"


	def _caption(text, gap_px):
	    """Return the small uppercase caption markup shown above a UI section."""
	    return ('<div style="color:#6b6b80;font-size:.75rem;text-transform:uppercase;'
	            f'letter-spacing:.1em;margin-bottom:{gap_px}px;">{text}</div>')


	# Horizontal rule placed between the UI sections.
	_DIVIDER_HTML = '<hr class="divider"/>'


	# Page layout: header, two upload panels, controls row, output row, footer.
	with gr.Blocks(title="AI AutoMix DJ") as demo:

	    # Inject the stylesheet first so every later component picks it up.
	    gr.HTML("<style>" + CSS + "</style>")

	    gr.HTML("""
	<div class="app-header">
	<div class="app-title">AI AutoMix DJ</div>
	<div class="app-sub">Gradual BPM Morph · Smart Downbeat Detection · EQ Harmonic Mixing</div>
	</div>
	""")

	    # Upload panels for the two tracks.
	    with gr.Row():
	        with gr.Column(scale=1):
	            gr.HTML(_caption("Track A — Outgoing", 6))
	            file_a = gr.Audio(label="", type="filepath", elem_classes=["panel"])
	        with gr.Column(scale=1):
	            gr.HTML(_caption("Track B — Incoming", 6))
	            file_b = gr.Audio(label="", type="filepath", elem_classes=["panel"])

	    gr.HTML(_DIVIDER_HTML)

	    # Mix controls: transition length, style and the trigger button.
	    with gr.Row():
	        with gr.Column(scale=2):
	            transition_slider = gr.Slider(
	                minimum=5, maximum=15, value=10, step=1,
	                label="Transition Duration (seconds)",
	            )
	        with gr.Column(scale=2):
	            mix_style = gr.Dropdown(
	                choices=["Smooth Mix", "Aggressive Mix"],
	                value="Smooth Mix", label="Mix Style",
	            )
	        with gr.Column(scale=1):
	            mix_btn = gr.Button("▶ Mix Now", elem_classes=["mix-btn"])

	    gr.HTML(_DIVIDER_HTML)

	    # Result player next to the textual analysis.
	    with gr.Row():
	        with gr.Column(scale=3):
	            gr.HTML(_caption("Output Mix", 8))
	            output_audio = gr.Audio(label="", type="filepath")
	        with gr.Column(scale=2):
	            gr.HTML(_caption("Mix Analysis", 8))
	            stats_box = gr.Textbox(
	                value="Waiting for mix…", label="",
	                lines=9, max_lines=12, elem_classes=["stats-box"],
	            )

	    # Wire the button to the mixing callback.
	    mix_btn.click(
	        fn=run_mix,
	        inputs=[file_a, file_b, transition_slider, mix_style],
	        outputs=[output_audio, stats_box],
	    )

	    gr.HTML("""
	<div style="text-align:center;margin-top:32px;color:#2a2a3a;font-size:.72rem;letter-spacing:.12em;">
	AI AUTOMIX DJ · GRADUAL BPM MORPH ENGINE · LIBROSA · GRADIO · SCIPY
	</div>
	""")


	if __name__ == "__main__":
	    demo.launch()