Audiomix / app.py
teszenofficial's picture
Upload app.py
f5ef918 verified
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import scipy.signal
import tempfile
import os
# ─────────────────────────────────────────────
# AUDIO I/O & BASIC PROCESSING
# ─────────────────────────────────────────────
def load_audio(path, target_sr=44100):
    """Decode an audio file into a mono float32 signal.

    ``librosa.load`` is asked to resample to ``target_sr`` and to down-mix
    to a single channel.

    Args:
        path: Location of the audio file to read.
        target_sr: Sample rate requested from librosa.

    Returns:
        ``(samples, sample_rate)`` where ``samples`` is cast to float32 and
        ``sample_rate`` is the rate reported by librosa.
    """
    samples, rate = librosa.load(path, sr=target_sr, mono=True)
    samples = samples.astype(np.float32)
    return samples, rate
def normalize(y, headroom_db=-1.0):
    """Peak-normalize a signal so its loudest sample sits at ``headroom_db``.

    Args:
        y: Array of audio samples.
        headroom_db: Target peak level in dB relative to full scale (1.0).

    Returns:
        The rescaled signal. Empty input and (near-)silent input, whose peak
        is below 1e-9, are returned unchanged because there is nothing
        meaningful to scale.
    """
    # np.max raises on a zero-size array, so handle empty audio explicitly.
    if np.size(y) == 0:
        return y
    peak = np.max(np.abs(y))
    if peak < 1e-9:
        return y
    target = 10 ** (headroom_db / 20.0)
    return y * (target / peak)
def anti_click_window(y, ramp_samples=256):
    """Apply a linear fade-in and fade-out to the ends of a signal.

    Args:
        y: Array of audio samples; it is copied, not modified in place.
        ramp_samples: Requested ramp length in samples for each end.

    Returns:
        A copy of ``y`` whose first samples are multiplied by a 0 -> 1 ramp
        and whose last samples are multiplied by a 1 -> 0 ramp. When the
        signal is shorter than ``ramp_samples`` the ramp is shortened to the
        signal length; an empty signal or a non-positive ramp length yields
        an unmodified copy.
    """
    out = y.copy()
    # Clamp the ramp to the clip length so the in-place multiply below never
    # sees mismatched shapes on very short clips.
    n = min(int(ramp_samples), len(out))
    if n <= 0:
        return out
    ramp = np.linspace(0, 1, n).astype(np.float32)
    out[:n] *= ramp
    out[-n:] *= ramp[::-1]
    return out
# ─────────────────────────────────────────────
# BPM DETECTION
# ─────────────────────────────────────────────
def detect_bpm(y, sr):
    """Return the tempo estimated by ``librosa.beat.beat_track`` as a float.

    The tempo value returned by librosa is coerced to an array and its first
    element is taken, so both scalar and array-valued results are accepted.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.

    Returns:
        The first tempo estimate as a Python float.
    """
    tempo, _beats = librosa.beat.beat_track(y=y, sr=sr)
    first_estimate = np.asarray(tempo).flatten()[0]
    return float(first_estimate)
# ─────────────────────────────────────────────
# KEY DETECTION
# ─────────────────────────────────────────────
def detect_key(y, sr):
    """Estimate the key of a signal by matching scale templates to chroma.

    The time-averaged CQT chroma vector is scored against binary major and
    natural-minor scale templates rotated to each of the 12 pitch classes;
    the best-scoring rotation gives the root.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.

    Returns:
        ``(root, mode)`` where ``root`` is the pitch-class index (0-11) of
        the best rotation and ``mode`` is ``"major"`` or ``"minor"``.

    NOTE(review): the minor template rotated by 9 steps is element-for-element
    the major template, so both score lists hold the same twelve values and
    the ``>=`` comparison selects "major" for any finite chroma. Minor-key
    material is therefore reported as its relative major. Callers that use
    the root for pitch matching should be reviewed before this is changed.
    """
    mean_chroma = librosa.feature.chroma_cqt(y=y, sr=sr).mean(axis=1)

    major_template = np.array([1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1], dtype=float)
    minor_template = np.array([1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0], dtype=float)

    def rotation_scores(template):
        # One score per candidate root: template rotated to that root, then
        # projected onto the averaged chroma.
        return [np.dot(np.roll(template, root), mean_chroma) for root in range(12)]

    major_scores = rotation_scores(major_template)
    minor_scores = rotation_scores(minor_template)

    if max(major_scores) >= max(minor_scores):
        return int(np.argmax(major_scores)), "major"
    return int(np.argmax(minor_scores)), "minor"
def semitones_to_shift(key_a, key_b):
    """Return the shortest signed semitone distance from ``key_a`` to ``key_b``.

    Pitch classes wrap around at 12, so the result always lies in the range
    -5 .. +6 (a tritone is reported as +6).
    """
    # Shifting by 5 before the modulo and back afterwards folds the upper
    # half of the octave (7..11) onto -5..-1 while leaving 0..6 untouched.
    return (key_b - key_a + 5) % 12 - 5
def pitch_shift(y, sr, n_steps):
    """Transpose a signal by ``n_steps`` semitones using librosa.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.
        n_steps: Number of semitones to shift; passed to librosa as a float.

    Returns:
        The shifted signal, or ``y`` itself (same object) when ``n_steps``
        is zero so that no processing is done.
    """
    if n_steps != 0:
        return librosa.effects.pitch_shift(y=y, sr=sr, n_steps=float(n_steps))
    return y
# ─────────────────────────────────────────────
# SMART TRANSITION POINT (downbeat + breakdown)
# ─────────────────────────────────────────────
def find_best_transition_point(y_a, sr, transition_sec, hop=512):
    """
    Score every frame of track A and pick the best mix-out point.

    Criteria:
      • Low RMS energy      → breakdown / intro of chorus = easier to mix out
      • Low onset strength  → not mid-drum-fill
      • Beat-aligned (every 4th detected beat) → rhythmic correctness
      • Position: from 35 % of the track up to the last frame that still
        leaves room for the transition plus 4 s, weighted towards 55 %

    Args:
        y_a: Samples of the outgoing track.
        sr: Sample rate of ``y_a``.
        transition_sec: Length of the planned transition in seconds.
        hop: Hop length in samples used for all frame-based features.

    Returns:
        The chosen transition start as a sample index (frame index * hop).
    """
    rms = librosa.feature.rms(y=y_a, hop_length=hop)[0]
    onset = librosa.onset.onset_strength(y=y_a, sr=sr, hop_length=hop)
    total_f = len(rms)
    trans_f = int(transition_sec * sr / hop)

    # Smooth energy with a 30-frame moving average (about 0.35 s at 44.1 kHz
    # with hop=512) so single loud frames do not dominate the score.
    smooth_rms = np.convolve(rms, np.ones(30) / 30, mode='same')

    # Score: favour low energy AND low onset density
    score = (1.0 / (smooth_rms + 1e-6)) * (1.0 / (onset + 1e-6))

    # Position window. The upper bound is deliberately NOT clamped here:
    # clamping it to w_start + 1 would make the short-track fallback below
    # unreachable and pin the result to the 35 % frame.
    w_start = int(total_f * 0.35)
    w_end = total_f - trans_f - int(4 * sr / hop)
    if w_end <= w_start:
        # Track too short for the preferred window: search the last frames
        # that still leave room for the transition itself.
        w_start = max(0, total_f - trans_f - 20)
        w_end = max(w_start + 1, total_f - trans_f)

    # Gaussian position weight centred at 55 % of track
    centre = int(total_f * 0.55)
    sigma = total_f * 0.12
    pos_w = np.exp(-0.5 * ((np.arange(total_f) - centre) / sigma) ** 2)
    score = score * pos_w
    score[:w_start] = 0
    score[w_end:] = 0

    # Restrict to every 4th detected beat as an approximation of downbeats
    _, beat_frames = librosa.beat.beat_track(y=y_a, sr=sr, hop_length=hop)
    downbeats = beat_frames[::4]
    candidates = [int(f) for f in downbeats if w_start <= f < w_end]
    if candidates:
        best_frame = max(candidates, key=lambda f: score[min(f, total_f - 1)])
    else:
        # No beat landed inside the window: fall back to the raw score peak.
        best_frame = int(np.argmax(score))
    return int(best_frame) * hop
# ─────────────────────────────────────────────
# GRADUAL BPM MORPH ENGINE
# ─────────────────────────────────────────────
def _fit_to_length(y, n):
"""Trim or zero-pad array to exactly n samples."""
n = int(n)
if len(y) >= n:
return y[:n].astype(np.float32)
return np.pad(y, (0, n - len(y))).astype(np.float32)
def build_gradual_stretch(y, sr, bpm_start, bpm_end, n_samples_target):
    """
    Produce a segment of exactly n_samples_target samples whose playback
    tempo is meant to morph linearly from bpm_start → bpm_end.

    Method: overlap-add of short (80 ms) Hann-windowed chunks, each
    independently time-stretched by the instantaneous rate.

    Args:
        y: Source samples for the segment.
        sr: Sample rate of ``y``.
        bpm_start: Tempo at the beginning of the segment.
        bpm_end: Tempo at the end of the segment.
        n_samples_target: Exact length of the returned array.

    Returns:
        A float32 array of ``n_samples_target`` samples (trimmed or
        zero-padded as needed).
    """
    # A tempo below 1 BPM means detection produced no usable value. Deriving
    # a rate from it would clip to the 0.5x / 2.0x extremes, so pass the
    # audio through unstretched instead.
    if bpm_start < 1 or bpm_end < 1:
        return _fit_to_length(y, n_samples_target)

    # Tempos within half a BPM: one constant-rate stretch is enough.
    if abs(bpm_start - bpm_end) < 0.5:
        rate = float(np.clip(bpm_start / bpm_end, 0.5, 2.0))
        return _fit_to_length(librosa.effects.time_stretch(y, rate=rate),
                              n_samples_target)

    WIN = int(sr * 0.08)  # 80 ms analysis window
    HOP = WIN // 2
    N = max(1, (len(y) - WIN) // HOP + 1)

    # Output buffer sized generously
    out_buf = np.zeros(int(n_samples_target * 2 + sr), dtype=np.float32)
    write_pos = 0
    for i in range(N):
        t = i / max(N - 1, 1)  # 0 → 1
        bpm_now = bpm_start + (bpm_end - bpm_start) * t
        # NOTE(review): rate is bpm_start / bpm_now. librosa documents
        # rate > 1 as a speed-up, so a rising tempo yields rate < 1 here —
        # confirm the intended direction.
        rate = float(np.clip(bpm_start / max(bpm_now, 1), 0.5, 2.0))
        r_start = i * HOP
        r_end = min(r_start + WIN, len(y))
        chunk = y[r_start:r_end]
        if len(chunk) < 32:
            # Too short to stretch meaningfully; leave silence in its place.
            continue
        stretched = librosa.effects.time_stretch(chunk, rate=rate)
        # Hann envelope for smooth overlap-add (avoids clicks between chunks)
        env = np.hanning(len(stretched)).astype(np.float32)
        stretched = (stretched * env).astype(np.float32)
        w_end = write_pos + len(stretched)
        if w_end > len(out_buf):
            # Grow the buffer by the overshoot plus one second of slack.
            extra = w_end - len(out_buf) + sr
            out_buf = np.concatenate([out_buf, np.zeros(extra, dtype=np.float32)])
        out_buf[write_pos:w_end] += stretched
        # NOTE(review): the write cursor advances by the fixed HOP regardless
        # of rate, so chunk spacing in the output equals chunk spacing in the
        # input — confirm this yields the intended tempo change.
        write_pos += HOP  # advance by HOP (overlap-add)
    result = out_buf[:write_pos + WIN]
    return _fit_to_length(result, n_samples_target)
# ─────────────────────────────────────────────
# EQ / FILTERS
# ─────────────────────────────────────────────
def lowpass_filter(y, sr, cutoff_hz=300, order=4):
    """Zero-phase Butterworth low-pass.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.
        cutoff_hz: Cut-off frequency in Hz.
        order: Order of the Butterworth design passed to ``scipy.signal.butter``.

    Returns:
        The filtered signal as float32. When ``cutoff_hz`` is at or above the
        Nyquist frequency no filter is designed and ``y`` itself is returned.
    """
    nyquist = sr / 2.0
    if cutoff_hz >= nyquist:
        return y
    coeffs = scipy.signal.butter(order, cutoff_hz / nyquist, btype='low')
    # filtfilt runs the filter forwards and backwards, cancelling phase shift.
    filtered = scipy.signal.filtfilt(*coeffs, y)
    return filtered.astype(np.float32)
def highpass_filter(y, sr, cutoff_hz=300, order=4):
    """Zero-phase Butterworth high-pass.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.
        cutoff_hz: Cut-off frequency in Hz.
        order: Order of the Butterworth design passed to ``scipy.signal.butter``.

    Returns:
        The filtered signal as float32. When ``cutoff_hz`` is at or above the
        Nyquist frequency nothing representable lies above the cut-off, so a
        silent (all-zero) float32 array of the same shape is returned.
    """
    nyq = sr / 2.0
    if cutoff_hz >= nyq:
        # Returning the input here would pass the whole signal through and,
        # combined with a low-pass at the same cut-off, double it.
        return np.zeros(np.shape(y), dtype=np.float32)
    b, a = scipy.signal.butter(order, cutoff_hz / nyq, btype='high')
    # filtfilt runs the filter forwards and backwards, cancelling phase shift.
    return scipy.signal.filtfilt(b, a, y).astype(np.float32)
def make_crossfade_curve(length, style="smooth"):
    """Build complementary fade-out / fade-in gain curves.

    Args:
        length: Number of samples in each curve; 0 yields two empty arrays.
        style: ``"smooth"`` selects a raised-cosine curve; any other value
            selects an exponential curve.

    Returns:
        ``(fade_out, fade_in)`` as float32 arrays whose element-wise sum is 1.
        The smooth fade-out runs from 1 to 0. The exponential fade-out starts
        at 1 and decays to exp(-4) (about 0.018), so it does not reach zero.
    """
    if style == "smooth":
        cos_t = np.cos(np.linspace(0, np.pi, length))
        fade_out = 0.5 * (1 + cos_t)
        fade_in = 0.5 * (1 - cos_t)
    else:
        x = np.linspace(0, 1, length)
        # exp(-4 * 0) is exactly 1, so the curve already starts at unity and
        # needs no normalisation by its first element.
        fade_out = np.exp(-4 * x)
        fade_in = 1.0 - fade_out
    return fade_out.astype(np.float32), fade_in.astype(np.float32)
def eq_crossfade(seg_a, seg_b, sr, fade_out, fade_in):
    """Crossfade two equal-length segments after a bass / mids band split.

    Each segment is split with ``lowpass_filter`` and ``highpass_filter`` at
    their default cut-off. Track A's bands are weighted by ``fade_out`` and
    track B's bands by ``fade_in``; the two band mixes are then summed.

    NOTE(review): both bands currently share the same pair of gain curves, so
    the bass is not swapped earlier or later than the mids.

    Args:
        seg_a: Outgoing segment.
        seg_b: Incoming segment.
        sr: Sample rate of both segments.
        fade_out: Per-sample gain applied to ``seg_a``'s bands.
        fade_in: Per-sample gain applied to ``seg_b``'s bands.

    Returns:
        The blended segment as float32.
    """
    low_a, high_a = lowpass_filter(seg_a, sr), highpass_filter(seg_a, sr)
    low_b, high_b = lowpass_filter(seg_b, sr), highpass_filter(seg_b, sr)
    low_mix = low_a * fade_out + low_b * fade_in
    high_mix = high_a * fade_out + high_b * fade_in
    return (low_mix + high_mix).astype(np.float32)
# ─────────────────────────────────────────────
# MAIN MIX ENGINE
# ─────────────────────────────────────────────
def automix(file_a_path, file_b_path, transition_sec=10, mix_style="Smooth Mix"):
    """Mix track A into track B and write the result to a WAV file.

    Pipeline: load and normalize both tracks, detect tempo and key, pitch
    shift track B towards track A's key, choose a transition start in track
    A, tempo-morph both transition zones, crossfade them, and append the
    remainder of track B.

    Args:
        file_a_path: Path of the outgoing track.
        file_b_path: Path of the incoming track.
        transition_sec: Transition length in seconds.
        mix_style: Style label; any value containing "Aggressive" selects the
            exponential crossfade, everything else the smooth one.

    Returns:
        ``(out_path, bpm_a, bpm_b, key_a, key_b, shift, trans_time_sec)``:
        the written WAV path, both detected tempos, both detected key roots,
        the semitone distance from key A to key B (track B is shifted by the
        negative of this value), and the transition start in seconds.

    Raises:
        ValueError: If either track decodes to zero samples.
    """
    SR = 44100

    # 1 ── Load & normalize ───────────────────────────────────────────────
    y_a, _ = load_audio(file_a_path, SR)
    y_b, _ = load_audio(file_b_path, SR)
    if len(y_a) == 0 or len(y_b) == 0:
        raise ValueError("Both tracks must contain audio samples.")
    y_a = normalize(y_a)
    y_b = normalize(y_b)

    # 2 ── BPM detection ──────────────────────────────────────────────────
    bpm_a = detect_bpm(y_a, SR)
    bpm_b = detect_bpm(y_b, SR)

    # 3 ── Key / pitch correction on track B ──────────────────────────────
    key_a, _ = detect_key(y_a, SR)
    key_b, _ = detect_key(y_b, SR)
    shift = semitones_to_shift(key_a, key_b)
    if abs(shift) > 0:
        # shift is measured from A to B, so B moves by -shift to land on A.
        y_b = pitch_shift(y_b, SR, -shift)
        y_b = normalize(y_b)

    # 4 ── Smart transition point (downbeat in a breakdown) ────────────────
    trans_samples = int(transition_sec * SR)
    trans_start = find_best_transition_point(y_a, SR, transition_sec)
    # Keep the whole transition inside track A where the track is long enough.
    trans_start = int(np.clip(trans_start, 0,
                              max(0, len(y_a) - trans_samples - 1)))
    trans_end = trans_start + trans_samples

    # 5 ── Gradual BPM morph segments ─────────────────────────────────────
    #
    # zone_a : Track A's outro — tempo morphs FROM bpm_a TO bpm_b
    #          so it meets track B at the same speed mid-transition
    #
    # zone_b : Track B's intro — tempo morphs FROM bpm_b TO bpm_a
    #          (mirror of zone_a, so both tracks align at the midpoint)
    #          After the transition, track B continues at its natural bpm_b
    #          (we just play the unmodified tail)
    #
    seg_a_raw = y_a[trans_start:trans_end]
    zone_a = build_gradual_stretch(seg_a_raw, SR,
                                   bpm_start=bpm_a,
                                   bpm_end=bpm_b,
                                   n_samples_target=trans_samples)
    if len(y_b) < trans_samples:
        seg_b_raw = np.pad(y_b, (0, trans_samples - len(y_b)))
    else:
        seg_b_raw = y_b[:trans_samples]
    zone_b = build_gradual_stretch(seg_b_raw, SR,
                                   bpm_start=bpm_b,
                                   bpm_end=bpm_a,
                                   n_samples_target=trans_samples)

    # Track B tail plays at its ORIGINAL natural tempo (no stretching)
    tail_b = y_b[trans_samples:] if len(y_b) > trans_samples else np.array([], dtype=np.float32)

    # 6 ── EQ crossfade blend ─────────────────────────────────────────────
    style_key = "aggressive" if "Aggressive" in mix_style else "smooth"
    fade_out, fade_in = make_crossfade_curve(trans_samples, style_key)
    blend = eq_crossfade(zone_a, zone_b, SR, fade_out, fade_in)

    # 7 ── Assemble final mix ──────────────────────────────────────────────
    pre_a = y_a[:trans_start]
    mix = np.concatenate([pre_a, blend, tail_b])

    # 8 ── Polish ─────────────────────────────────────────────────────────
    mix = anti_click_window(mix)
    mix = normalize(mix, headroom_db=-0.5)

    # Reserve a unique file per call: a single fixed name in the shared temp
    # directory would let concurrent requests overwrite each other's output.
    # The handle is closed before soundfile reopens the path by name.
    with tempfile.NamedTemporaryFile(prefix="automix_", suffix=".wav",
                                     delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, mix, SR, subtype='PCM_16')

    trans_time_sec = trans_start / SR
    return out_path, bpm_a, bpm_b, key_a, key_b, shift, trans_time_sec
# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────
# Pitch-class names indexed 0-11; run_mix uses the key indices returned by
# automix to look up the label shown in the stats panel.
KEY_NAMES = ["C","C#","D","D#","E","F","F#","G","G#","A","A#","B"]
# Stylesheet for the Gradio page. It is injected verbatim inside a <style>
# element when the UI is built, so everything in this string is runtime text.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=DM+Mono:wght@300;400;500&display=swap');
:root {
--bg: #0a0a0f;
--surface: #111118;
--border: #1f1f2e;
--accent1: #ff3cac;
--accent2: #00f0ff;
--accent3: #7928ca;
--text: #e8e8f0;
--muted: #6b6b80;
--radius: 12px;
}
body, .gradio-container {
background: var(--bg) !important;
font-family: 'DM Mono', monospace !important;
color: var(--text) !important;
}
.app-header {
text-align: center;
padding: 48px 20px 32px;
position: relative;
}
.app-header::before {
content: '';
position: absolute;
inset: 0;
background: radial-gradient(ellipse 80% 60% at 50% 0%, rgba(121,40,202,.18) 0%, transparent 70%);
pointer-events: none;
}
.app-title {
font-family: 'Syne', sans-serif !important;
font-size: clamp(2.2rem, 5vw, 3.8rem) !important;
font-weight: 800 !important;
letter-spacing: -1px;
background: linear-gradient(135deg, var(--accent1) 0%, var(--accent2) 55%, var(--accent3) 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
margin: 0 0 8px;
}
.app-sub {
color: var(--muted);
font-size: .85rem;
letter-spacing: .12em;
text-transform: uppercase;
}
.panel {
background: var(--surface);
border: 1px solid var(--border);
border-radius: var(--radius);
padding: 24px;
}
.mix-btn {
background: linear-gradient(135deg, var(--accent1), var(--accent3)) !important;
border: none !important;
color: #fff !important;
font-family: 'Syne', sans-serif !important;
font-weight: 700 !important;
font-size: 1rem !important;
letter-spacing: .08em !important;
padding: 14px 36px !important;
border-radius: 8px !important;
cursor: pointer !important;
transition: opacity .2s, transform .15s !important;
}
.mix-btn:hover { opacity: .88; transform: translateY(-1px); }
.stats-box {
background: #0d0d14 !important;
border: 1px solid var(--border) !important;
border-radius: 8px !important;
padding: 16px 20px !important;
font-size: .78rem !important;
color: var(--muted) !important;
line-height: 1.8 !important;
white-space: pre-wrap !important;
}
.gr-block, .gr-box { background: var(--surface) !important; border-color: var(--border) !important; }
label { color: var(--muted) !important; font-size: .78rem !important; letter-spacing: .1em !important; text-transform: uppercase !important; }
input[type=range] { accent-color: var(--accent1); }
select, .gr-dropdown { background: #0d0d14 !important; color: var(--text) !important; border-color: var(--border) !important; }
audio { width: 100%; border-radius: 8px; }
.divider { border: none; border-top: 1px solid var(--border); margin: 24px 0; }
"""
def run_mix(file_a, file_b, transition_sec, mix_style):
    """Gradio callback: run the mixer and format a stats report.

    Args:
        file_a: File path of the outgoing track, or None if not uploaded.
        file_b: File path of the incoming track, or None if not uploaded.
        transition_sec: Transition length from the slider; truncated to int.
        mix_style: Style label from the dropdown.

    Returns:
        ``(audio_path, text)``. On success ``audio_path`` is the mixed WAV and
        ``text`` the stats report. If a track is missing or mixing fails,
        ``audio_path`` is None and ``text`` carries the warning or the error
        message followed by the traceback.
    """
    if file_a is None or file_b is None:
        return None, "⚠ Please upload both tracks."
    try:
        out_path, bpm_a, bpm_b, key_a, key_b, shift, trans_sec = automix(
            file_a, file_b,
            transition_sec=int(transition_sec),
            mix_style=mix_style
        )
        mm = int(trans_sec // 60)
        ss = int(trans_sec % 60)
        # automix reports the distance from key A to key B and shifts track B
        # by the opposite amount; show the shift that was actually applied.
        applied_shift = -shift
        stats = (
            f"TRACK A → BPM: {bpm_a:.1f} KEY: {KEY_NAMES[key_a]}\n"
            f"TRACK B → BPM: {bpm_b:.1f} KEY: {KEY_NAMES[key_b]}\n"
            f"─────────────────────────────────────────\n"
            f"TRANSITION START : {mm:02d}:{ss:02d} (auto-detected downbeat)\n"
            f"TRANSITION LENGTH : {int(transition_sec)} seconds\n"
            f"MIX STYLE : {mix_style.upper()}\n"
            f"BPM MORPH : {bpm_a:.1f} ↔ {bpm_b:.1f} (gradual during transition)\n"
            f" Track B reverts to {bpm_b:.1f} BPM after mix\n"
            f"PITCH CORRECTION : {'+' if applied_shift >= 0 else ''}{applied_shift} semitones\n"
            f"OUTPUT : 44100 Hz · 16-bit PCM WAV"
        )
        return out_path, stats
    except Exception as e:
        # Top-level UI boundary: surface the failure in the stats box instead
        # of letting the callback raise.
        import traceback
        return None, f"Error: {str(e)}\n\n{traceback.format_exc()}"
# Page layout: header, two upload columns, controls row, output row, footer.
# User-facing text below uses the intended characters (·, —, ▶) in place of
# the mis-encoded byte sequences that were previously rendered in the page.
with gr.Blocks(title="AI AutoMix DJ") as demo:
    # Inject the stylesheet as a <style> element at the top of the page.
    gr.HTML("<style>" + CSS + "</style>")
    gr.HTML("""
<div class="app-header">
<div class="app-title">AI AutoMix DJ</div>
<div class="app-sub">Gradual BPM Morph · Smart Downbeat Detection · EQ Harmonic Mixing</div>
</div>
""")
    # Track uploads; type="filepath" hands run_mix a path for each track.
    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML('<div style="color:#6b6b80;font-size:.75rem;text-transform:uppercase;'
                    'letter-spacing:.1em;margin-bottom:6px;">Track A — Outgoing</div>')
            file_a = gr.Audio(label="", type="filepath", elem_classes=["panel"])
        with gr.Column(scale=1):
            gr.HTML('<div style="color:#6b6b80;font-size:.75rem;text-transform:uppercase;'
                    'letter-spacing:.1em;margin-bottom:6px;">Track B — Incoming</div>')
            file_b = gr.Audio(label="", type="filepath", elem_classes=["panel"])
    gr.HTML('<hr class="divider"/>')
    # Mix controls.
    with gr.Row():
        with gr.Column(scale=2):
            transition_slider = gr.Slider(minimum=5, maximum=15, value=10, step=1,
                                          label="Transition Duration (seconds)")
        with gr.Column(scale=2):
            mix_style = gr.Dropdown(choices=["Smooth Mix", "Aggressive Mix"],
                                    value="Smooth Mix", label="Mix Style")
        with gr.Column(scale=1):
            mix_btn = gr.Button("▶ Mix Now", elem_classes=["mix-btn"])
    gr.HTML('<hr class="divider"/>')
    # Results: rendered mix on the left, analysis text on the right.
    with gr.Row():
        with gr.Column(scale=3):
            gr.HTML('<div style="color:#6b6b80;font-size:.75rem;text-transform:uppercase;'
                    'letter-spacing:.1em;margin-bottom:8px;">Output Mix</div>')
            output_audio = gr.Audio(label="", type="filepath")
        with gr.Column(scale=2):
            gr.HTML('<div style="color:#6b6b80;font-size:.75rem;text-transform:uppercase;'
                    'letter-spacing:.1em;margin-bottom:8px;">Mix Analysis</div>')
            stats_box = gr.Textbox(value="Waiting for mix…", label="",
                                   lines=9, max_lines=12, elem_classes=["stats-box"])
    # run_mix returns (audio path, stats text), matching the outputs order.
    mix_btn.click(fn=run_mix,
                  inputs=[file_a, file_b, transition_slider, mix_style],
                  outputs=[output_audio, stats_box])
    gr.HTML("""
<div style="text-align:center;margin-top:32px;color:#2a2a3a;font-size:.72rem;letter-spacing:.12em;">
AI AUTOMIX DJ · GRADUAL BPM MORPH ENGINE · LIBROSA · GRADIO · SCIPY
</div>
""")

if __name__ == "__main__":
    demo.launch()