"""AI AutoMix DJ: blend two tracks with key matching and an EQ crossfade.

The module loads two audio files, estimates tempo and key for each,
pitch-shifts track B towards track A's key, picks a mix-out point in
track A, crossfades the two tracks over a configurable window, and
exposes the whole pipeline through a small Gradio UI.
"""

import os
import tempfile
import traceback

import gradio as gr
import librosa
import numpy as np
import scipy.signal
import soundfile as sf


# ─────────────────────────────────────────────
# AUDIO I/O & BASIC PROCESSING
# ─────────────────────────────────────────────

def load_audio(path, target_sr=44100):
    """Load *path* as mono float32 audio resampled to *target_sr*.

    Returns:
        ``(samples, sample_rate)`` as returned by ``librosa.load``.
    """
    y, sr = librosa.load(path, sr=target_sr, mono=True)
    return y.astype(np.float32), sr


def normalize(y, headroom_db=-1.0):
    """Scale *y* so its absolute peak sits at *headroom_db* dBFS.

    Empty or (near-)silent input is returned unchanged, because there is
    no meaningful peak to scale against.
    """
    if np.size(y) == 0:
        # np.max raises on an empty array.
        return y
    peak = np.max(np.abs(y))
    if peak < 1e-9:
        return y
    target = 10 ** (headroom_db / 20.0)
    return y * (target / peak)


def anti_click_window(y, ramp_samples=256):
    """Return a copy of *y* with a linear fade-in and fade-out applied.

    The ramp length is clamped to half the signal length so that signals
    shorter than ``2 * ramp_samples`` are still handled without a shape
    mismatch.
    """
    out = y.copy()
    n = min(int(ramp_samples), len(out) // 2)
    if n <= 0:
        return out
    ramp = np.linspace(0, 1, n).astype(np.float32)
    out[:n] *= ramp
    out[-n:] *= ramp[::-1]
    return out


# ─────────────────────────────────────────────
# BPM DETECTION
# ─────────────────────────────────────────────

def detect_bpm(y, sr):
    """Return the tempo estimate of ``librosa.beat.beat_track`` as a float."""
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    # beat_track may return a scalar or an array; take the first element.
    return float(np.asarray(tempo).flatten()[0])


# ─────────────────────────────────────────────
# KEY DETECTION
# ─────────────────────────────────────────────

def detect_key(y, sr):
    """Estimate the key of *y* by template matching on its mean chroma.

    The time-averaged CQT chroma vector is correlated against binary
    major and minor scale templates rotated to each of the 12 pitch
    classes.

    Returns:
        ``(pitch_class_index, mode)`` where ``mode`` is ``"major"`` or
        ``"minor"``. Ties between the best major and minor scores go to
        major.
    """
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    chroma_mean = chroma.mean(axis=1)
    major_t = np.array([1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1], dtype=float)
    minor_t = np.array([1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0], dtype=float)
    maj_s = [np.dot(np.roll(major_t, i), chroma_mean) for i in range(12)]
    min_s = [np.dot(np.roll(minor_t, i), chroma_mean) for i in range(12)]
    if max(maj_s) >= max(min_s):
        return int(np.argmax(maj_s)), "major"
    return int(np.argmax(min_s)), "minor"


def semitones_to_shift(key_a, key_b):
    """Return the signed semitone distance from *key_a* to *key_b*.

    The result is wrapped into the range ``-5 .. 6`` so the shorter
    direction around the circle of 12 pitch classes is used.
    """
    diff = (key_b - key_a) % 12
    if diff > 6:
        diff -= 12
    return diff


def pitch_shift(y, sr, n_steps):
    """Pitch-shift *y* by *n_steps* semitones; a zero shift is a no-op."""
    if n_steps == 0:
        return y
    return librosa.effects.pitch_shift(y=y, sr=sr, n_steps=float(n_steps))


# ─────────────────────────────────────────────
# SMART TRANSITION POINT (downbeat + breakdown)
# ─────────────────────────────────────────────

def find_best_transition_point(y_a, sr, transition_sec, hop=512):
    """Score every frame of track A and pick the best mix-out point.

    Criteria:
      • Low smoothed RMS energy → quieter passages score higher
      • Low onset strength      → frames away from strong onsets score higher
      • Beat-aligned            → only every 4th detected beat is a candidate
      • Position                → window from 35 % of the track up to
                                  (end − transition − 4 s), weighted by a
                                  Gaussian centred at 55 % of the track

    If that window is empty (the track is too short for it), a fallback
    window just before the last possible transition start is used.

    Returns:
        The chosen position as a sample index (frame index × *hop*).
    """
    rms = librosa.feature.rms(y=y_a, hop_length=hop)[0]
    onset = librosa.onset.onset_strength(y=y_a, sr=sr, hop_length=hop)
    total_f = len(rms)
    trans_f = int(transition_sec * sr / hop)

    # Smooth energy over 30 frames to detect macro breakdowns. The kernel
    # is capped at the frame count because np.convolve(mode='same')
    # returns max(len(a), len(v)) samples, which would otherwise no
    # longer match `onset` for very short inputs.
    k = max(1, min(30, total_f))
    smooth_rms = np.convolve(rms, np.ones(k) / k, mode='same')

    # Score: favour low energy AND low onset density
    score = (1.0 / (smooth_rms + 1e-6)) * (1.0 / (onset + 1e-6))

    # Position window. The raw end is tested *before* clamping; clamping
    # first (as the code previously did) made the fallback unreachable.
    w_start = int(total_f * 0.35)
    w_end = total_f - trans_f - int(4 * sr / hop)
    if w_end <= w_start:
        w_start = max(0, total_f - trans_f - 20)
        w_end = max(w_start + 1, total_f - trans_f)

    # Gaussian position weight centred at 55 % of track
    centre = int(total_f * 0.55)
    sigma = total_f * 0.12
    pos_w = np.exp(-0.5 * ((np.arange(total_f) - centre) / sigma) ** 2)
    score = score * pos_w
    score[:w_start] = 0
    score[w_end:] = 0

    # Restrict to every 4th beat.
    # NOTE(review): this presumes the first detected beat is a downbeat —
    # beat_track does not guarantee that; confirm or add bar detection.
    _, beat_frames = librosa.beat.beat_track(y=y_a, sr=sr, hop_length=hop)
    downbeats = beat_frames[::4]
    candidates = [int(f) for f in downbeats if w_start <= f < w_end]

    if candidates:
        best_frame = max(candidates, key=lambda f: score[min(f, total_f - 1)])
    else:
        best_frame = int(np.argmax(score))
    return int(best_frame) * hop


# ─────────────────────────────────────────────
# GRADUAL BPM MORPH ENGINE
# ─────────────────────────────────────────────

def _fit_to_length(y, n):
    """Trim or zero-pad array to exactly n samples (returned as float32)."""
    n = int(n)
    if len(y) >= n:
        return y[:n].astype(np.float32)
    return np.pad(y, (0, n - len(y))).astype(np.float32)


def build_gradual_stretch(y, sr, bpm_start, bpm_end, n_samples_target):
    """Produce a segment of exactly *n_samples_target* samples from *y*.

    Method: overlap-add of short (80 ms) Hann-windowed chunks, each
    independently time-stretched by a rate interpolated linearly between
    the start and end tempo. When the two tempi differ by less than
    0.5 BPM (or *bpm_end* is below 1), a single constant-rate stretch of
    the whole segment is used instead.

    NOTE(review): chunks are written at a fixed spacing of HOP samples
    regardless of their stretched length, so the output timeline appears
    to track the input timeline 1:1 — the per-chunk stretch changes the
    overlap, not the overall tempo. The rate expression
    (bpm_start / bpm_now) also looks inverted relative to
    librosa.effects.time_stretch, where rate > 1 speeds audio up.
    Changing either would alter how the zones line up with the
    unmodified tail of track B in automix(), so both are left as-is;
    confirm the intended behaviour before changing.
    """
    if abs(bpm_start - bpm_end) < 0.5 or bpm_end < 1:
        rate = float(np.clip(bpm_start / max(bpm_end, 1), 0.5, 2.0))
        return _fit_to_length(librosa.effects.time_stretch(y, rate=rate),
                              n_samples_target)

    WIN = int(sr * 0.08)  # 80 ms analysis window
    HOP = WIN // 2
    N = max(1, (len(y) - WIN) // HOP + 1)

    # Output buffer sized generously; grown on demand below.
    out_buf = np.zeros(int(n_samples_target * 2 + sr), dtype=np.float32)
    write_pos = 0

    for i in range(N):
        t = i / max(N - 1, 1)  # 0 → 1
        bpm_now = bpm_start + (bpm_end - bpm_start) * t
        rate = float(np.clip(bpm_start / max(bpm_now, 1), 0.5, 2.0))

        r_start = i * HOP
        r_end = min(r_start + WIN, len(y))
        chunk = y[r_start:r_end]
        if len(chunk) < 32:
            continue

        stretched = librosa.effects.time_stretch(chunk, rate=rate)

        # Hann envelope for smooth overlap-add (avoids clicks between chunks)
        env = np.hanning(len(stretched)).astype(np.float32)
        stretched = (stretched * env).astype(np.float32)

        w_end = write_pos + len(stretched)
        if w_end > len(out_buf):
            extra = w_end - len(out_buf) + sr
            out_buf = np.concatenate([out_buf, np.zeros(extra, dtype=np.float32)])

        out_buf[write_pos:w_end] += stretched
        write_pos += HOP  # advance by HOP (overlap-add)

    result = out_buf[:write_pos + WIN]
    return _fit_to_length(result, n_samples_target)


# ─────────────────────────────────────────────
# EQ / FILTERS
# ─────────────────────────────────────────────

def lowpass_filter(y, sr, cutoff_hz=300, order=4):
    """Zero-phase Butterworth low-pass; returns *y* untouched if the
    cutoff is at or above the Nyquist frequency."""
    nyq = sr / 2.0
    if cutoff_hz >= nyq:
        return y
    b, a = scipy.signal.butter(order, cutoff_hz / nyq, btype='low')
    return scipy.signal.filtfilt(b, a, y).astype(np.float32)


def highpass_filter(y, sr, cutoff_hz=300, order=4):
    """Zero-phase Butterworth high-pass; returns *y* untouched if the
    cutoff is at or above the Nyquist frequency."""
    nyq = sr / 2.0
    if cutoff_hz >= nyq:
        return y
    b, a = scipy.signal.butter(order, cutoff_hz / nyq, btype='high')
    return scipy.signal.filtfilt(b, a, y).astype(np.float32)


def make_crossfade_curve(length, style="smooth"):
    """Return ``(fade_out, fade_in)`` float32 gain curves of *length* samples.

    ``"smooth"`` gives raised-cosine curves that sum to 1; any other
    style gives exponential curves (``exp(-4x)`` and ``1 - exp(-4x)``).
    The exponential curves end at about 0.018 and 0.982 respectively
    rather than exactly 0 and 1.
    """
    if style == "smooth":
        t = np.linspace(0, np.pi, length)
        fade_out = 0.5 * (1 + np.cos(t))
        fade_in = 0.5 * (1 - np.cos(t))
    else:
        x = np.linspace(0, 1, length)
        fade_out = np.exp(-4 * x)
        fade_out /= fade_out[0]
        fade_in = 1.0 - np.exp(-4 * x)
    return fade_out.astype(np.float32), fade_in.astype(np.float32)


def eq_crossfade(seg_a, seg_b, sr, fade_out, fade_in):
    """Crossfade two equal-length segments after splitting each into a
    low band and a high band at the filters' default cutoff.

    NOTE(review): both bands currently use the same fade curves, so the
    band split does not yet give the bass a different fade from the
    mids — confirm whether separate bass curves were intended.
    """
    bass_a = lowpass_filter(seg_a, sr)
    mids_a = highpass_filter(seg_a, sr)
    bass_b = lowpass_filter(seg_b, sr)
    mids_b = highpass_filter(seg_b, sr)
    return ((bass_a * fade_out + bass_b * fade_in) +
            (mids_a * fade_out + mids_b * fade_in)).astype(np.float32)


# ─────────────────────────────────────────────
# MAIN MIX ENGINE
# ─────────────────────────────────────────────

def automix(file_a_path, file_b_path, transition_sec=10, mix_style="Smooth Mix"):
    """Mix track A into track B and write the result to a WAV file.

    Args:
        file_a_path: Path of the outgoing track.
        file_b_path: Path of the incoming track.
        transition_sec: Length of the crossfade zone in seconds.
        mix_style: Any string containing ``"Aggressive"`` selects the
            exponential crossfade; everything else uses the smooth one.

    Returns:
        ``(out_path, bpm_a, bpm_b, key_a, key_b, shift, trans_time_sec)``
        where ``shift`` is the semitone distance from A's key to B's key
        (track B is pitch-shifted by ``-shift``) and ``trans_time_sec``
        is the transition start within track A, in seconds.
    """
    SR = 44100

    # 1 ── Load & normalize ───────────────────────────────────────────────
    y_a, _ = load_audio(file_a_path, SR)
    y_b, _ = load_audio(file_b_path, SR)
    y_a = normalize(y_a)
    y_b = normalize(y_b)

    # 2 ── BPM detection ──────────────────────────────────────────────────
    bpm_a = detect_bpm(y_a, SR)
    bpm_b = detect_bpm(y_b, SR)

    # 3 ── Key / pitch correction on track B ──────────────────────────────
    key_a, _ = detect_key(y_a, SR)
    key_b, _ = detect_key(y_b, SR)
    shift = semitones_to_shift(key_a, key_b)
    if abs(shift) > 0:
        y_b = pitch_shift(y_b, SR, -shift)
        y_b = normalize(y_b)

    # 4 ── Smart transition point (downbeat in a breakdown) ────────────────
    trans_samples = int(transition_sec * SR)
    trans_start = find_best_transition_point(y_a, SR, transition_sec)
    # Keep the whole transition zone inside track A.
    trans_start = int(np.clip(trans_start, 0, max(0, len(y_a) - trans_samples - 1)))
    trans_end = trans_start + trans_samples

    # 5 ── Gradual BPM morph segments ─────────────────────────────────────
    #
    #   zone_a : Track A's outro, processed from bpm_a towards bpm_b
    #   zone_b : Track B's intro, processed from bpm_b towards bpm_a
    #            (mirror of zone_a)
    #   After the transition, track B continues as its unmodified tail.
    #
    seg_a_raw = y_a[trans_start:trans_end]
    zone_a = build_gradual_stretch(seg_a_raw, SR,
                                   bpm_start=bpm_a, bpm_end=bpm_b,
                                   n_samples_target=trans_samples)

    if len(y_b) < trans_samples:
        seg_b_raw = np.pad(y_b, (0, trans_samples - len(y_b)))
    else:
        seg_b_raw = y_b[:trans_samples]
    zone_b = build_gradual_stretch(seg_b_raw, SR,
                                   bpm_start=bpm_b, bpm_end=bpm_a,
                                   n_samples_target=trans_samples)

    # Track B tail plays at its ORIGINAL natural tempo (no stretching)
    if len(y_b) > trans_samples:
        tail_b = y_b[trans_samples:]
    else:
        tail_b = np.array([], dtype=np.float32)

    # 6 ── EQ crossfade blend ─────────────────────────────────────────────
    style_key = "aggressive" if "Aggressive" in mix_style else "smooth"
    fade_out, fade_in = make_crossfade_curve(trans_samples, style_key)
    blend = eq_crossfade(zone_a, zone_b, SR, fade_out, fade_in)

    # 7 ── Assemble final mix ──────────────────────────────────────────────
    pre_a = y_a[:trans_start]
    mix = np.concatenate([pre_a, blend, tail_b])

    # 8 ── Polish ─────────────────────────────────────────────────────────
    mix = anti_click_window(mix)
    mix = normalize(mix, headroom_db=-0.5)

    # NOTE(review): the output path is fixed, so overlapping requests
    # would write to the same file — confirm the deployment serialises
    # calls or switch to a unique temp file.
    out_path = os.path.join(tempfile.gettempdir(), "automix_output.wav")
    sf.write(out_path, mix, SR, subtype='PCM_16')

    trans_time_sec = trans_start / SR
    return out_path, bpm_a, bpm_b, key_a, key_b, shift, trans_time_sec


# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────

# Pitch-class index → note name (index 0 is C).
KEY_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

CSS = """
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=DM+Mono:wght@300;400;500&display=swap');

:root {
    --bg: #0a0a0f;
    --surface: #111118;
    --border: #1f1f2e;
    --accent1: #ff3cac;
    --accent2: #00f0ff;
    --accent3: #7928ca;
    --text: #e8e8f0;
    --muted: #6b6b80;
    --radius: 12px;
}
body, .gradio-container {
    background: var(--bg) !important;
    font-family: 'DM Mono', monospace !important;
    color: var(--text) !important;
}
.app-header {
    text-align: center;
    padding: 48px 20px 32px;
    position: relative;
}
.app-header::before {
    content: '';
    position: absolute;
    inset: 0;
    background: radial-gradient(ellipse 80% 60% at 50% 0%, rgba(121,40,202,.18) 0%, transparent 70%);
    pointer-events: none;
}
.app-title {
    font-family: 'Syne', sans-serif !important;
    font-size: clamp(2.2rem, 5vw, 3.8rem) !important;
    font-weight: 800 !important;
    letter-spacing: -1px;
    background: linear-gradient(135deg, var(--accent1) 0%, var(--accent2) 55%, var(--accent3) 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    margin: 0 0 8px;
}
.app-sub {
    color: var(--muted);
    font-size: .85rem;
    letter-spacing: .12em;
    text-transform: uppercase;
}
.panel {
    background: var(--surface);
    border: 1px solid var(--border);
    border-radius: var(--radius);
    padding: 24px;
}
.mix-btn {
    background: linear-gradient(135deg, var(--accent1), var(--accent3)) !important;
    border: none !important;
    color: #fff !important;
    font-family: 'Syne', sans-serif !important;
    font-weight: 700 !important;
    font-size: 1rem !important;
    letter-spacing: .08em !important;
    padding: 14px 36px !important;
    border-radius: 8px !important;
    cursor: pointer !important;
    transition: opacity .2s, transform .15s !important;
}
.mix-btn:hover { opacity: .88; transform: translateY(-1px); }
.stats-box {
    background: #0d0d14 !important;
    border: 1px solid var(--border) !important;
    border-radius: 8px !important;
    padding: 16px 20px !important;
    font-size: .78rem !important;
    color: var(--muted) !important;
    line-height: 1.8 !important;
    white-space: pre-wrap !important;
}
.gr-block, .gr-box {
    background: var(--surface) !important;
    border-color: var(--border) !important;
}
label {
    color: var(--muted) !important;
    font-size: .78rem !important;
    letter-spacing: .1em !important;
    text-transform: uppercase !important;
}
input[type=range] { accent-color: var(--accent1); }
select, .gr-dropdown {
    background: #0d0d14 !important;
    color: var(--text) !important;
    border-color: var(--border) !important;
}
audio { width: 100%; border-radius: 8px; }
.divider { border: none; border-top: 1px solid var(--border); margin: 24px 0; }
"""


def run_mix(file_a, file_b, transition_sec, mix_style):
    """Gradio callback: run automix() and format a text report.

    Returns:
        ``(output_path, report)``; on missing input or failure the path
        is ``None`` and the report carries the message.
    """
    if file_a is None or file_b is None:
        return None, "⚠ Please upload both tracks."
    try:
        out_path, bpm_a, bpm_b, key_a, key_b, shift, trans_sec = automix(
            file_a, file_b,
            transition_sec=int(transition_sec),
            mix_style=mix_style,
        )
        mm = int(trans_sec // 60)
        ss = int(trans_sec % 60)
        # automix() shifts track B by -shift semitones, so that is the
        # correction actually applied (the report previously showed the
        # opposite sign).
        applied = -shift
        stats = (
            f"TRACK A → BPM: {bpm_a:.1f} KEY: {KEY_NAMES[key_a]}\n"
            f"TRACK B → BPM: {bpm_b:.1f} KEY: {KEY_NAMES[key_b]}\n"
            f"─────────────────────────────────────────\n"
            f"TRANSITION START : {mm:02d}:{ss:02d} (auto-detected downbeat)\n"
            f"TRANSITION LENGTH : {int(transition_sec)} seconds\n"
            f"MIX STYLE : {mix_style.upper()}\n"
            f"BPM MORPH : {bpm_a:.1f} ↔ {bpm_b:.1f} (gradual during transition)\n"
            f" Track B reverts to {bpm_b:.1f} BPM after mix\n"
            f"PITCH CORRECTION : {'+' if applied >= 0 else ''}{applied} semitones\n"
            f"OUTPUT : 44100 Hz · 16-bit PCM WAV"
        )
        return out_path, stats
    except Exception as e:
        # Top-level UI boundary: surface the failure in the stats box
        # instead of crashing the app.
        return None, f"Error: {str(e)}\n\n{traceback.format_exc()}"


# NOTE(review): the HTML snippets below had lost their markup (several
# were single-quoted literals broken across lines, which does not parse).
# Header and divider markup is rebuilt from the classes CSS declares;
# the plain <div> wrappers for labels and footer are a guess — restore
# the intended markup if it is known.
with gr.Blocks(title="AI AutoMix DJ") as demo:
    # Inject the stylesheet; without this CSS was defined but never applied.
    gr.HTML(f"<style>{CSS}</style>")
    gr.HTML("""
<div class="app-header">
  <div class="app-title">AI AutoMix DJ</div>
  <div class="app-sub">Gradual BPM Morph · Smart Downbeat Detection · EQ Harmonic Mixing</div>
</div>
""")

    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML('<div>Track A — Outgoing</div>')
            file_a = gr.Audio(label="", type="filepath", elem_classes=["panel"])
        with gr.Column(scale=1):
            gr.HTML('<div>Track B — Incoming</div>')
            file_b = gr.Audio(label="", type="filepath", elem_classes=["panel"])

    gr.HTML('<hr class="divider">')

    with gr.Row():
        with gr.Column(scale=2):
            transition_slider = gr.Slider(minimum=5, maximum=15, value=10, step=1,
                                          label="Transition Duration (seconds)")
        with gr.Column(scale=2):
            mix_style = gr.Dropdown(choices=["Smooth Mix", "Aggressive Mix"],
                                    value="Smooth Mix", label="Mix Style")
        with gr.Column(scale=1):
            mix_btn = gr.Button("▶ Mix Now", elem_classes=["mix-btn"])

    gr.HTML('<hr class="divider">')

    with gr.Row():
        with gr.Column(scale=3):
            gr.HTML('<div>Output Mix</div>')
            output_audio = gr.Audio(label="", type="filepath")
        with gr.Column(scale=2):
            gr.HTML('<div>Mix Analysis</div>')
            stats_box = gr.Textbox(value="Waiting for mix…", label="",
                                   lines=9, max_lines=12,
                                   elem_classes=["stats-box"])

    mix_btn.click(fn=run_mix,
                  inputs=[file_a, file_b, transition_slider, mix_style],
                  outputs=[output_audio, stats_box])

    gr.HTML("""
<div>AI AUTOMIX DJ · GRADUAL BPM MORPH ENGINE · LIBROSA · GRADIO · SCIPY</div>
""")


if __name__ == "__main__":
    demo.launch()