Spaces:

rikhoffbauer2
/

drum-sample-extractor

Sleeping

App Files Files Community

rikhoffbauer2 committed 25 days ago

Commit

2a334ed

verified ·

1 Parent(s): d34b37f

Add app.py

Browse files

Files changed (1) hide show

app.py +570 -0

app.py ADDED Viewed

	@@ -0,0 +1,570 @@

+"""
+Gradio UI for Drum Sample Extractor.
+Three tabs:
+1. Extract — Upload audio, run the pipeline, listen to extracted samples
+2. Evaluate — Generate synthetic songs, compare extraction to ground truth
+3. Auto-Optimize — Run autonomous improvement loop with live progress
+"""
+import gradio as gr
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import json
+import time
+import sys
+import os
+import io
+import tempfile
+import soundfile as sf
+import librosa
+import traceback
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from drum_extractor import (
+    extract_drums_demucs, detect_onsets, classify_and_separate_hits,
+    compute_librosa_embeddings, cluster_hits, select_best_representatives,
+    synthesize_from_cluster, DrumCluster,
+)
+from quality_metrics import drum_sample_score, compute_all_reference_metrics
+from synth_generator import generate_test_song
+from evaluation import evaluate_extraction, report_to_dict
+from optimizer import run_optimization_loop, PipelineParams, OptimizerState
+# ─────────────────────────────────────────────────────────────────────────────
+# Helper functions
+# ─────────────────────────────────────────────────────────────────────────────
+def audio_to_tuple(audio: np.ndarray, sr: int) -> tuple:
+    """Convert audio array to Gradio-compatible (sr, data) tuple."""
+    if audio.dtype != np.float32:
+        audio = audio.astype(np.float32)
+    # Normalize to prevent clipping
+    peak = np.abs(audio).max()
+    if peak > 0:
+        audio = audio / peak * 0.95
+    return (sr, audio)
+def make_waveform_plot(audio_dict: dict, sr: int, title: str = "Waveforms") -> plt.Figure:
+    """Create a multi-panel waveform plot."""
+    n = len(audio_dict)
+    if n == 0:
+        fig, ax = plt.subplots(figsize=(10, 2))
+        ax.text(0.5, 0.5, "No audio to display", ha='center', va='center')
+        return fig
+    fig, axes = plt.subplots(n, 1, figsize=(10, max(2, n * 1.5)), squeeze=False)
+    fig.suptitle(title, fontsize=12, fontweight='bold')
+    for idx, (name, audio) in enumerate(audio_dict.items()):
+        ax = axes[idx, 0]
+        t = np.arange(len(audio)) / sr
+        ax.plot(t, audio, linewidth=0.3, color='#2196F3')
+        ax.set_ylabel(name, fontsize=8)
+        ax.set_xlim(0, len(audio) / sr)
+        ax.set_ylim(-1, 1)
+        if idx < n - 1:
+            ax.set_xticklabels([])
+        else:
+            ax.set_xlabel("Time (s)")
+    plt.tight_layout()
+    return fig
+def make_metrics_plot(history: list) -> plt.Figure:
+    """Plot optimization history."""
+    if not history:
+        fig, ax = plt.subplots(figsize=(10, 4))
+        ax.text(0.5, 0.5, "No data yet", ha='center', va='center')
+        return fig
+    iters = [r.iteration for r in history]
+    scores = [r.overall_score for r in history]
+    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
+    fig.suptitle("Optimization Progress", fontsize=14, fontweight='bold')
+    # Overall score
+    ax = axes[0, 0]
+    ax.plot(iters, scores, 'b-o', linewidth=2, markersize=4)
+    ax.set_ylabel("Overall Score")
+    ax.set_title("Overall Score (/100)")
+    ax.grid(True, alpha=0.3)
+    best_idx = np.argmax(scores)
+    ax.scatter([iters[best_idx]], [scores[best_idx]], color='red', s=100, zorder=5, label=f'Best: {scores[best_idx]:.1f}')
+    ax.legend()
+    # SI-SDR
+    ax = axes[0, 1]
+    si_sdrs = [r.eval_report.get('mean_si_sdr', -50) if isinstance(r.eval_report, dict) else -50 for r in history]
+    ax.plot(iters, si_sdrs, 'g-o', linewidth=2, markersize=4)
+    ax.set_ylabel("SI-SDR (dB)")
+    ax.set_title("Mean SI-SDR")
+    ax.grid(True, alpha=0.3)
+    # Sample score
+    ax = axes[1, 0]
+    sample_scores = [r.eval_report.get('mean_sample_score', 0) if isinstance(r.eval_report, dict) else 0 for r in history]
+    ax.plot(iters, sample_scores, 'r-o', linewidth=2, markersize=4)
+    ax.set_ylabel("Sample Score (/100)")
+    ax.set_title("Mean Sample Quality Score")
+    ax.grid(True, alpha=0.3)
+    # Parameter evolution
+    ax = axes[1, 1]
+    thresholds = [r.params.get('energy_threshold_db', -40) for r in history]
+    ax.plot(iters, thresholds, 'm-o', linewidth=2, markersize=4, label='energy_thresh (dB)')
+    ax.set_ylabel("Value")
+    ax.set_title("Parameter Evolution")
+    ax.legend(fontsize=8)
+    ax.grid(True, alpha=0.3)
+    plt.tight_layout()
+    return fig
+# ─────────────────────────────────────────────────────────────────────────────
+# Tab 1: Extract
+# ─────────────────────────────────────────────────────────────────────────────
+def run_extraction(audio_input, progress=gr.Progress()):
+    """Run drum extraction on uploaded audio."""
+    if audio_input is None:
+        return (None,) * 10
+    progress(0.0, desc="Loading audio...")
+    sr_in, data = audio_input
+    data = data.astype(np.float32)
+    if data.ndim > 1:
+        data = data.mean(axis=1)
+    peak = np.abs(data).max()
+    if peak > 0:
+        data = data / peak
+    # Save to temp file for Demucs
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
+        sf.write(f.name, data, sr_in)
+        tmp_path = f.name
+    try:
+        # Stage 1: Demucs
+        progress(0.1, desc="Extracting drum stem (Demucs)...")
+        drums, drums_sr = extract_drums_demucs(tmp_path, device="cpu")
+        # Stage 2: Onsets
+        progress(0.4, desc="Detecting onsets...")
+        hits = detect_onsets(drums, drums_sr)
+        if len(hits) == 0:
+            return (audio_to_tuple(drums, drums_sr),) + (None,) * 9
+        # Stage 3: Classify & separate
+        progress(0.5, desc="Classifying hits...")
+        hits = classify_and_separate_hits(hits, separate_overlaps=True)
+        # Stage 4: Embed & cluster
+        progress(0.6, desc="Clustering similar hits...")
+        embeddings = compute_librosa_embeddings(hits)
+        clusters = cluster_hits(hits, embeddings)
+        # Stage 5: Select best (with quality scoring)
+        progress(0.7, desc="Selecting best representatives...")
+        for cluster in clusters:
+            if cluster.count == 1:
+                cluster.best_hit_idx = 0
+                continue
+            scores = []
+            base_label = cluster.label.rsplit('_', 1)[0]
+            for hit in cluster.hits:
+                score = drum_sample_score(hit.audio, hit.sr, base_label)
+                scores.append(score['total'])
+            cluster.best_hit_idx = int(np.argmax(scores))
+        # Stage 6: Synthesis
+        progress(0.8, desc="Synthesizing optimal samples...")
+        for cluster in clusters:
+            if cluster.count >= 2:
+                cluster.synthesized = synthesize_from_cluster(cluster)
+        progress(0.9, desc="Building results...")
+        # Build outputs
+        drums_out = audio_to_tuple(drums, drums_sr)
+        # Collect up to 8 best samples (sorted by cluster size)
+        sorted_clusters = sorted(clusters, key=lambda c: c.count, reverse=True)[:8]
+        sample_outputs = []
+        for c in sorted_clusters:
+            sample_outputs.append(audio_to_tuple(c.best_hit.audio, c.best_hit.sr))
+        # Pad to 8
+        while len(sample_outputs) < 8:
+            sample_outputs.append(None)
+        # Metrics table
+        rows = []
+        for c in sorted_clusters:
+            best = c.best_hit
+            base_label = c.label.rsplit('_', 1)[0]
+            score = drum_sample_score(best.audio, best.sr, base_label)
+            rows.append({
+                'Cluster': c.label,
+                'Hits': c.count,
+                'Score': f"{score['total']:.1f}",
+                'Completeness': f"{score['completeness']:.2f}",
+                'Cleanness': f"{score['cleanness']:.2f}",
+                'Onset': f"{score['onset_quality']:.2f}",
+                'Duration (ms)': f"{best.duration * 1000:.0f}",
+            })
+        metrics_df = pd.DataFrame(rows)
+        # Waveform plot
+        waveforms = {c.label: c.best_hit.audio for c in sorted_clusters[:6]}
+        fig = make_waveform_plot(waveforms, drums_sr, "Extracted Samples")
+        progress(1.0, desc="Done!")
+        return (drums_out,) + tuple(sample_outputs) + (metrics_df, fig)
+    finally:
+        os.unlink(tmp_path)
+# ─────────────────────────────────────────────────────────────────────────────
+# Tab 2: Evaluate
+# ─────────────────────────────────────────────────────────────────────────────
+def run_evaluation(pattern, bpm, bars, progress=gr.Progress()):
+    """Generate synthetic song, extract, evaluate against ground truth."""
+    progress(0.0, desc="Generating synthetic song...")
+    song = generate_test_song(
+        pattern_name=pattern,
+        bars=int(bars),
+        bpm=float(bpm),
+        variation='medium',
+        seed=42,
+    )
+    progress(0.2, desc="Running extraction pipeline...")
+    hits = detect_onsets(song.drums_only, song.sr)
+    if len(hits) == 0:
+        return None, None, None, None, None, "No hits detected"
+    hits = classify_and_separate_hits(hits, separate_overlaps=True)
+    embeddings = compute_librosa_embeddings(hits)
+    clusters = cluster_hits(hits, embeddings)
+    # Quality-based selection
+    for cluster in clusters:
+        if cluster.count == 1:
+            cluster.best_hit_idx = 0
+            continue
+        scores = []
+        base_label = cluster.label.rsplit('_', 1)[0]
+        for hit in cluster.hits:
+            score = drum_sample_score(hit.audio, hit.sr, base_label)
+            scores.append(score['total'])
+        cluster.best_hit_idx = int(np.argmax(scores))
+    for cluster in clusters:
+        if cluster.count >= 2:
+            cluster.synthesized = synthesize_from_cluster(cluster)
+    progress(0.6, desc="Evaluating against ground truth...")
+    gt_samples = {name: s.audio for name, s in song.samples.items()}
+    gt_hit_map = [
+        {'sample': h.sample_name, 'onset': h.onset_time, 'velocity': h.velocity}
+        for h in song.hits
+    ]
+    report = evaluate_extraction(
+        extracted_clusters=clusters,
+        gt_samples=gt_samples,
+        gt_hit_map=gt_hit_map,
+        sr=song.sr,
+        all_hits=hits,
+    )
+    progress(0.8, desc="Building report...")
+    # Mix audio
+    mix_out = audio_to_tuple(song.mix, song.sr)
+    drums_out = audio_to_tuple(song.drums_only, song.sr)
+    # Metrics table
+    summary_rows = [
+        {'Metric': 'Overall Score', 'Value': f"{report.overall_score:.1f}/100",
+         'Target': '> 70'},
+        {'Metric': 'SI-SDR', 'Value': f"{report.mean_si_sdr:.1f} dB",
+         'Target': '> 10 dB'},
+        {'Metric': 'Sample Score', 'Value': f"{report.mean_sample_score:.1f}/100",
+         'Target': '> 60'},
+        {'Metric': 'Envelope Corr', 'Value': f"{report.mean_env_corr:.3f}",
+         'Target': '> 0.9'},
+        {'Metric': 'Onset Error', 'Value': f"{report.mean_onset_error_ms:.1f} ms",
+         'Target': '< 10 ms'},
+        {'Metric': 'Hit Count Acc', 'Value': f"{report.hit_count_accuracy:.2f}",
+         'Target': '> 0.9'},
+        {'Metric': 'Coverage', 'Value': f"{len(report.matches)}/{len(gt_samples)}",
+         'Target': 'All matched'},
+    ]
+    if report.unmatched_gt:
+        summary_rows.append({
+            'Metric': '⚠ Unmatched GT', 'Value': ', '.join(report.unmatched_gt),
+            'Target': 'None'
+        })
+    summary_df = pd.DataFrame(summary_rows)
+    # Match detail table
+    match_rows = []
+    for m in report.matches:
+        match_rows.append({
+            'Cluster': m.cluster_label,
+            'Matched GT': m.gt_name,
+            'SI-SDR (dB)': f"{m.si_sdr:.1f}",
+            'MFCC Dist': f"{m.mfcc_distance:.2f}",
+            'Env Corr': f"{m.envelope_corr:.3f}",
+            'Score': f"{m.sample_score:.1f}",
+            'Onset (ms)': f"{m.onset_precision_ms:.1f}",
+        })
+    match_df = pd.DataFrame(match_rows) if match_rows else pd.DataFrame()
+    # GT vs Extracted waveforms comparison
+    fig, axes = plt.subplots(len(gt_samples), 2, figsize=(12, len(gt_samples) * 2), squeeze=False)
+    fig.suptitle("Ground Truth vs Best Extracted", fontsize=12, fontweight='bold')
+    for idx, (gt_name, gt_audio) in enumerate(gt_samples.items()):
+        # GT waveform
+        t_gt = np.arange(len(gt_audio)) / song.sr
+        axes[idx, 0].plot(t_gt, gt_audio, color='#4CAF50', linewidth=0.5)
+        axes[idx, 0].set_ylabel(gt_name, fontsize=8)
+        axes[idx, 0].set_ylim(-1, 1)
+        if idx == 0:
+            axes[idx, 0].set_title("Ground Truth")
+        # Find matching extracted sample
+        matching = [m for m in report.matches if m.gt_name == gt_name]
+        if matching:
+            best_match = matching[0]
+            ext_cluster = [c for c in clusters if c.label == best_match.cluster_label]
+            if ext_cluster:
+                ext_audio = ext_cluster[0].best_hit.audio
+                t_ext = np.arange(len(ext_audio)) / song.sr
+                axes[idx, 1].plot(t_ext, ext_audio, color='#FF9800', linewidth=0.5)
+        axes[idx, 1].set_ylim(-1, 1)
+        if idx == 0:
+            axes[idx, 1].set_title("Extracted")
+    plt.tight_layout()
+    progress(1.0, desc="Done!")
+    return mix_out, drums_out, summary_df, match_df, fig, ""
+# ─────────────────────────────────────────────────────────────────────────────
+# Tab 3: Auto-Optimize
+# ─────────────────────────────────────────────────────────────────────────────
+# Global state for optimizer (persists across calls)
+# Result of the most recent run_optimization_loop call made by
+# run_auto_optimize; None until a run has completed.
+_optimizer_state = None
+# Messages collected through the optimizer's log_callback; rebound to a
+# fresh list at the start of every run_auto_optimize call.
+_optimizer_log = []
+def run_auto_optimize(n_iterations, progress=gr.Progress()):
+    """Run autonomous optimization loop."""
+    global _optimizer_state, _optimizer_log
+    _optimizer_log = []
+    def log_callback(msg):
+        _optimizer_log.append(msg)
+    progress(0.0, desc="Starting optimization...")
+    state = run_optimization_loop(
+        n_iterations=int(n_iterations),
+        patterns=['rock', 'funk', 'halftime'],
+        initial_params=PipelineParams(),
+        seed=42,
+        log_callback=log_callback,
+    )
+    _optimizer_state = state
+    progress(1.0, desc="Done!")
+    # Build outputs
+    log_text = '\n'.join(_optimizer_log)
+    # History table
+    hist_rows = []
+    for r in state.history:
+        hist_rows.append({
+            'Iter': r.iteration,
+            'Pattern': r.test_config.get('pattern', '?'),
+            'BPM': r.test_config.get('bpm', '?'),
+            'Score': f"{r.overall_score:.1f}",
+            'SI-SDR': f"{r.eval_report.get('mean_si_sdr', 0):.1f}" if isinstance(r.eval_report, dict) else 'err',
+            'Sample': f"{r.eval_report.get('mean_sample_score', 0):.1f}" if isinstance(r.eval_report, dict) else 'err',
+            'Time (s)': f"{r.duration_seconds:.1f}",
+        })
+    hist_df = pd.DataFrame(hist_rows)
+    # Optimization plot
+    fig = make_metrics_plot(state.history)
+    # Best params
+    best_params_str = json.dumps(state.best_params, indent=2)
+    return log_text, hist_df, fig, best_params_str
+# ─────────────────────────────────────────────────────────────────────────────
+# App layout
+# ─────────────────────────────────────────────────────────────────────────────
+def build_app():
+    with gr.Blocks(
+        title="🥁 Drum Sample Extractor",
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container { max-width: 1200px !important; }
+        .sample-audio { min-height: 60px; }
+        """
+    ) as app:
+        gr.Markdown("""
+        # 🥁 Drum Sample Extractor
+        Extract individual drum samples from audio files using **HTDemucs** stem separation,
+        **multi-band onset detection**, **spectral overlap decomposition**, and
+        **quality-aware clustering**.
+        Includes a synthetic evaluation framework with autonomous parameter optimization.
+        """)
+        with gr.Tabs():
+            # ── Tab 1: Extract ──
+            with gr.Tab("🎵 Extract", id=0):
+                gr.Markdown("Upload an audio file to extract drum samples.")
+                audio_in = gr.Audio(
+                    sources=['upload'],
+                    type='numpy',
+                    label='Upload Audio (MP3, WAV, FLAC)',
+                )
+                extract_btn = gr.Button("🔬 Extract Drum Samples", variant="primary", size="lg")
+                with gr.Row():
+                    drums_out = gr.Audio(type='numpy', label='🥁 Isolated Drum Stem', interactive=False)
+                gr.Markdown("### Extracted Samples")
+                gr.Markdown("*Best representative from each cluster, ranked by hit count:*")
+                with gr.Row():
+                    s0 = gr.Audio(type='numpy', label='Sample 1', interactive=False)
+                    s1 = gr.Audio(type='numpy', label='Sample 2', interactive=False)
+                    s2 = gr.Audio(type='numpy', label='Sample 3', interactive=False)
+                    s3 = gr.Audio(type='numpy', label='Sample 4', interactive=False)
+                with gr.Row():
+                    s4 = gr.Audio(type='numpy', label='Sample 5', interactive=False)
+                    s5 = gr.Audio(type='numpy', label='Sample 6', interactive=False)
+                    s6 = gr.Audio(type='numpy', label='Sample 7', interactive=False)
+                    s7 = gr.Audio(type='numpy', label='Sample 8', interactive=False)
+                gr.Markdown("### Quality Metrics")
+                metrics_table = gr.Dataframe(label="Cluster Quality Scores")
+                waveform_plot = gr.Plot(label="Waveforms")
+                extract_btn.click(
+                    fn=run_extraction,
+                    inputs=[audio_in],
+                    outputs=[drums_out, s0, s1, s2, s3, s4, s5, s6, s7,
+                             metrics_table, waveform_plot],
+                )
+            # ── Tab 2: Evaluate ──
+            with gr.Tab("📊 Evaluate", id=1):
+                gr.Markdown("""
+                ### Synthetic Evaluation
+                Generate a synthetic drum song with known ground-truth samples, run the extraction
+                pipeline, and compare results. This tells us exactly how well the system works.
+                """)
+                with gr.Row():
+                    pattern_dd = gr.Dropdown(
+                        choices=['rock', 'funk', 'halftime'],
+                        value='rock',
+                        label='Drum Pattern'
+                    )
+                    bpm_slider = gr.Slider(80, 200, value=120, step=2, label='BPM')
+                    bars_slider = gr.Slider(2, 8, value=4, step=1, label='Bars')
+                eval_btn = gr.Button("🧪 Generate & Evaluate", variant="primary", size="lg")
+                with gr.Row():
+                    eval_mix = gr.Audio(type='numpy', label='Synthetic Mix', interactive=False)
+                    eval_drums = gr.Audio(type='numpy', label='Drums Only', interactive=False)
+                gr.Markdown("### Evaluation Results")
+                eval_summary = gr.Dataframe(label="Summary Metrics")
+                eval_matches = gr.Dataframe(label="Cluster → Ground Truth Matches")
+                eval_plot = gr.Plot(label="GT vs Extracted Comparison")
+                eval_status = gr.Textbox(label="Status", visible=False)
+                eval_btn.click(
+                    fn=run_evaluation,
+                    inputs=[pattern_dd, bpm_slider, bars_slider],
+                    outputs=[eval_mix, eval_drums, eval_summary, eval_matches,
+                             eval_plot, eval_status],
+                )
+            # ── Tab 3: Auto-Optimize ──
+            with gr.Tab("🔄 Auto-Optimize", id=2):
+                gr.Markdown("""
+                ### Autonomous Parameter Optimization
+                Runs a loop: **generate** synthetic song → **extract** → **evaluate** against ground truth →
+                **diagnose** issues → **tune** parameters → repeat.
+                The optimizer reads evaluation metrics and makes targeted adjustments:
+                - High onset error → tighten `pre_pad` and `min_gap`
+                - Missing hits → lower `energy_threshold`
+                - Poor SI-SDR → adjust overlap separation
+                - Low sample score → rebalance selection weights
+                """)
+                with gr.Row():
+                    n_iters = gr.Slider(2, 30, value=5, step=1,
+                                        label='Number of Iterations')
+                    opt_btn = gr.Button("🚀 Run Optimization", variant="primary", size="lg")
+                opt_log = gr.Textbox(label="Optimization Log", lines=20,
+                                     max_lines=40)
+                gr.Markdown("### Results")
+                opt_table = gr.Dataframe(label="Iteration History")
+                opt_plot = gr.Plot(label="Optimization Progress")
+                opt_params = gr.Code(label="Best Parameters (JSON)", language="json")
+                opt_btn.click(
+                    fn=run_auto_optimize,
+                    inputs=[n_iters],
+                    outputs=[opt_log, opt_table, opt_plot, opt_params],
+                )
+    return app
+# ─────────────────────────────────────────────────────────────────────────────
+# Entry point
+# ─────────────────────────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    # Only when run as a script: build the UI and serve it on every network
+    # interface (0.0.0.0) at port 7860.
+    app = build_app()
+    app.launch(server_name="0.0.0.0", server_port=7860)