Spaces:

rikhoffbauer2
/

drum-sample-extractor

Sleeping

App Files Files Community

rikhoffbauer2 committed 25 days ago

Commit

fcf261a

verified ·

1 Parent(s): 2a334ed

v2: Update sample_extractor.py

Browse files

Files changed (1) hide show

sample_extractor.py +609 -0

sample_extractor.py ADDED Viewed

	@@ -0,0 +1,609 @@

+#!/usr/bin/env python3
+"""
+Sample Extractor — Generalized audio sample extraction pipeline.
+Extracts any distinct sound (drum hits, vocal stabs, guitar plucks, SFX, etc.)
+from audio, clusters identical occurrences, picks the best representative,
+and reconstructs the song as MIDI.
+Stages:
+  1. STEM SEPARATION  — HTDemucs isolates target stem (optional)
+  2. ONSET DETECTION  — Adaptive multi-method detection for any sound type
+  3. CLASSIFICATION & OVERLAP SEPARATION — Label sounds by frequency profile and
+     decompose simultaneous sounds via spectral bands
+  4. EMBEDDING & CLUSTERING — Group identical sounds, auto-K
+  5. QUALITY SCORING — Completeness + cleanness + onset sharpness
+  6. SYNTHESIS — Peak-aligned weighted average of cluster members
+  7. MIDI RECONSTRUCTION — Map clusters back to timeline as .mid
+"""
+import argparse, json, os, sys, warnings
+from collections import defaultdict
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+import librosa, numpy as np, soundfile as sf, torch
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+# ─── Data structures ─────────────────────────────────────────────────────────
+@dataclass
+class Hit:
+    """A single detected audio event: one audio segment plus its metadata.
+
+    Instances are created by ``detect_onsets`` and, for band-split sub-hits,
+    by ``classify_and_separate``.
+    """
+    # Samples of the segment.
+    audio: np.ndarray
+    # Sample rate passed to soundfile when the segment is written.
+    sr: int
+    # Onset position in seconds (from librosa.frames_to_time in detect_onsets).
+    onset_time: float
+    # Segment length in seconds (len(audio) / sr where hits are created).
+    duration: float
+    # Position of the hit in the list that holds it; used in exported filenames.
+    index: int
+    # Root-mean-square level of the segment; drives MIDI velocity in build_midi.
+    rms_energy: float = 0.0
+    # Mean spectral centroid of the segment; consumed by classify_hit.
+    spectral_centroid: float = 0.0
+    # Class name assigned by classify_hit ("" until classified).
+    label: str = ""
+    # Feature vector assigned in run_pipeline from compute_embeddings.
+    embedding: Optional[np.ndarray] = None
+    # Defaults to -1.
+    cluster_id: int = -1
+    def save(self, path: str):
+        """Write the segment to ``path`` with soundfile using the PCM_24 subtype."""
+        sf.write(path, self.audio, self.sr, subtype='PCM_24')
+@dataclass
+class Cluster:
+    """A group of similar sounds."""
+    # Sequential id assigned in cluster_hits (len(clusters) at creation time).
+    cluster_id: int
+    # "<hit label>_<sub-cluster number>", as built in cluster_hits.
+    label: str
+    # Member Hit objects.
+    hits: list = field(default_factory=list)
+    # Index into `hits` of the chosen representative (set by select_best).
+    best_hit_idx: int = 0
+    # Averaged sample produced by synthesize_from_cluster, if any.
+    synthesized: Optional[np.ndarray] = None
+    midi_note: int = 60  # assigned during MIDI export
+    @property
+    def best_hit(self) -> Hit:
+        """Return ``hits[best_hit_idx]``; raises IndexError if `hits` is empty."""
+        return self.hits[self.best_hit_idx]
+    @property
+    def count(self) -> int:
+        """Number of hits in the cluster."""
+        return len(self.hits)
+# ─── Stage 1: Stem separation ────────────────────────────────────────────────
+def extract_stem(audio_path: str, stem: str = "drums", device: str = "cpu") -> tuple:
+    """Extract a stem using HTDemucs. stem: drums|bass|vocals|other|all"""
+    if stem == "all":
+        y, sr = librosa.load(audio_path, sr=44100, mono=True)
+        return y.astype(np.float32), sr
+    from demucs.pretrained import get_model
+    from demucs.apply import apply_model
+    print(f"[Stage 1] Extracting '{stem}' stem with HTDemucs...")
+    for name in ["htdemucs_ft", "htdemucs"]:
+        try:
+            model = get_model(name)
+            break
+        except Exception:
+            continue
+    else:
+        raise RuntimeError("Could not load Demucs model")
+    model.eval().to(device)
+    sr = model.samplerate
+    audio_np, _ = librosa.load(audio_path, sr=sr, mono=False)
+    if audio_np.ndim == 1:
+        audio_np = np.stack([audio_np, audio_np])
+    elif audio_np.shape[0] > 2:
+        audio_np = audio_np[:2]
+    elif audio_np.shape[0] == 1:
+        audio_np = np.concatenate([audio_np, audio_np], axis=0)
+    wav = torch.from_numpy(audio_np).float().unsqueeze(0).to(device)
+    with torch.no_grad():
+        sources = apply_model(model, wav, device=device, shifts=1, split=True, overlap=0.25)
+    idx = model.sources.index(stem)
+    result = sources[0, idx].mean(dim=0).cpu().numpy()
+    print(f"  ✓ Extracted {stem}: {len(result)/sr:.1f}s")
+    return result.astype(np.float32), sr
+# ─── Stage 2: Onset detection (generalized) ──────────────────────────────────
+def detect_onsets(y: np.ndarray, sr: int, pre_pad: float = 0.005,
+                  min_dur: float = 0.02, max_dur: float = 1.5,
+                  min_gap: float = 0.015, energy_threshold_db: float = -45.0,
+                  mode: str = "auto") -> list:
+    """
+    Detect audio event onsets. mode: auto|percussive|harmonic|broadband
+    'auto' uses HPSS dual-channel detection (best general-purpose).
+    """
+    print(f"[Stage 2] Detecting onsets (mode={mode})...")
+    if mode == "percussive":
+        onset_env = librosa.onset.onset_strength(y=y, sr=sr, aggregate=np.median, fmax=8000)
+    elif mode == "harmonic":
+        y_harm, _ = librosa.effects.hpss(y)
+        onset_env = librosa.onset.onset_strength(y=y_harm, sr=sr, fmax=8000, lag=2, max_size=3)
+    elif mode == "broadband":
+        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
+    else:  # auto: multi-band max
+        y_harm, y_perc = librosa.effects.hpss(y)
+        env_low = librosa.onset.onset_strength(y=y, sr=sr, fmin=20, fmax=250, aggregate=np.median)
+        env_mid = librosa.onset.onset_strength(y=y, sr=sr, fmin=250, fmax=4000, aggregate=np.median)
+        env_high = librosa.onset.onset_strength(y=y, sr=sr, fmin=4000, fmax=min(sr//2, 20000), aggregate=np.median)
+        env_harm = librosa.onset.onset_strength(y=y_harm, sr=sr, lag=2)
+        def _n(x):
+            m = x.max(); return x/m if m > 0 else x
+        onset_env = np.maximum(np.maximum(_n(env_low), _n(env_mid)),
+                               np.maximum(_n(env_high), _n(env_harm)))
+    wait = max(1, int(min_gap * sr / 512))
+    frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, wait=wait,
+                                        pre_avg=3, post_avg=3, pre_max=3, post_max=5,
+                                        backtrack=True, units='frames')
+    times = librosa.frames_to_time(frames, sr=sr)
+    print(f"  Raw onsets: {len(times)}")
+    threshold = 10 ** (energy_threshold_db / 20)
+    hits = []
+    for i, t in enumerate(times):
+        s = max(0, int((t - pre_pad) * sr))
+        if i + 1 < len(times):
+            e = min(int(times[i+1] * sr), s + int(max_dur * sr))
+        else:
+            e = min(len(y), s + int(max_dur * sr))
+        seg = y[s:e]
+        if len(seg) < int(min_dur * sr):
+            continue
+        rms = np.sqrt(np.mean(seg**2))
+        if rms < threshold:
+            continue
+        # Fade out
+        fl = min(int(0.005 * sr), len(seg) // 4)
+        if fl > 0:
+            seg = seg.copy()
+            seg[-fl:] *= np.linspace(1, 0, fl)
+        sc = float(librosa.feature.spectral_centroid(y=seg, sr=sr).mean())
+        hits.append(Hit(audio=seg, sr=sr, onset_time=t, duration=len(seg)/sr,
+                        index=len(hits), rms_energy=float(rms), spectral_centroid=sc))
+    print(f"  ✓ Valid hits: {len(hits)}")
+    return hits
+# ─── Stage 3: Classification (generalized) ───────────────────────────────────
+LABEL_RULES = [
+    # (name, condition_fn)
+    ("kick",         lambda lr, mr, hr, c, zcr, d: lr > 0.5 and c < 800),
+    ("hihat_closed", lambda lr, mr, hr, c, zcr, d: hr > 0.35 and c > 4000 and d < 0.15),
+    ("hihat_open",   lambda lr, mr, hr, c, zcr, d: hr > 0.35 and c > 4000 and d >= 0.15),
+    ("cymbal",       lambda lr, mr, hr, c, zcr, d: hr > 0.25 and c > 3000),
+    ("snare",        lambda lr, mr, hr, c, zcr, d: mr > 0.4 and zcr > 0.1 and c > 1000),
+    ("tom",          lambda lr, mr, hr, c, zcr, d: lr > 0.3 and mr > 0.3 and c < 1500),
+    ("bass",         lambda lr, mr, hr, c, zcr, d: lr > 0.6 and c < 400 and d > 0.2),
+    ("vocal",        lambda lr, mr, hr, c, zcr, d: mr > 0.5 and c > 500 and c < 3000 and zcr < 0.15),
+    ("bright",       lambda lr, mr, hr, c, zcr, d: c > 2500),
+    ("mid",          lambda lr, mr, hr, c, zcr, d: c > 800),
+]
+def classify_hit(hit: Hit) -> str:
+    y, sr = hit.audio, hit.sr
+    D = np.abs(librosa.stft(y, n_fft=2048))
+    freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
+    le = np.sum(D[(freqs >= 20) & (freqs < 200)]**2)
+    me = np.sum(D[(freqs >= 200) & (freqs < 4000)]**2)
+    he = np.sum(D[(freqs >= 4000)]**2)
+    total = le + me + he + 1e-10
+    lr, mr, hr = le/total, me/total, he/total
+    zcr = float(librosa.feature.zero_crossing_rate(y=y).mean())
+    for name, fn in LABEL_RULES:
+        if fn(lr, mr, hr, hit.spectral_centroid, zcr, hit.duration):
+            return name
+    return "other"
+def spectral_decompose(hit: Hit, threshold: float = 0.15) -> dict:
+    """Split a hit into spectral sub-bands if multiple bands are significant."""
+    y, sr = hit.audio, hit.sr
+    D = librosa.stft(y, n_fft=2048)
+    freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
+    bands = {"low": (20, 250), "mid": (250, 4000), "high": (4000, sr//2)}
+    results = {}
+    for name, (lo, hi) in bands.items():
+        mask = (freqs >= lo) & (freqs <= hi)
+        Db = np.zeros_like(D); Db[mask] = D[mask]
+        ab = librosa.istft(Db, length=len(y))
+        if np.sqrt(np.mean(ab**2)) > 0.001:
+            results[name] = ab
+    return results
+def classify_and_separate(hits: list, separate_overlaps: bool = True,
+                          overlap_threshold: float = 0.15) -> list:
+    """Classify hits and optionally decompose overlapping sounds."""
+    print(f"[Stage 3] Classifying & separating...")
+    all_hits, overlap_count = [], 0
+    band_labels = {"low": "bass_hit", "mid": "mid_hit", "high": "bright_hit"}
+    for hit in hits:
+        hit.label = classify_hit(hit)
+        if separate_overlaps:
+            bands = spectral_decompose(hit, overlap_threshold)
+            if len(bands) >= 2:
+                energies = {k: np.sqrt(np.mean(v**2)) for k, v in bands.items()}
+                mx = max(energies.values())
+                sig = {k: v for k, v in bands.items() if energies[k] > overlap_threshold * mx}
+                if len(sig) >= 2:
+                    overlap_count += 1
+                    for bn, ba in sig.items():
+                        sc = float(librosa.feature.spectral_centroid(y=ba, sr=hit.sr).mean())
+                        sub = Hit(audio=ba, sr=hit.sr, onset_time=hit.onset_time,
+                                  duration=hit.duration, index=len(all_hits),
+                                  rms_energy=float(np.sqrt(np.mean(ba**2))),
+                                  spectral_centroid=sc, label=band_labels.get(bn, "other"))
+                        # Re-classify the sub-hit with full rules
+                        sub.label = classify_hit(sub)
+                        all_hits.append(sub)
+                    continue
+        hit.index = len(all_hits)
+        all_hits.append(hit)
+    counts = defaultdict(int)
+    for h in all_hits:
+        counts[h.label] += 1
+    print(f"  Overlaps decomposed: {overlap_count}")
+    print(f"  Total hits: {len(all_hits)}")
+    for l, c in sorted(counts.items(), key=lambda x: -x[1]):
+        print(f"    {l}: {c}")
+    return all_hits
+# ─── Stage 4: Embedding & Clustering ─────────────────────────────────────────
+def compute_embeddings(hits: list) -> np.ndarray:
+    """58-dim librosa feature embeddings."""
+    embs = []
+    for h in hits:
+        y, sr = h.audio, h.sr
+        ml = int(0.05 * sr)
+        if len(y) < ml:
+            y = np.pad(y, (0, ml - len(y)))
+        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
+        c = librosa.feature.spectral_centroid(y=y, sr=sr)
+        bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
+        ro = librosa.feature.spectral_rolloff(y=y, sr=sr)
+        ct = librosa.feature.spectral_contrast(y=y, sr=sr, n_bands=4)
+        fl = librosa.feature.spectral_flatness(y=y)
+        zcr = librosa.feature.zero_crossing_rate(y=y)
+        rms = librosa.feature.rms(y=y)
+        oe = librosa.onset.onset_strength(y=y, sr=sr)
+        if len(oe) > 1:
+            oen = oe / (oe.max() + 1e-10)
+            af = [oen.mean(), oen.std(), float(np.argmax(oen))/len(oen), oen[-1]]
+        else:
+            af = [0,0,0,0]
+        f = np.concatenate([mfcc.mean(1), mfcc.std(1), [c.mean(), c.std()],
+                            [bw.mean(), bw.std()], [ro.mean()], ct.mean(1),
+                            [fl.mean()], [zcr.mean()], [rms.mean()], af, [h.duration]])
+        embs.append(f)
+    embs = np.array(embs, dtype=np.float32)
+    mu, std = embs.mean(0), embs.std(0) + 1e-8
+    return (embs - mu) / std
+def cluster_hits(hits: list, embeddings: np.ndarray) -> list:
+    """Cluster by label group, then sub-cluster via silhouette-optimized KMeans."""
+    from sklearn.cluster import KMeans
+    from sklearn.metrics import silhouette_score
+    print(f"[Stage 4] Clustering...")
+    groups = defaultdict(list)
+    for i, h in enumerate(hits):
+        groups[h.label].append(i)
+    clusters = []
+    for label, indices in groups.items():
+        if len(indices) < 2:
+            clusters.append(Cluster(cluster_id=len(clusters), label=f"{label}_0",
+                                     hits=[hits[i] for i in indices]))
+            continue
+        ge = embeddings[indices]
+        mk = min(max(2, len(indices)//3), 15)
+        bk, bs = 1, -1
+        for k in range(2, mk+1):
+            try:
+                km = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
+                sl = km.fit_predict(ge)
+                s = silhouette_score(ge, sl)
+                if s > bs: bk, bs = k, s
+            except ValueError:
+                continue
+        if bk >= 2:
+            sl = KMeans(n_clusters=bk, random_state=42, n_init=10).fit_predict(ge)
+        else:
+            sl = np.zeros(len(indices), dtype=int)
+        for sid in range(max(sl)+1):
+            mask = sl == sid
+            mi = [indices[j] for j in range(len(indices)) if mask[j]]
+            clusters.append(Cluster(cluster_id=len(clusters), label=f"{label}_{sid}",
+                                     hits=[hits[i] for i in mi]))
+        print(f"  {label}: {len(indices)} → {bk} sub-clusters (sil={bs:.3f})")
+    print(f"  ✓ Total clusters: {len(clusters)}")
+    return clusters
+# ─── Stage 5: Quality scoring & selection ─────────────────────────────────────
+def sample_quality_score(y: np.ndarray, sr: int, label: str = "other") -> dict:
+    """Score a sample for production quality. Returns dict with total [0,100]."""
+    # Completeness
+    rms_env = librosa.feature.rms(y=y, frame_length=512, hop_length=128)[0]
+    if len(rms_env) >= 10:
+        pk = np.argmax(rms_env); post = rms_env[pk:]
+        tail_r = np.mean(post[-max(3, len(post)//5):]) / (rms_env[pk] + 1e-8)
+        c1 = max(0, 1.0 - tail_r * 5)
+    else:
+        c1 = 0.5
+    import scipy.stats
+    if len(rms_env) >= 10:
+        pk = np.argmax(rms_env); post = rms_env[pk:]
+        if len(post) >= 5:
+            slope, _, r, _, _ = scipy.stats.linregress(np.arange(len(post)), np.log(post+1e-8))
+            c2 = max(0, r**2) if slope < 0 else r**2 * 0.3
+        else:
+            c2 = 0.0
+    else:
+        c2 = 0.0
+    completeness = c1 * 0.6 + c2 * 0.4
+    # Cleanness: robust SNR + pre-onset energy
+    snr = 10*np.log10(np.percentile(y**2, 99) / (np.percentile(y**2, 10) + 1e-12))
+    n_snr = np.clip((snr - 10) / 40, 0, 1)
+    onsets = librosa.onset.onset_detect(y=y, sr=sr, units='samples', backtrack=True)
+    if len(onsets) > 0:
+        os_s = int(onsets[0])
+        pre = y[max(0, os_s-int(sr*.02)):os_s]
+        sig = y[os_s:os_s+int(sr*.1)]
+        if len(pre) > 10 and len(sig) > 10:
+            pdb = 10*np.log10(np.mean(pre**2+1e-12)/np.mean(sig**2+1e-12))
+            n_pre = np.clip((-pdb - 5) / 30, 0, 1)
+        else:
+            n_pre = 0.5
+    else:
+        n_pre = 0.5
+    cleanness = n_snr * 0.5 + n_pre * 0.5
+    # Onset quality
+    oe = librosa.onset.onset_strength(y=y, sr=sr)
+    sharpness = float(np.max(oe) / (np.mean(oe) + 1e-8)) if len(oe) > 1 else 1.0
+    onset_q = float(np.clip((sharpness - 1.0) / 5.0, 0, 1))
+    total = (completeness * 0.30 + cleanness * 0.40 + onset_q * 0.20 + 0.5 * 0.10) * 100
+    return {'total': float(total), 'completeness': float(completeness),
+            'cleanness': float(cleanness), 'onset_quality': float(onset_q)}
+def select_best(clusters: list):
+    """Select best representative per cluster using quality scoring."""
+    print(f"[Stage 5] Selecting best representatives...")
+    for c in clusters:
+        if c.count <= 1:
+            c.best_hit_idx = 0; continue
+        scores = [sample_quality_score(h.audio, h.sr, c.label.rsplit('_',1)[0])['total']
+                  for h in c.hits]
+        c.best_hit_idx = int(np.argmax(scores))
+# ─── Stage 6: Synthesis ──────────────────────────────────────────────────────
+def synthesize_from_cluster(cluster: Cluster) -> Optional[np.ndarray]:
+    """Peak-aligned weighted average synthesis."""
+    if cluster.count < 2:
+        return None
+    tl = int(np.median([len(h.audio) for h in cluster.hits]))
+    aligned, weights = [], []
+    pp_target = None
+    for i, h in enumerate(cluster.hits):
+        a = h.audio.copy()
+        pp = np.argmax(np.abs(a))
+        if pp_target is None: pp_target = pp
+        shift = pp_target - pp
+        if shift > 0: a = np.pad(a, (shift, 0))
+        elif shift < 0: a = a[-shift:]
+        a = a[:tl] if len(a) >= tl else np.pad(a, (0, tl - len(a)))
+        pk = np.abs(a).max()
+        if pk > 0: a = a / pk
+        aligned.append(a)
+        weights.append(2.0 if i == cluster.best_hit_idx else 1.0)
+    aligned = np.array(aligned)
+    w = np.array(weights); w /= w.sum()
+    synth = np.average(aligned, axis=0, weights=w)
+    pk = np.abs(synth).max()
+    return (synth * 0.95 / pk).astype(np.float32) if pk > 0 else synth.astype(np.float32)
+# ─── Stage 7: MIDI reconstruction ────────────────────────────────────────────
+def build_midi(clusters: list, bpm: float = 120.0) -> 'pretty_midi.PrettyMIDI':
+    """Build MIDI file mapping each cluster to a unique note."""
+    import pretty_midi
+    pm = pretty_midi.PrettyMIDI(initial_tempo=bpm)
+    # Assign MIDI notes: one per cluster, starting at C2 (36)
+    base_note = 36
+    for i, c in enumerate(clusters):
+        c.midi_note = min(base_note + i, 127)
+    # Create one instrument for all (using Standard Drums channel for now)
+    inst = pretty_midi.Instrument(program=0, is_drum=True, name='Extracted Samples')
+    pm.instruments.append(inst)
+    for c in clusters:
+        for h in c.hits:
+            vel = max(1, min(127, int(h.rms_energy / 0.3 * 127)))
+            note = pretty_midi.Note(velocity=vel, pitch=c.midi_note,
+                                     start=h.onset_time,
+                                     end=h.onset_time + max(h.duration, 0.05))
+            inst.notes.append(note)
+    # Sort notes by start time
+    inst.notes.sort(key=lambda n: n.start)
+    return pm
+def export_midi(clusters: list, output_path: str, bpm: float = 120.0):
+    """Export MIDI file."""
+    pm = build_midi(clusters, bpm)
+    pm.write(output_path)
+    print(f"  ✓ MIDI saved: {output_path} ({len(pm.instruments[0].notes)} notes)")
+    return pm
+def build_sample_map(clusters: list) -> dict:
+    """Build a mapping from MIDI note → cluster for DAW import."""
+    return {
+        c.midi_note: {
+            'label': c.label,
+            'count': c.count,
+            'duration_ms': int(c.best_hit.duration * 1000),
+        }
+        for c in clusters
+    }
+# ─── Main pipeline ───────────────────────────────────────────────────────────
+def run_pipeline(
+    audio_path: str,
+    output_dir: str = "./extracted_samples",
+    stem: str = "drums",            # drums|bass|vocals|other|all
+    device: str = "auto",
+    onset_mode: str = "auto",       # auto|percussive|harmonic|broadband
+    separate_overlaps: bool = True,
+    overlap_threshold: float = 0.15,
+    synthesize: bool = True,
+    export_midi_file: bool = True,
+    bpm: float = 120.0,
+    min_dur: float = 0.02,
+    max_dur: float = 1.5,
+    energy_threshold_db: float = -45.0,
+    pre_pad: float = 0.005,
+    min_gap: float = 0.015,
+    save_intermediates: bool = True,
+) -> tuple:
+    """Run the full extraction pipeline. Returns (clusters, hits, midi_pm).
+
+    Files written under ``output_dir`` (created if missing):
+      * ``<stem>_stem.wav`` and ``all_hits/*.wav`` when ``save_intermediates``;
+      * ``reconstruction.mid`` and ``sample_map.json`` when ``export_midi_file``;
+      * ``samples/<label>__best.wav`` for every cluster;
+      * ``synthesized/<label>__synthesized.wav`` for clusters that were
+        synthesized, when ``synthesize``;
+      * ``manifest.json`` describing every cluster.
+
+    Args:
+        audio_path: Input audio file.
+        output_dir: Directory receiving all outputs.
+        stem: Stem passed to ``extract_stem``.
+        device: Torch device; "auto" picks "cuda" when available, else "cpu".
+        onset_mode: Mode passed to ``detect_onsets``.
+        separate_overlaps, overlap_threshold: Passed to ``classify_and_separate``.
+        synthesize: Whether to build an averaged sample for multi-hit clusters.
+        export_midi_file: Whether to write the MIDI file and sample map.
+        bpm: Initial tempo of the exported MIDI.
+        min_dur, max_dur, energy_threshold_db, pre_pad, min_gap: Passed to
+            ``detect_onsets``.
+        save_intermediates: Whether to write the stem and every individual hit.
+
+    Returns:
+        ``(clusters, hits, midi_pm)``; ``([], [], None)`` when no hits are
+        detected. ``midi_pm`` is None when ``export_midi_file`` is False.
+    """
+    if device == "auto":
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+    out = Path(output_dir); out.mkdir(parents=True, exist_ok=True)
+    # Stage 1: obtain the signal to analyse
+    audio, sr = extract_stem(audio_path, stem=stem, device=device)
+    if save_intermediates:
+        sf.write(str(out / f"{stem}_stem.wav"), audio, sr, subtype='PCM_24')
+    # Stage 2: slice the signal into candidate hits
+    hits = detect_onsets(audio, sr, pre_pad=pre_pad, min_dur=min_dur,
+                         max_dur=max_dur, min_gap=min_gap,
+                         energy_threshold_db=energy_threshold_db, mode=onset_mode)
+    if not hits:
+        # Nothing to cluster or export; nothing beyond the stem has been written
+        print("⚠ No hits detected!")
+        return [], [], None
+    # Stage 3: label hits and optionally split overlapping ones
+    hits = classify_and_separate(hits, separate_overlaps=separate_overlaps,
+                                  overlap_threshold=overlap_threshold)
+    if save_intermediates:
+        hd = out / "all_hits"; hd.mkdir(exist_ok=True)
+        for h in hits:
+            h.save(str(hd / f"hit_{h.index:04d}_{h.label}_{h.onset_time:.3f}s.wav"))
+    # Stage 4: embed every hit, then cluster within each label
+    print(f"[Stage 4a] Computing embeddings...")
+    embs = compute_embeddings(hits)
+    print(f"  ✓ Embeddings: {embs.shape}")
+    for i, h in enumerate(hits): h.embedding = embs[i]
+    clusters = cluster_hits(hits, embs)
+    # Stage 5: pick each cluster's representative hit
+    select_best(clusters)
+    # Stage 6: averaged sample for clusters with at least two hits
+    if synthesize:
+        print(f"[Stage 6] Synthesizing...")
+        for c in clusters:
+            if c.count >= 2:
+                c.synthesized = synthesize_from_cluster(c)
+    # Stage 7: MIDI (also assigns c.midi_note; it stays at its default otherwise)
+    midi_pm = None
+    if export_midi_file:
+        print(f"[Stage 7] Building MIDI reconstruction...")
+        midi_pm = export_midi(clusters, str(out / "reconstruction.mid"), bpm=bpm)
+        # Save sample map
+        smap = build_sample_map(clusters)
+        with open(str(out / "sample_map.json"), 'w') as f:
+            json.dump(smap, f, indent=2)
+        print(f"  Sample map: {out / 'sample_map.json'}")
+    # Export: one best (and optionally one synthesized) file per cluster
+    print(f"[Export] Saving samples...")
+    sd = out / "samples"; sd.mkdir(exist_ok=True)
+    if synthesize:
+        synd = out / "synthesized"; synd.mkdir(exist_ok=True)
+    manifest = []
+    for c in clusters:
+        best = c.best_hit
+        sp = sd / f"{c.label}__best.wav"; best.save(str(sp))
+        entry = {'cluster_id': c.cluster_id, 'label': c.label, 'count': c.count,
+                 'midi_note': c.midi_note, 'best_onset': best.onset_time,
+                 'best_duration': best.duration, 'best_energy': best.rms_energy}
+        # `synd` is only defined when synthesize is True, which this guard ensures
+        if synthesize and c.synthesized is not None:
+            synp = synd / f"{c.label}__synthesized.wav"
+            sf.write(str(synp), c.synthesized, best.sr, subtype='PCM_24')
+            entry['synthesized'] = str(synp)
+        manifest.append(entry)
+        print(f"  ✓ {c.label}: {c.count} hits → MIDI note {c.midi_note}")
+    with open(str(out / "manifest.json"), 'w') as f:
+        json.dump(manifest, f, indent=2)
+    # Final summary
+    print(f"\n{'='*50}")
+    print(f"  Clusters: {len(clusters)}")
+    print(f"  Total hits: {sum(c.count for c in clusters)}")
+    print(f"  Output: {output_dir}")
+    return clusters, hits, midi_pm
+# ─── CLI ──────────────────────────────────────────────────────────────────────
+def main():
+    p = argparse.ArgumentParser(description="Extract audio samples from any audio file")
+    p.add_argument("input", help="Input audio file")
+    p.add_argument("-o", "--output-dir", default="./extracted_samples")
+    p.add_argument("--stem", default="drums", choices=["drums","bass","vocals","other","all"])
+    p.add_argument("--onset-mode", default="auto", choices=["auto","percussive","harmonic","broadband"])
+    p.add_argument("--no-gpu", action="store_true")
+    p.add_argument("--no-separate", action="store_true")
+    p.add_argument("--no-midi", action="store_true")
+    p.add_argument("--bpm", type=float, default=120.0)
+    p.add_argument("--min-dur", type=float, default=0.02)
+    p.add_argument("--max-dur", type=float, default=1.5)
+    p.add_argument("--energy-threshold", type=float, default=-45.0)
+    args = p.parse_args()
+    run_pipeline(audio_path=args.input, output_dir=args.output_dir,
+                 stem=args.stem, device="cpu" if args.no_gpu else "auto",
+                 onset_mode=args.onset_mode, separate_overlaps=not args.no_separate,
+                 export_midi_file=not args.no_midi, bpm=args.bpm,
+                 min_dur=args.min_dur, max_dur=args.max_dur,
+                 energy_threshold_db=args.energy_threshold)
+if __name__ == "__main__":
+    main()