Spaces:

rikhoffbauer2
/

drum-sample-extractor

Sleeping

App Files Files Community

rikhoffbauer2 committed 21 days ago

Commit

d1fa59c

verified ·

1 Parent(s): 539cac7

v6: Fix clustering for real music — auto-scale NCC window, n_clusters fallback, better defaults

Browse files

Files changed (1) hide show

sample_extractor.py +226 -337

sample_extractor.py CHANGED Viewed

@@ -1,23 +1,15 @@
 #!/usr/bin/env python3
 """
-Sample Extractor v4 — NCC-based clustering, full parameter control.
-Key fix: Uses normalized cross-correlation (NCC) to detect identical samples
-instead of MFCC-based KMeans. NCC is amplitude-invariant — same kick at
-different velocities → NCC ≈ 1.0. This correctly groups repeated occurrences
-of the same sample into one cluster.
-Stages:
-  1. STEM SEPARATION  — Demucs (configurable model) isolates target stem
-  2. ONSET DETECTION  — Adaptive multi-method detection
-  3. CLASSIFICATION   — Spectral profile labeling (post-overlap-separation aware)
-  4. NCC CLUSTERING   — Waveform identity matching via cross-correlation
-  5. QUALITY SCORING  — Completeness + cleanness + onset sharpness
-  6. SYNTHESIS        — Peak-aligned weighted average
-  7. MIDI + RENDER    — Timeline reconstruction as .mid and .wav
 """
-import argparse, json, os, sys, warnings
 from collections import defaultdict
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -49,86 +41,70 @@ class Cluster:
     def count(self) -> int: return len(self.hits)
-# Available Demucs models
-DEMUCS_MODELS = [
-    "htdemucs",       # Default hybrid transformer, 4-stem, fast
-    "htdemucs_ft",    # Fine-tuned, 4-stem, best quality, slower (bag of 4)
-    "htdemucs_6s",    # 6-stem: adds guitar + piano
-    "mdx",            # MDX competition winner, waveform U-Net
-    "mdx_extra",      # Hybrid spectral, highest quality overall
-    "mdx_extra_q",    # Quantized mdx_extra (needs diffq)
-]
 DEMUCS_STEMS = {
-    "htdemucs":    ["drums", "bass", "other", "vocals"],
-    "htdemucs_ft": ["drums", "bass", "other", "vocals"],
-    "htdemucs_6s": ["drums", "bass", "other", "vocals", "guitar", "piano"],
-    "mdx":         ["drums", "bass", "other", "vocals"],
-    "mdx_extra":   ["drums", "bass", "other", "vocals"],
-    "mdx_extra_q": ["drums", "bass", "other", "vocals"],
 }
 # ─── Stage 1: Stem separation ────────────────────────────────────────────────
 def extract_stem(audio_path: str, stem: str = "drums", device: str = "cpu",
                  model_name: str = "htdemucs_ft", shifts: int = 1,
                  overlap: float = 0.25) -> tuple:
-    """Extract a stem using Demucs. Cached by (file_hash, stem, model, shifts, overlap)."""
     if stem == "all":
         y, sr = librosa.load(audio_path, sr=44100, mono=True)
         return y.astype(np.float32), sr
-    # Cache key from file content hash + params
     with open(audio_path, 'rb') as f:
-        file_hash = hashlib.md5(f.read(200000)).hexdigest()  # hash first 200KB
-    cache_key = ("stem", file_hash, stem, model_name, shifts, overlap)
-    cached = cache_get(cache_key)
     if cached is not None:
-        print(f"[Stage 1] Using cached {stem} stem ({model_name})")
-        return cached
     from demucs.pretrained import get_model
     from demucs.apply import apply_model
-    print(f"[Stage 1] Extracting '{stem}' with {model_name} (shifts={shifts}, overlap={overlap})...")
-    model = get_model(model_name)
-    model.eval().to(device)
     sr = model.samplerate
     if stem not in model.sources:
-        raise ValueError(f"Stem '{stem}' not in model '{model_name}'. Available: {model.sources}")
     audio_np, _ = librosa.load(audio_path, sr=sr, mono=False)
-    if audio_np.ndim == 1:
-        audio_np = np.stack([audio_np, audio_np])
-    elif audio_np.shape[0] > 2:
-        audio_np = audio_np[:2]
-    elif audio_np.shape[0] == 1:
-        audio_np = np.concatenate([audio_np, audio_np], axis=0)
     wav = torch.from_numpy(audio_np).float().unsqueeze(0).to(device)
     with torch.no_grad():
-        sources = apply_model(model, wav, device=device, shifts=shifts,
-                              split=True, overlap=overlap)
-    idx = model.sources.index(stem)
-    result = sources[0, idx].mean(dim=0).cpu().numpy()
     print(f"  ✓ {stem}: {len(result)/sr:.1f}s")
-    out = (result.astype(np.float32), sr)
-    return cache_set(cache_key, out)
 # ─── Stage 2: Onset detection ────────────────────────────────────────────────
 def detect_onsets(y: np.ndarray, sr: int, pre_pad: float = 0.005,
                   min_dur: float = 0.02, max_dur: float = 1.5,
-                  min_gap: float = 0.015, energy_threshold_db: float = -45.0,
                   mode: str = "auto", backtrack: bool = True,
-                  onset_delta: float = 0.07) -> list:
-    """Detect onsets. mode: auto|percussive|harmonic|broadband"""
-    print(f"[Stage 2] Detecting onsets (mode={mode}, delta={onset_delta})...")
     if mode == "percussive":
         onset_env = librosa.onset.onset_strength(y=y, sr=sr, aggregate=np.median, fmax=8000)
     elif mode == "harmonic":
@@ -136,21 +112,19 @@ def detect_onsets(y: np.ndarray, sr: int, pre_pad: float = 0.005,
         onset_env = librosa.onset.onset_strength(y=y_harm, sr=sr, fmax=8000, lag=2, max_size=3)
     elif mode == "broadband":
         onset_env = librosa.onset.onset_strength(y=y, sr=sr)
-    else:  # auto
         y_harm, y_perc = librosa.effects.hpss(y)
         envs = [
             librosa.onset.onset_strength(y=y, sr=sr, fmin=20, fmax=250, aggregate=np.median),
             librosa.onset.onset_strength(y=y, sr=sr, fmin=250, fmax=4000, aggregate=np.median),
-            librosa.onset.onset_strength(y=y, sr=sr, fmin=4000, fmax=min(sr//2, 20000), aggregate=np.median),
             librosa.onset.onset_strength(y=y_harm, sr=sr, lag=2),
         ]
-        def _n(x):
-            m = x.max(); return x/m if m > 0 else x
         onset_env = np.maximum.reduce([_n(e) for e in envs])
     wait = max(1, int(min_gap * sr / 512))
-    frames = librosa.onset.onset_detect(
-        onset_envelope=onset_env, sr=sr, wait=wait,
         pre_avg=3, post_avg=3, pre_max=3, post_max=5,
         delta=onset_delta, backtrack=backtrack, units='frames')
     times = librosa.frames_to_time(frames, sr=sr)
@@ -160,17 +134,13 @@ def detect_onsets(y: np.ndarray, sr: int, pre_pad: float = 0.005,
     hits = []
     for i, t in enumerate(times):
         s = max(0, int((t - pre_pad) * sr))
-        if i + 1 < len(times):
-            e = min(int(times[i+1] * sr), s + int(max_dur * sr))
-        else:
-            e = min(len(y), s + int(max_dur * sr))
         seg = y[s:e]
-        if len(seg) < int(min_dur * sr): continue
         rms = np.sqrt(np.mean(seg**2))
         if rms < threshold: continue
-        fl = min(int(0.005 * sr), len(seg) // 4)
-        if fl > 0:
-            seg = seg.copy(); seg[-fl:] *= np.linspace(1, 0, fl)
         sc = float(librosa.feature.spectral_centroid(y=seg, sr=sr).mean())
         hits.append(Hit(audio=seg, sr=sr, onset_time=t, duration=len(seg)/sr,
                         index=len(hits), rms_energy=float(rms), spectral_centroid=sc))
@@ -181,294 +151,221 @@ def detect_onsets(y: np.ndarray, sr: int, pre_pad: float = 0.005,
 # ─── Stage 3: Classification ─────────────────────────────────────────────────
 LABEL_RULES = [
-    ("kick",         lambda lr, mr, hr, c, zcr, d: lr > 0.5 and c < 800),
-    ("hihat_closed", lambda lr, mr, hr, c, zcr, d: hr > 0.35 and c > 4000 and d < 0.15),
-    ("hihat_open",   lambda lr, mr, hr, c, zcr, d: hr > 0.35 and c > 4000 and d >= 0.15),
-    ("cymbal",       lambda lr, mr, hr, c, zcr, d: hr > 0.25 and c > 3000),
-    ("snare",        lambda lr, mr, hr, c, zcr, d: mr > 0.4 and zcr > 0.1 and c > 1000),
-    ("tom",          lambda lr, mr, hr, c, zcr, d: lr > 0.3 and mr > 0.3 and c < 1500),
-    ("bass",         lambda lr, mr, hr, c, zcr, d: lr > 0.6 and c < 400 and d > 0.2),
-    ("vocal",        lambda lr, mr, hr, c, zcr, d: mr > 0.5 and c > 500 and c < 3000 and zcr < 0.15),
-    ("bright",       lambda lr, mr, hr, c, zcr, d: c > 2500),
-    ("mid",          lambda lr, mr, hr, c, zcr, d: c > 800),
 ]
 def classify_hit(hit: Hit) -> str:
     y, sr = hit.audio, hit.sr
     D = np.abs(librosa.stft(y, n_fft=2048))
     freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
-    le = np.sum(D[(freqs >= 20) & (freqs < 200)]**2)
-    me = np.sum(D[(freqs >= 200) & (freqs < 4000)]**2)
-    he = np.sum(D[(freqs >= 4000)]**2)
-    total = le + me + he + 1e-10
-    lr, mr, hr = le/total, me/total, he/total
     zcr = float(librosa.feature.zero_crossing_rate(y=y).mean())
     for name, fn in LABEL_RULES:
-        if fn(lr, mr, hr, hit.spectral_centroid, zcr, hit.duration):
-            return name
     return "other"
 def classify_hits(hits: list) -> list:
-    """Classify all hits. No overlap separation — clustering handles grouping."""
     print(f"[Stage 3] Classifying {len(hits)} hits...")
-    for h in hits:
-        h.label = classify_hit(h)
     counts = defaultdict(int)
     for h in hits: counts[h.label] += 1
-    for l, c in sorted(counts.items(), key=lambda x: -x[1]):
-        print(f"    {l}: {c}")
     return hits
-# ─── Caching ──────────────────────────────────────────────────────────────────
-import hashlib, functools
-_cache = {}  # key → value; cleared per-session or manually
-def _audio_hash(audio: np.ndarray) -> str:
-    """Fast hash of audio array for cache keys."""
-    return hashlib.md5(audio[:4000].tobytes()).hexdigest()
-def cache_get(key):
-    return _cache.get(key)
-def cache_set(key, value):
-    _cache[key] = value
-    return value
-def cache_clear():
-    _cache.clear()
 # ─── Stage 4: NCC-based clustering ───────────────────────────────────────────
 def ncc_max(a: np.ndarray, b: np.ndarray) -> float:
-    """Normalized cross-correlation peak. Amplitude-invariant.
-    Returns 1.0 for identical waveforms at any amplitude."""
-    a = a - a.mean()
-    b = b - b.mean()
-    norm = np.sqrt(np.dot(a, a) * np.dot(b, b))
     if norm < 1e-10: return 0.0
-    n = max(len(a), len(b))
-    a_pad = np.pad(a, (0, max(0, n - len(a))))
-    b_pad = np.pad(b, (0, max(0, n - len(b))))
-    cc = fftconvolve(a_pad, b_pad[::-1], mode='full')
     return float(np.max(np.abs(cc))) / norm
 def build_ncc_distance_matrix(hits: list, max_compare_samples: int = 8820) -> np.ndarray:
-    """Build N×N distance matrix using NCC. d=0 identical, d=1 unrelated.
-    Cached — recomputing is the most expensive step."""
-    # Cache key from hit audio hashes
     key = ("ncc_dist", tuple(_audio_hash(h.audio) for h in hits), max_compare_samples)
     cached = cache_get(key)
     if cached is not None:
-        print(f"  Using cached NCC distance matrix")
-        return cached
     N = len(hits)
     D = np.zeros((N, N), dtype=np.float32)
     for i in range(N):
         ai = hits[i].audio[:max_compare_samples]
         for j in range(i+1, N):
             bj = hits[j].audio[:max_compare_samples]
-            ncc = ncc_max(ai, bj)
-            D[i, j] = D[j, i] = max(0.0, 1.0 - ncc)
     return cache_set(key, D)
-def _agglom_at_threshold(D: np.ndarray, dist_threshold: float, linkage: str = 'average') -> np.ndarray:
-    """Run agglomerative clustering at a specific threshold. Returns labels."""
-    from sklearn.cluster import AgglomerativeClustering
-    agg = AgglomerativeClustering(
-        n_clusters=None,
-        distance_threshold=max(0.001, dist_threshold),
-        metric='precomputed',
-        linkage=linkage,
-    )
-    return agg.fit_predict(D)
-def _labels_to_clusters(labels: np.ndarray, hits: list) -> list:
-    """Convert sklearn labels to Cluster objects with majority-vote naming."""
     cluster_map = defaultdict(list)
-    for i, lab in enumerate(labels):
-        cluster_map[lab].append(i)
     clusters = []
     for _, indices in sorted(cluster_map.items()):
-        label_votes = defaultdict(int)
-        for idx in indices:
-            label_votes[hits[idx].label] += 1
-        majority_label = max(label_votes, key=label_votes.get)
-        existing = sum(1 for c in clusters if c.label.rsplit('_', 1)[0] == majority_label)
-        clusters.append(Cluster(
-            cluster_id=len(clusters),
-            label=f"{majority_label}_{existing}",
-            hits=[hits[i] for i in indices],
-        ))
     clusters.sort(key=lambda c: c.count, reverse=True)
-    for i, c in enumerate(clusters):
-        c.cluster_id = i
     return clusters
 def cluster_hits(hits: list, ncc_threshold: float = 0.80,
-                 max_compare_ms: float = 200.0,
                  target_min: int = 0, target_max: int = 0,
                  linkage: str = 'average') -> list:
-    """Cluster hits by waveform identity using NCC + agglomerative clustering.
-    If target_min/target_max are set (both > 0), ignores ncc_threshold and
-    binary-searches the distance threshold to produce a cluster count within
-    [target_min, target_max]. This is the most intuitive way to control output.
-    linkage: 'average' (recommended — tolerant of outlier pairs),
-             'complete' (strict — any bad pair splits the cluster),
-             'single' (loose — chains distant points together).
     """
-    if not hits:
-        return []
-    N = len(hits)
-    sr = hits[0].sr
-    max_samples = int(max_compare_ms / 1000.0 * sr)
-    print(f"[Stage 4] NCC clustering ({N} hits, linkage={linkage})...")
-    if N == 1:
-        return [Cluster(cluster_id=0, label=f"{hits[0].label}_0", hits=[hits[0]])]
-    # Build (or retrieve cached) distance matrix
     print(f"  Computing {N*(N-1)//2} pairwise NCC distances...")
     D = build_ncc_distance_matrix(hits, max_compare_samples=max_samples)
-    use_target_range = target_min > 0 and target_max > 0 and target_max >= target_min
     target_min = max(1, min(target_min, N))
     target_max = max(target_min, min(target_max, N))
-    if use_target_range:
-        # Binary search for the distance threshold that gives target cluster count
-        print(f"  Target range: {target_min}–{target_max} clusters, searching threshold...")
-        lo, hi = 0.001, 1.0
-        best_labels = None
-        best_n = -1
-        for _ in range(30):  # max 30 binary search steps
             mid = (lo + hi) / 2
-            labels = _agglom_at_threshold(D, mid, linkage)
             n = len(set(labels))
             if target_min <= n <= target_max:
-                best_labels = labels
-                best_n = n
-                break
-            elif n > target_max:
-                # Too many clusters → need higher threshold (merge more)
-                lo = mid
-            else:
-                # Too few clusters → need lower threshold (split more)
-                hi = mid
-            # Keep best attempt in range
-            if best_labels is None or abs(n - (target_min + target_max) / 2) < abs(best_n - (target_min + target_max) / 2):
-                best_labels = labels
-                best_n = n
         labels = best_labels
-        print(f"  → threshold={mid:.4f}, {best_n} clusters")
     else:
-        # Use fixed NCC threshold
         dist_threshold = max(0.001, 1.0 - ncc_threshold)
-        print(f"  Fixed threshold: NCC≥{ncc_threshold} (dist≤{dist_threshold:.3f})")
-        labels = _agglom_at_threshold(D, dist_threshold, linkage)
-    n_clusters = len(set(labels))
-    print(f"  ✓ {n_clusters} clusters")
     clusters = _labels_to_clusters(labels, hits)
-    for c in clusters:
-        print(f"    {c.label}: {c.count} hits")
     return clusters
-# ─── Stage 5: Quality scoring & selection ─────────────────────────────────────
-def sample_quality_score(y: np.ndarray, sr: int, label: str = "other") -> dict:
-    """Score a sample for production quality. Returns dict with total [0,100]."""
     import scipy.stats
     rms_env = librosa.feature.rms(y=y, frame_length=512, hop_length=128)[0]
-    # Completeness
     if len(rms_env) >= 10:
         pk = np.argmax(rms_env); post = rms_env[pk:]
-        tail_r = np.mean(post[-max(3, len(post)//5):]) / (rms_env[pk] + 1e-8)
-        c1 = max(0, 1.0 - tail_r * 5)
-        if len(post) >= 5:
-            slope, _, r, _, _ = scipy.stats.linregress(np.arange(len(post)), np.log(post+1e-8))
-            c2 = max(0, r**2) if slope < 0 else r**2 * 0.3
-        else: c2 = 0.0
-    else: c1, c2 = 0.5, 0.0
-    completeness = c1 * 0.6 + c2 * 0.4
-    # Cleanness
-    snr = 10*np.log10(np.percentile(y**2, 99) / (np.percentile(y**2, 10) + 1e-12))
-    n_snr = np.clip((snr - 10) / 40, 0, 1)
     onsets = librosa.onset.onset_detect(y=y, sr=sr, units='samples', backtrack=True)
-    if len(onsets) > 0:
-        os_s = int(onsets[0])
-        pre = y[max(0, os_s-int(sr*.02)):os_s]
-        sig = y[os_s:os_s+int(sr*.1)]
-        n_pre = np.clip((-10*np.log10(np.mean(pre**2+1e-12)/np.mean(sig**2+1e-12)) - 5)/30, 0, 1) \
-                if len(pre) > 10 and len(sig) > 10 else 0.5
-    else: n_pre = 0.5
-    cleanness = n_snr * 0.5 + n_pre * 0.5
-    # Onset quality
     oe = librosa.onset.onset_strength(y=y, sr=sr)
-    sharpness = float(np.max(oe)/(np.mean(oe)+1e-8)) if len(oe) > 1 else 1.0
-    onset_q = float(np.clip((sharpness - 1.0) / 5.0, 0, 1))
-    total = (completeness * 0.30 + cleanness * 0.40 + onset_q * 0.20 + 0.5 * 0.10) * 100
-    return {'total': float(total), 'completeness': float(completeness),
-            'cleanness': float(cleanness), 'onset_quality': float(onset_q)}
-def select_best(clusters: list):
     print(f"[Stage 5] Selecting best representatives...")
     for c in clusters:
-        if c.count <= 1: c.best_hit_idx = 0; continue
-        scores = [sample_quality_score(h.audio, h.sr, c.label.rsplit('_',1)[0])['total']
-                  for h in c.hits]
         c.best_hit_idx = int(np.argmax(scores))
 # ─── Stage 6: Synthesis ──────────────────────────────────────────────────────
-def synthesize_from_cluster(cluster: Cluster) -> Optional[np.ndarray]:
-    if cluster.count < 2: return None
     tl = int(np.median([len(h.audio) for h in cluster.hits]))
     aligned, weights = [], []
     pp_target = None
     for i, h in enumerate(cluster.hits):
-        a = h.audio.copy()
-        pp = np.argmax(np.abs(a))
         if pp_target is None: pp_target = pp
-        shift = pp_target - pp
-        if shift > 0: a = np.pad(a, (shift, 0))
-        elif shift < 0: a = a[-shift:]
-        a = a[:tl] if len(a) >= tl else np.pad(a, (0, tl - len(a)))
         pk = np.abs(a).max()
-        if pk > 0: a = a / pk
-        aligned.append(a)
-        weights.append(2.0 if i == cluster.best_hit_idx else 1.0)
-    aligned = np.array(aligned)
-    w = np.array(weights); w /= w.sum()
-    synth = np.average(aligned, axis=0, weights=w)
-    pk = np.abs(synth).max()
-    return (synth * 0.95 / pk).astype(np.float32) if pk > 0 else synth.astype(np.float32)
 # ─── Stage 7: MIDI + rendering ───────────────────────────────────────────────
@@ -476,88 +373,80 @@ def synthesize_from_cluster(cluster: Cluster) -> Optional[np.ndarray]:
 def build_midi(clusters, bpm=120.0):
     import pretty_midi
     pm = pretty_midi.PrettyMIDI(initial_tempo=bpm)
-    for i, c in enumerate(clusters): c.midi_note = min(36 + i, 127)
     inst = pretty_midi.Instrument(program=0, is_drum=True, name='Extracted Samples')
     pm.instruments.append(inst)
     for c in clusters:
         for h in c.hits:
-            vel = max(1, min(127, int(h.rms_energy / 0.3 * 127)))
-            inst.notes.append(pretty_midi.Note(velocity=vel, pitch=c.midi_note,
-                                                start=h.onset_time,
-                                                end=h.onset_time + max(h.duration, 0.05)))
-    inst.notes.sort(key=lambda n: n.start)
-    return pm
 def export_midi(clusters, output_path, bpm=120.0):
-    pm = build_midi(clusters, bpm)
-    pm.write(output_path)
-    print(f"  ✓ MIDI: {output_path} ({len(pm.instruments[0].notes)} notes)")
-    return pm
 def detect_bpm(y, sr):
-    cache_key = ("bpm", _audio_hash(y), sr)
-    cached = cache_get(cache_key)
-    if cached is not None:
-        print(f"  Using cached BPM: {cached}")
-        return cached
-    onset_env = librosa.onset.onset_strength(y=y, sr=sr, aggregate=np.median)
-    bpm = float(librosa.feature.tempo(onset_envelope=onset_env, sr=sr).item())
-    _, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr, units='time')
-    if len(beats) > 2:
-        ibi_bpm = 60.0 / float(np.median(np.diff(beats)))
-        for c in [bpm, ibi_bpm]:
-            if 70 <= c <= 200: bpm = c; break
         else:
-            if bpm < 70: bpm *= 2
-            elif bpm > 200: bpm /= 2
-    return cache_set(cache_key, round(bpm, 1))
 def render_midi_with_samples(clusters, sr=44100):
-    max_end = max((h.onset_time + h.duration for c in clusters for h in c.hits), default=1.0)
-    buf = np.zeros(int((max_end + 1.0) * sr), dtype=np.float64)
     for c in clusters:
-        sample = c.best_hit.audio.astype(np.float64)
-        ref_e = c.best_hit.rms_energy if c.best_hit.rms_energy > 0 else 0.1
         for h in c.hits:
-            vs = min(2.0, h.rms_energy / (ref_e + 1e-8)) ** 0.5
-            s = int(h.onset_time * sr); e = s + len(sample)
-            if e > len(buf): buf = np.concatenate([buf, np.zeros(e - len(buf))])
-            buf[s:e] += sample * vs
-    pk = np.abs(buf).max()
-    return (buf / pk * 0.9).astype(np.float32) if pk > 1e-8 else buf.astype(np.float32)
 def build_sample_map(clusters):
-    return {c.midi_note: {'label': c.label, 'count': c.count,
-            'duration_ms': int(c.best_hit.duration * 1000)} for c in clusters}
 def build_archive(clusters, bpm, sr, midi_path=None, rendered_audio=None):
     import zipfile, tempfile, io
-    zip_path = tempfile.mktemp(suffix='.zip')
-    index = {'bpm': round(bpm, 1), 'sample_rate': sr,
-             'total_clusters': len(clusters),
-             'total_hits': sum(c.count for c in clusters), 'samples': {}}
-    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_STORED) as zf:
         for c in clusters:
-            best = c.best_hit; fname = f"samples/{c.label}.wav"
-            buf = io.BytesIO(); sf.write(buf, best.audio, sr, format='WAV', subtype='PCM_24')
-            zf.writestr(fname, buf.getvalue())
-            onset_times = sorted([h.onset_time for h in c.hits])
-            index['samples'][c.label] = {
-                'file': fname, 'classification': c.label.rsplit('_', 1)[0],
-                'midi_note': c.midi_note, 'occurrences': c.count,
-                'onset_times_sec': [round(t, 4) for t in onset_times],
-                'duration_sec': round(best.duration, 4),
-                'rms_energy': round(best.rms_energy, 6),
-                'spectral_centroid_hz': round(best.spectral_centroid, 1),
             }
             if c.synthesized is not None:
-                sf2 = f"samples/{c.label}__synthesized.wav"; b2 = io.BytesIO()
-                sf.write(b2, c.synthesized, sr, format='WAV', subtype='PCM_24')
-                zf.writestr(sf2, b2.getvalue())
-                index['samples'][c.label]['synthesized_file'] = sf2
-        zf.writestr('index.json', json.dumps(index, indent=2))
-        if midi_path and os.path.exists(midi_path): zf.write(midi_path, 'reconstruction.mid')
         if rendered_audio is not None:
-            rb = io.BytesIO(); sf.write(rb, rendered_audio, sr, format='WAV', subtype='PCM_16')
-            zf.writestr('rendered_reconstruction.wav', rb.getvalue())
     return zip_path

 #!/usr/bin/env python3
 """
+Sample Extractor v6 — Tested on real hardstyle tracks.
+Fixes from v5:
+  - NCC compare window auto-scales to median hit length (no more zero-pad inflation)
+  - Target range uses n_clusters directly when binary search hits a cliff
+  - Better defaults for real music (delta=0.12, energy=-35, min_gap=0.03)
+  - Caching for stem separation, BPM, NCC distance matrix
 """
+import argparse, json, os, sys, warnings, hashlib
 from collections import defaultdict
 from dataclasses import dataclass, field
 from pathlib import Path
     def count(self) -> int: return len(self.hits)
+DEMUCS_MODELS = ["htdemucs", "htdemucs_ft", "htdemucs_6s", "mdx", "mdx_extra", "mdx_extra_q"]
 DEMUCS_STEMS = {
+    "htdemucs": ["drums","bass","other","vocals"], "htdemucs_ft": ["drums","bass","other","vocals"],
+    "htdemucs_6s": ["drums","bass","other","vocals","guitar","piano"],
+    "mdx": ["drums","bass","other","vocals"], "mdx_extra": ["drums","bass","other","vocals"],
+    "mdx_extra_q": ["drums","bass","other","vocals"],
 }
+# ─── Caching ──────────────────────────────────────────────────────────────────
+_cache = {}
+def _audio_hash(audio: np.ndarray) -> str:
+    return hashlib.md5(audio[:4000].tobytes()).hexdigest()
+def cache_get(key): return _cache.get(key)
+def cache_set(key, value): _cache[key] = value; return value
+def cache_clear(): _cache.clear()
 # ─── Stage 1: Stem separation ────────────────────────────────────────────────
 def extract_stem(audio_path: str, stem: str = "drums", device: str = "cpu",
                  model_name: str = "htdemucs_ft", shifts: int = 1,
                  overlap: float = 0.25) -> tuple:
     if stem == "all":
         y, sr = librosa.load(audio_path, sr=44100, mono=True)
         return y.astype(np.float32), sr
     with open(audio_path, 'rb') as f:
+        file_hash = hashlib.md5(f.read(200000)).hexdigest()
+    ck = ("stem", file_hash, stem, model_name, shifts, overlap)
+    cached = cache_get(ck)
     if cached is not None:
+        print(f"[Stage 1] Using cached {stem} stem"); return cached
     from demucs.pretrained import get_model
     from demucs.apply import apply_model
+    print(f"[Stage 1] Extracting '{stem}' with {model_name}...")
+    model = get_model(model_name); model.eval().to(device)
     sr = model.samplerate
     if stem not in model.sources:
+        raise ValueError(f"'{stem}' not in {model.sources}")
     audio_np, _ = librosa.load(audio_path, sr=sr, mono=False)
+    if audio_np.ndim == 1: audio_np = np.stack([audio_np, audio_np])
+    elif audio_np.shape[0] > 2: audio_np = audio_np[:2]
+    elif audio_np.shape[0] == 1: audio_np = np.concatenate([audio_np, audio_np], axis=0)
     wav = torch.from_numpy(audio_np).float().unsqueeze(0).to(device)
     with torch.no_grad():
+        sources = apply_model(model, wav, device=device, shifts=shifts, split=True, overlap=overlap)
+    result = sources[0, model.sources.index(stem)].mean(dim=0).cpu().numpy()
     print(f"  ✓ {stem}: {len(result)/sr:.1f}s")
+    return cache_set(ck, (result.astype(np.float32), sr))
 # ─── Stage 2: Onset detection ────────────────────────────────────────────────
 def detect_onsets(y: np.ndarray, sr: int, pre_pad: float = 0.005,
                   min_dur: float = 0.02, max_dur: float = 1.5,
+                  min_gap: float = 0.03, energy_threshold_db: float = -35.0,
                   mode: str = "auto", backtrack: bool = True,
+                  onset_delta: float = 0.12) -> list:
+    print(f"[Stage 2] Detecting onsets (mode={mode}, delta={onset_delta}, energy≥{energy_threshold_db}dB)...")
     if mode == "percussive":
         onset_env = librosa.onset.onset_strength(y=y, sr=sr, aggregate=np.median, fmax=8000)
     elif mode == "harmonic":
         onset_env = librosa.onset.onset_strength(y=y_harm, sr=sr, fmax=8000, lag=2, max_size=3)
     elif mode == "broadband":
         onset_env = librosa.onset.onset_strength(y=y, sr=sr)
+    else:
         y_harm, y_perc = librosa.effects.hpss(y)
         envs = [
             librosa.onset.onset_strength(y=y, sr=sr, fmin=20, fmax=250, aggregate=np.median),
             librosa.onset.onset_strength(y=y, sr=sr, fmin=250, fmax=4000, aggregate=np.median),
+            librosa.onset.onset_strength(y=y, sr=sr, fmin=4000, fmax=min(sr//2,20000), aggregate=np.median),
             librosa.onset.onset_strength(y=y_harm, sr=sr, lag=2),
         ]
+        def _n(x): m=x.max(); return x/m if m>0 else x
         onset_env = np.maximum.reduce([_n(e) for e in envs])
     wait = max(1, int(min_gap * sr / 512))
+    frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, wait=wait,
         pre_avg=3, post_avg=3, pre_max=3, post_max=5,
         delta=onset_delta, backtrack=backtrack, units='frames')
     times = librosa.frames_to_time(frames, sr=sr)
     hits = []
     for i, t in enumerate(times):
         s = max(0, int((t - pre_pad) * sr))
+        e = min(int(times[i+1]*sr) if i+1<len(times) else len(y), s+int(max_dur*sr))
         seg = y[s:e]
+        if len(seg) < int(min_dur*sr): continue
         rms = np.sqrt(np.mean(seg**2))
         if rms < threshold: continue
+        fl = min(int(0.005*sr), len(seg)//4)
+        if fl > 0: seg = seg.copy(); seg[-fl:] *= np.linspace(1, 0, fl)
         sc = float(librosa.feature.spectral_centroid(y=seg, sr=sr).mean())
         hits.append(Hit(audio=seg, sr=sr, onset_time=t, duration=len(seg)/sr,
                         index=len(hits), rms_energy=float(rms), spectral_centroid=sc))
 # ─── Stage 3: Classification ─────────────────────────────────────────────────
 LABEL_RULES = [
+    ("kick",         lambda lr,mr,hr,c,zcr,d: lr>0.5 and c<800),
+    ("hihat_closed", lambda lr,mr,hr,c,zcr,d: hr>0.35 and c>4000 and d<0.15),
+    ("hihat_open",   lambda lr,mr,hr,c,zcr,d: hr>0.35 and c>4000 and d>=0.15),
+    ("cymbal",       lambda lr,mr,hr,c,zcr,d: hr>0.25 and c>3000),
+    ("snare",        lambda lr,mr,hr,c,zcr,d: mr>0.4 and zcr>0.1 and c>1000),
+    ("tom",          lambda lr,mr,hr,c,zcr,d: lr>0.3 and mr>0.3 and c<1500),
+    ("bass",         lambda lr,mr,hr,c,zcr,d: lr>0.6 and c<400 and d>0.2),
+    ("vocal",        lambda lr,mr,hr,c,zcr,d: mr>0.5 and 500<c<3000 and zcr<0.15),
+    ("bright",       lambda lr,mr,hr,c,zcr,d: c>2500),
+    ("mid",          lambda lr,mr,hr,c,zcr,d: c>800),
 ]
 def classify_hit(hit: Hit) -> str:
     y, sr = hit.audio, hit.sr
     D = np.abs(librosa.stft(y, n_fft=2048))
     freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
+    le = np.sum(D[(freqs>=20)&(freqs<200)]**2)
+    me = np.sum(D[(freqs>=200)&(freqs<4000)]**2)
+    he = np.sum(D[(freqs>=4000)]**2)
+    total = le+me+he+1e-10; lr,mr,hr = le/total,me/total,he/total
     zcr = float(librosa.feature.zero_crossing_rate(y=y).mean())
     for name, fn in LABEL_RULES:
+        if fn(lr,mr,hr,hit.spectral_centroid,zcr,hit.duration): return name
     return "other"
 def classify_hits(hits: list) -> list:
     print(f"[Stage 3] Classifying {len(hits)} hits...")
+    for h in hits: h.label = classify_hit(h)
     counts = defaultdict(int)
     for h in hits: counts[h.label] += 1
+    for l, c in sorted(counts.items(), key=lambda x: -x[1]): print(f"    {l}: {c}")
     return hits
 # ─── Stage 4: NCC-based clustering ───────────────────────────────────────────
 def ncc_max(a: np.ndarray, b: np.ndarray) -> float:
+    """NCC peak. Amplitude-invariant. Compares the shorter length only."""
+    # Use the shorter clip's length — no zero-padding inflation
+    n = min(len(a), len(b))
+    a, b = a[:n].copy(), b[:n].copy()
+    a -= a.mean(); b -= b.mean()
+    norm = np.sqrt(np.dot(a,a) * np.dot(b,b))
     if norm < 1e-10: return 0.0
+    cc = fftconvolve(a, b[::-1], mode='full')
     return float(np.max(np.abs(cc))) / norm
 def build_ncc_distance_matrix(hits: list, max_compare_samples: int = 8820) -> np.ndarray:
     """Return the symmetric pairwise NCC distance matrix for ``hits``.

     The distance between two hits is ``max(0, 1 - ncc_max(a, b))``, computed
     on the first ``max_compare_samples`` samples of each hit's audio. The
     diagonal is left at zero. The compare window is taken as given; this
     function does not scale it.

     The result is looked up and stored through ``cache_get`` / ``cache_set``
     under a key built from the per-hit audio hashes and the window size.
     Whatever ``cache_set`` returns is returned to the caller (presumably
     the stored matrix; confirm against the cache helper).
     """
     cache_key = ("ncc_dist", tuple(_audio_hash(h.audio) for h in hits), max_compare_samples)
     stored = cache_get(cache_key)
     if stored is not None:
         print(f"  Using cached NCC distance matrix")
         return stored

     count = len(hits)
     dist = np.zeros((count, count), dtype=np.float32)
     for row in range(count):
         ref_clip = hits[row].audio[:max_compare_samples]
         for col in range(row + 1, count):
             other_clip = hits[col].audio[:max_compare_samples]
             # Clamp at zero: the NCC peak can exceed 1 by rounding error.
             value = max(0.0, 1.0 - ncc_max(ref_clip, other_clip))
             dist[row, col] = value
             dist[col, row] = value
     return cache_set(cache_key, dist)
+def _labels_to_clusters(labels, hits):
     cluster_map = defaultdict(list)
+    for i, lab in enumerate(labels): cluster_map[lab].append(i)
     clusters = []
     for _, indices in sorted(cluster_map.items()):
+        votes = defaultdict(int)
+        for idx in indices: votes[hits[idx].label] += 1
+        majority = max(votes, key=votes.get)
+        existing = sum(1 for c in clusters if c.label.rsplit('_',1)[0] == majority)
+        clusters.append(Cluster(cluster_id=len(clusters), label=f"{majority}_{existing}",
+                                 hits=[hits[i] for i in indices]))
     clusters.sort(key=lambda c: c.count, reverse=True)
+    for i, c in enumerate(clusters): c.cluster_id = i
     return clusters
 def cluster_hits(hits: list, ncc_threshold: float = 0.80,
                  max_compare_ms: float = 0,
                  target_min: int = 0, target_max: int = 0,
                  linkage: str = 'average') -> list:
     """Cluster hits by agglomerative clustering on pairwise NCC distance.

     Args:
         hits: Hits to cluster; each needs ``audio``, ``sr`` and ``label``.
         ncc_threshold: Used only without a target range. Hits are merged
             while their linkage distance stays below ``1 - ncc_threshold``.
         max_compare_ms: Length of the audio prefix compared per hit, in
             milliseconds. ``0`` (or less) means auto: the median hit
             length, but at least 30 ms.
         target_min: Lower bound of the wanted cluster count.
         target_max: Upper bound of the wanted cluster count. The range is
             used only when both bounds are > 0 and ``target_max >=
             target_min``; both are clamped to the number of hits.
         linkage: Linkage criterion passed to ``AgglomerativeClustering``.

     Returns:
         Clusters built by ``_labels_to_clusters``. An empty list for empty
         input, and a single one-hit cluster for a single hit.
     """
     if not hits:
         return []
     N = len(hits)
     sr = hits[0].sr
     if N == 1:
         return [Cluster(cluster_id=0, label=f"{hits[0].label}_0", hits=[hits[0]])]

     # Imported after the trivial cases so they do not need scikit-learn.
     from sklearn.cluster import AgglomerativeClustering

     # Auto-scale the compare window to the median hit length.
     if max_compare_ms <= 0:
         median_len = int(np.median([len(h.audio) for h in hits]))
         max_samples = max(int(0.03 * sr), median_len)  # at least 30ms
     else:
         max_samples = int(max_compare_ms / 1000.0 * sr)

     print(f"[Stage 4] NCC clustering ({N} hits, compare={max_samples/sr*1000:.0f}ms, linkage={linkage})...")
     print(f"  Computing {N*(N-1)//2} pairwise NCC distances...")
     D = build_ncc_distance_matrix(hits, max_compare_samples=max_samples)

     # Decide on target mode from the caller's raw values, then clamp.
     use_target = target_min > 0 and target_max > 0 and target_max >= target_min
     target_min = max(1, min(target_min, N))
     target_max = max(target_min, min(target_max, N))

     if use_target:
         print(f"  Target: {target_min}–{target_max} clusters")
         centre = (target_min + target_max) / 2

         # Strategy 1: binary search on the distance threshold. A larger
         # threshold merges more, so too many clusters raises the lower bound.
         lo, hi = 0.001, 1.0
         best_labels, best_n, best_dist = None, -1, 0.5
         for _ in range(30):
             mid = (lo + hi) / 2
             agg = AgglomerativeClustering(n_clusters=None, distance_threshold=max(0.001, mid),
                                           metric='precomputed', linkage=linkage)
             labels = agg.fit_predict(D)
             n = len(set(labels))
             if target_min <= n <= target_max:
                 best_labels, best_n, best_dist = labels, n, mid
                 break
             if n > target_max:
                 lo = mid
             else:
                 hi = mid
             # Keep the attempt closest to the middle of the range.
             if best_labels is None or abs(n - centre) < abs(best_n - centre):
                 best_labels, best_n, best_dist = labels, n, mid

         # Strategy 2: if the search missed the range, ask for a count directly.
         used_fallback = False
         if best_n < target_min or best_n > target_max:
             target_mid = (target_min + target_max) // 2
             target_mid = min(target_mid, N - 1)
             print(f"  Binary search got {best_n}, falling back to n_clusters={target_mid}")
             try:
                 agg = AgglomerativeClustering(n_clusters=target_mid, metric='precomputed', linkage=linkage)
                 best_labels = agg.fit_predict(D)
                 best_n = target_mid
                 used_fallback = True
             except Exception as e:
                 # Deliberate best-effort: keep the closest search result.
                 print(f"  n_clusters fallback failed: {e}")

         labels = best_labels
         if used_fallback:
             # The searched threshold did not produce these labels, so do
             # not report it as if it had.
             print(f"  → {best_n} clusters (n_clusters fallback)")
         else:
             print(f"  → {best_n} clusters (dist_threshold={best_dist:.4f})")
     else:
         dist_threshold = max(0.001, 1.0 - ncc_threshold)
         print(f"  Fixed: NCC≥{ncc_threshold} (dist≤{dist_threshold:.3f})")
         agg = AgglomerativeClustering(n_clusters=None, distance_threshold=dist_threshold,
                                       metric='precomputed', linkage=linkage)
         labels = agg.fit_predict(D)

     print(f"  ✓ {len(set(labels))} clusters")
     clusters = _labels_to_clusters(labels, hits)
     for c in clusters:
         print(f"    {c.label}: {c.count} hits")
     return clusters
+# ─── Stage 5: Quality scoring ────────────────────────────────────────────────
def sample_quality_score(y, sr, label="other"):
    """Score one extracted sample from 0 to 100 and return the breakdown.

    Three sub-scores, each clipped to [0, 1], are combined:

    * completeness: how far the RMS envelope has decayed by the end of the
      clip, and how well the log-envelope after the peak fits a falling
      straight line.
    * cleanness: the 99th-vs-10th percentile power ratio, and the level of
      the 20 ms before the first detected onset relative to the 100 ms
      after it.
    * onset_quality: peak-to-mean ratio of the onset-strength envelope.

    ``label`` is accepted but not used in the computation.

    Returns:
        Dict with float values under ``'total'``, ``'completeness'``,
        ``'cleanness'`` and ``'onset_quality'``.
    """
    import scipy.stats

    # Completeness.
    envelope = librosa.feature.rms(y=y, frame_length=512, hop_length=128)[0]
    if len(envelope) < 10:
        # Too few frames to judge the decay; use neutral defaults.
        c1, c2 = 0.5, 0.0
    else:
        peak_frame = np.argmax(envelope)
        decay = envelope[peak_frame:]
        tail_frames = max(3, len(decay) // 5)
        tail_r = np.mean(decay[-tail_frames:]) / (envelope[peak_frame] + 1e-8)
        c1 = max(0, 1.0 - tail_r * 5)
        if len(decay) < 5:
            c2 = 0.0
        else:
            slope, _, r, _, _ = scipy.stats.linregress(np.arange(len(decay)), np.log(decay + 1e-8))
            if slope < 0:
                c2 = max(0, r**2)
            else:
                # A rising envelope still earns a small share of the fit.
                c2 = r**2 * 0.3
    completeness = c1*0.6 + c2*0.4

    # Cleanness.
    power = y**2
    snr = 10*np.log10(np.percentile(power, 99) / (np.percentile(power, 10) + 1e-12))
    n_snr = np.clip((snr - 10) / 40, 0, 1)

    onsets = librosa.onset.onset_detect(y=y, sr=sr, units='samples', backtrack=True)
    n_pre = 0.5
    if len(onsets) > 0:
        first = int(onsets[0])
        pre = y[max(0, first - int(sr*.02)):first]
        sig = y[first:first + int(sr*.1)]
        if len(pre) > 10 and len(sig) > 10:
            level = np.mean(pre**2 + 1e-12) / np.mean(sig**2 + 1e-12)
            n_pre = np.clip((-10*np.log10(level) - 5) / 30, 0, 1)
    cleanness = n_snr*0.5 + n_pre*0.5

    # Onset sharpness.
    oe = librosa.onset.onset_strength(y=y, sr=sr)
    if len(oe) > 1:
        sharpness = float(np.max(oe) / (np.mean(oe) + 1e-8))
    else:
        sharpness = 1.0
    onset_q = float(np.clip((sharpness - 1.0) / 5.0, 0, 1))

    # The last term is a constant 0.5 weighted at 10%.
    total = (completeness*0.30 + cleanness*0.40 + onset_q*0.20 + 0.5*0.10) * 100
    return {
        'total': float(total),
        'completeness': float(completeness),
        'cleanness': float(cleanness),
        'onset_quality': float(onset_q),
    }
def select_best(clusters):
    """Set ``best_hit_idx`` on every cluster to its highest-scoring hit.

    Clusters with at most one hit get index 0 without scoring. Otherwise
    every hit is scored with ``sample_quality_score`` (passing the cluster
    label without its numeric suffix) and the index of the highest
    ``'total'`` is stored. Clusters are modified in place; nothing is
    returned.
    """
    print(f"[Stage 5] Selecting best representatives...")
    for cluster in clusters:
        if cluster.count <= 1:
            cluster.best_hit_idx = 0
            continue
        kind = cluster.label.rsplit('_', 1)[0]
        totals = []
        for hit in cluster.hits:
            totals.append(sample_quality_score(hit.audio, hit.sr, kind)['total'])
        cluster.best_hit_idx = int(np.argmax(totals))
 # ─── Stage 6: Synthesis ──────────────────────────────────────────────────────
def synthesize_from_cluster(cluster):
    """Blend all hits of a cluster into one peak-aligned average sample.

    Every hit is shifted so its absolute peak lands where the first hit's
    peak is, cropped or zero-padded to the median hit length, and
    peak-normalized. The hits are then averaged, with the cluster's best
    hit weighted twice as much as the others, and the result is normalized
    to a peak of 0.95.

    Returns:
        A float32 array, or ``None`` if the cluster has fewer than two hits.
    """
    if cluster.count < 2:
        return None

    target_len = int(np.median([len(h.audio) for h in cluster.hits]))
    stack = []
    weights = []
    ref_peak = None
    for idx, hit in enumerate(cluster.hits):
        wave = hit.audio.copy()
        peak_pos = np.argmax(np.abs(wave))
        if ref_peak is None:
            ref_peak = peak_pos
        offset = ref_peak - peak_pos
        if offset > 0:
            # Peak is early: delay the clip with leading zeros.
            wave = np.pad(wave, (offset, 0))
        elif offset < 0:
            # Peak is late: drop samples from the front.
            wave = wave[-offset:]
        if len(wave) >= target_len:
            wave = wave[:target_len]
        else:
            wave = np.pad(wave, (0, target_len - len(wave)))
        level = np.abs(wave).max()
        if level > 0:
            wave = wave / level
        stack.append(wave)
        weights.append(2.0 if idx == cluster.best_hit_idx else 1.0)

    stack = np.array(stack)
    w = np.array(weights)
    w /= w.sum()
    blend = np.average(stack, axis=0, weights=w)
    top = np.abs(blend).max()
    if top > 0:
        return (blend * 0.95 / top).astype(np.float32)
    return blend.astype(np.float32)
 # ─── Stage 7: MIDI + rendering ───────────────────────────────────────────────
 def build_midi(clusters, bpm=120.0):
     """Build a one-instrument drum MIDI object from the clustered hits.

     Clusters receive consecutive MIDI notes starting at 36, capped at 127,
     written to their ``midi_note`` attribute. Every hit becomes one note
     whose velocity scales with ``rms_energy`` (0.3 maps to 127, clamped to
     1..127) and whose length is the hit duration, but at least 50 ms.

     Returns:
         A ``pretty_midi.PrettyMIDI`` object with its notes sorted by start
         time.
     """
     import pretty_midi

     midi = pretty_midi.PrettyMIDI(initial_tempo=bpm)
     for slot, cluster in enumerate(clusters):
         cluster.midi_note = min(36 + slot, 127)

     drums = pretty_midi.Instrument(program=0, is_drum=True, name='Extracted Samples')
     midi.instruments.append(drums)
     for cluster in clusters:
         for hit in cluster.hits:
             scaled = int(hit.rms_energy / 0.3 * 127)
             velocity = max(1, min(127, scaled))
             note_end = hit.onset_time + max(hit.duration, 0.05)
             drums.notes.append(
                 pretty_midi.Note(velocity=velocity, pitch=cluster.midi_note,
                                  start=hit.onset_time, end=note_end)
             )
     drums.notes.sort(key=lambda note: note.start)
     return midi
 def export_midi(clusters, output_path, bpm=120.0):
     """Build the MIDI reconstruction, write it to ``output_path``, return it.

     Delegates to ``build_midi`` and prints the path together with the
     number of notes in the first instrument.
     """
     midi = build_midi(clusters, bpm)
     midi.write(output_path)
     note_count = len(midi.instruments[0].notes)
     print(f"  ✓ MIDI: {output_path} ({note_count} notes)")
     return midi
 def detect_bpm(y, sr):
     """Estimate the tempo of ``y`` in BPM, rounded to one decimal (cached).

     Two estimates are made from the same onset envelope: librosa's global
     tempo and the median inter-beat interval of the tracked beats. When
     more than two beats are found, the first estimate inside 70-200 BPM
     wins (global tempo is tried first). If neither is in range, the global
     tempo is doubled when below 70 or halved when above 200. With two or
     fewer beats the global tempo is used unchanged.

     The result goes through ``cache_get`` / ``cache_set`` keyed on the
     audio hash and sample rate.
     """
     cache_key = ("bpm", _audio_hash(y), sr)
     stored = cache_get(cache_key)
     if stored is not None:
         print(f"  Cached BPM: {stored}")
         return stored

     envelope = librosa.onset.onset_strength(y=y, sr=sr, aggregate=np.median)
     bpm = float(librosa.feature.tempo(onset_envelope=envelope, sr=sr).item())
     _, beat_times = librosa.beat.beat_track(onset_envelope=envelope, sr=sr, units='time')
     if len(beat_times) > 2:
         from_beats = 60.0 / float(np.median(np.diff(beat_times)))
         plausible = [estimate for estimate in (bpm, from_beats) if 70 <= estimate <= 200]
         if plausible:
             bpm = plausible[0]
         elif bpm < 70:
             bpm *= 2
         elif bpm > 200:
             bpm /= 2
     return cache_set(cache_key, round(bpm, 1))
 def render_midi_with_samples(clusters, sr=44100):
     """Render the hit timeline to audio using each cluster's best hit.

     At every hit's onset the cluster's best-hit audio is mixed in, scaled
     by the square root of the hit's RMS energy relative to the best hit's
     (the ratio is capped at 2.0). The buffer starts one second longer than
     the last hit end and grows if a sample runs past it. The mix is
     normalized to a peak of 0.9 unless it is effectively silent.

     Returns:
         The rendered mix as a float32 array.
     """
     last_end = max(
         (hit.onset_time + hit.duration for cluster in clusters for hit in cluster.hits),
         default=1.0,
     )
     mix = np.zeros(int((last_end + 1.0) * sr), dtype=np.float64)

     for cluster in clusters:
         reference = cluster.best_hit
         one_shot = reference.audio.astype(np.float64)
         # Fall back to 0.1 when the reference hit reports no energy.
         ref_energy = reference.rms_energy if reference.rms_energy > 0 else 0.1
         for hit in cluster.hits:
             gain = min(2.0, hit.rms_energy / (ref_energy + 1e-8)) ** 0.5
             start = int(hit.onset_time * sr)
             stop = start + len(one_shot)
             if stop > len(mix):
                 mix = np.concatenate([mix, np.zeros(stop - len(mix))])
             mix[start:stop] += one_shot * gain

     peak = np.abs(mix).max()
     if peak > 1e-8:
         return (mix / peak * 0.9).astype(np.float32)
     return mix.astype(np.float32)
 def build_sample_map(clusters):
     """Map each cluster's MIDI note to a short summary of the cluster.

     Returns:
         Dict keyed by ``midi_note``. Each value holds the cluster
         ``'label'``, its hit ``'count'`` and the best hit's duration
         truncated to whole milliseconds under ``'duration_ms'``.
     """
     mapping = {}
     for cluster in clusters:
         mapping[cluster.midi_note] = {
             'label': cluster.label,
             'count': cluster.count,
             'duration_ms': int(cluster.best_hit.duration * 1000),
         }
     return mapping
 def build_archive(clusters, bpm, sr, midi_path=None, rendered_audio=None):
     """Write all extraction results into a new ZIP file and return its path.

     Archive contents:

     * ``samples/<label>.wav``: each cluster's best hit (24-bit PCM).
     * ``samples/<label>__synthesized.wav``: the synthesized sample, only
       for clusters whose ``synthesized`` is not ``None``.
     * ``index.json``: BPM, sample rate, totals and per-sample metadata.
     * ``reconstruction.mid``: copied from ``midi_path`` if that file exists.
     * ``rendered_reconstruction.wav``: ``rendered_audio`` as 16-bit PCM,
       if given.

     Args:
         clusters: Clusters to export; ``midi_note`` must already be set.
         bpm: Tempo recorded in the index, rounded to one decimal.
         sr: Sample rate used for every WAV that is written.
         midi_path: Optional path to an existing MIDI file to include.
         rendered_audio: Optional rendered mix to include.

     Returns:
         Path of the ZIP file, created in the system temp directory. The
         caller is responsible for deleting it.
     """
     import io
     import tempfile
     import zipfile

     def wav_bytes(audio, subtype):
         # Encode in memory so nothing but the archive touches the disk.
         stream = io.BytesIO()
         sf.write(stream, audio, sr, format='WAV', subtype=subtype)
         return stream.getvalue()

     # mkstemp creates the file atomically under a unique name. The
     # deprecated mktemp only returns a name, which another process could
     # claim before the archive is opened.
     fd, zip_path = tempfile.mkstemp(suffix='.zip')
     os.close(fd)

     index = {
         'bpm': round(bpm, 1),
         'sample_rate': sr,
         'total_clusters': len(clusters),
         'total_hits': sum(c.count for c in clusters),
         'samples': {},
     }
     try:
         with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_STORED) as zf:
             for c in clusters:
                 best = c.best_hit
                 fname = f"samples/{c.label}.wav"
                 zf.writestr(fname, wav_bytes(best.audio, 'PCM_24'))
                 onset_times = sorted(h.onset_time for h in c.hits)
                 entry = {
                     'file': fname,
                     'classification': c.label.rsplit('_', 1)[0],
                     'midi_note': c.midi_note,
                     'occurrences': c.count,
                     'onset_times_sec': [round(t, 4) for t in onset_times],
                     'duration_sec': round(best.duration, 4),
                     'rms_energy': round(best.rms_energy, 6),
                     'spectral_centroid_hz': round(best.spectral_centroid, 1),
                 }
                 if c.synthesized is not None:
                     synth_name = f"samples/{c.label}__synthesized.wav"
                     zf.writestr(synth_name, wav_bytes(c.synthesized, 'PCM_24'))
                     entry['synthesized_file'] = synth_name
                 index['samples'][c.label] = entry
             zf.writestr('index.json', json.dumps(index, indent=2))
             if midi_path and os.path.exists(midi_path):
                 zf.write(midi_path, 'reconstruction.mid')
             if rendered_audio is not None:
                 zf.writestr('rendered_reconstruction.wav', wav_bytes(rendered_audio, 'PCM_16'))
     except Exception:
         # Do not leave a half-written archive behind in the temp directory.
         if os.path.exists(zip_path):
             os.remove(zip_path)
         raise
     return zip_path