Spaces:

allyboyboy
/

mroctopus

Sleeping

Ewan Claude Opus 4.6 committed on Feb 23

Commit

f474670

1 Parent(s): 1f67352

Merge repeated same-pitch notes into sustains, increase drum detection sensitivity

Piano: New merge_repeated_notes() step (7c) checks onset energy at re-attack
points of consecutive same-pitch notes. If re-attack energy is below 1.2x
the median onset strength, the notes are merged into one sustained note
instead of stuttering re-strikes. Preserves real repeated notes with genuine
attack energy.

Drums: Significantly increased detection sensitivity:
- Hi-hat band: delta 0.05->0.03, RMS threshold 0.003->0.001, wait 3->2
- Mid band: delta 0.06->0.04, RMS 0.005->0.003, wait 3->2
- Low band: delta 0.08->0.06
- Added full-band safety net pass (catches hits all sub-bands miss)
- Improved fallback classifier for full-band-only detections
- Merge window 25ms->30ms for better cross-band alignment

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (3) hide show

transcriber/drums.py +51 -20
transcriber/optimize.py +84 -0
transcriber/optimize_other.py +6 -0

transcriber/drums.py CHANGED Viewed

@@ -1,9 +1,11 @@
 """Drum transcription via multi-band onset detection + spectral classification.
 Uses sub-band filtering to detect onsets independently in low/mid/high
-frequency ranges, then merges and classifies based on which bands triggered.
-This handles simultaneous hits (kick+hihat, kick+snare+hihat) naturally
-since each drum occupies different frequency bands.
 Input: isolated Demucs drums stem (already separated from other instruments).
 Output: JSON with lane-based drum events.
@@ -21,7 +23,7 @@ from scipy.signal import butter, filtfilt
 LANES = ["crash", "ride", "hihat", "tom_high", "snare", "tom_low", "kick"]
 # Merge tolerance: onsets within this window across bands are the same hit
-MERGE_WINDOW = 0.025  # 25ms
 def _bandpass(y, low, high, sr, order=4):
@@ -48,7 +50,7 @@ def _highpass(y, low, sr, order=4):
 def _detect_band_onsets(y_band, sr, hop_length, delta=0.06, wait=3, rms_threshold=0.005):
     """Detect onsets in a filtered frequency band.
-    Returns list of (time, sample) tuples for onsets above the RMS threshold.
     """
     onset_env = librosa.onset.onset_strength(
         y=y_band, sr=sr, hop_length=hop_length,
@@ -102,7 +104,6 @@ def _classify_from_bands(low_hit, mid_hit, high_hit, y, sr, onset_sample):
     n_fft = min(4096, len(segment))
     if n_fft < 256:
         n_fft = 256
-    # Zero-pad if segment is shorter than n_fft
     if len(segment) < n_fft:
         segment = np.pad(segment, (0, n_fft - len(segment)))
     fft = np.abs(np.fft.rfft(segment, n=n_fft))
@@ -173,13 +174,11 @@ def _classify_from_bands(low_hit, mid_hit, high_hit, y, sr, onset_sample):
     elif has_low and has_mid and has_high:
         # All three bands → snare (full broadband) or complex hit
-        # Snare triggers all bands: low body, mid fundamental, high snare wires
         low_rms = low_hit[2] if low_hit else 0
         high_rms = high_hit[2] if high_hit else 0
         if mid_r > 0.15 and flatness > 0.03:
             results.append(("snare", velocity))
-            # If low is much stronger than expected for snare, also a kick
             if low_rms > 0.08 and sub_low_r > 0.25:
                 results.append(("kick", min(1.0, low_rms * 10)))
         else:
@@ -192,13 +191,22 @@ def _classify_from_bands(low_hit, mid_hit, high_hit, y, sr, onset_sample):
         if low_mid_r > 0.4:
             results.append(("tom_high", velocity))
         else:
-            results.append(("snare", velocity * 0.6))  # ghost note
     else:
-        # Fallback: use spectral features
-        if low_r > 0.5:
             results.append(("kick", velocity))
-        elif high_r > 0.3:
             results.append(("hihat", velocity))
         else:
             results.append(("snare", velocity))
@@ -210,7 +218,8 @@ def transcribe_drums(audio_path, output_path):
     """Transcribe a drums stem to a lane-based drum tab JSON.
     Uses multi-band onset detection: filters the signal into low/mid/high
-    bands, detects onsets independently in each, then merges and classifies
     based on which bands triggered at each time point.
     Args:
@@ -235,17 +244,32 @@ def transcribe_drums(audio_path, output_path):
     y_high = _highpass(y, 3000, sr)        # hi-hat, crash, ride
     # ── Step 2: Per-band onset detection ─────────────────────────────────
     print("  Drums: detecting per-band onsets...")
-    low_onsets = _detect_band_onsets(y_low, sr, hop_length, delta=0.08, rms_threshold=0.008)
-    mid_onsets = _detect_band_onsets(y_mid, sr, hop_length, delta=0.06, rms_threshold=0.005)
-    high_onsets = _detect_band_onsets(y_high, sr, hop_length, delta=0.05, rms_threshold=0.003)
-    print(f"    Low: {len(low_onsets)}, Mid: {len(mid_onsets)}, High: {len(high_onsets)}")
     # ── Step 3: Merge onsets across bands ────────────────────────────────
-    # Collect all onset times, then group within MERGE_WINDOW
     print("  Drums: merging cross-band onsets...")
     all_times = set()
-    for onsets in [low_onsets, mid_onsets, high_onsets]:
         for t, s, r in onsets:
             all_times.add(t)
@@ -262,6 +286,8 @@ def transcribe_drums(audio_path, output_path):
                 group = [t]
         merged_times.append(np.mean(group))
     # For each merged onset, find which bands triggered
     def find_band_hit(band_onsets, target_time):
         """Find the band onset closest to target_time within MERGE_WINDOW."""
@@ -292,8 +318,13 @@ def transcribe_drums(audio_path, output_path):
                 if onset_sample is None or s < onset_sample:
                     onset_sample = s
         if onset_sample is None:
-            continue
         hits = _classify_from_bands(low_hit, mid_hit, high_hit, y, sr, onset_sample)
         for lane, vel in hits:

 """Drum transcription via multi-band onset detection + spectral classification.
 Uses sub-band filtering to detect onsets independently in low/mid/high
+frequency ranges, plus a full-band pass as a safety net. Merges across
+bands and classifies based on which bands triggered.
+Handles simultaneous hits (kick+hihat, kick+snare+hihat) naturally since
+each drum occupies different frequency bands.
 Input: isolated Demucs drums stem (already separated from other instruments).
 Output: JSON with lane-based drum events.
 LANES = ["crash", "ride", "hihat", "tom_high", "snare", "tom_low", "kick"]
 # Merge tolerance: onsets within this window across bands are the same hit
+MERGE_WINDOW = 0.030  # 30ms
 def _bandpass(y, low, high, sr, order=4):
 def _detect_band_onsets(y_band, sr, hop_length, delta=0.06, wait=3, rms_threshold=0.005):
     """Detect onsets in a filtered frequency band.
+    Returns list of (time, sample, rms) tuples for onsets above the threshold.
     """
     onset_env = librosa.onset.onset_strength(
         y=y_band, sr=sr, hop_length=hop_length,
     n_fft = min(4096, len(segment))
     if n_fft < 256:
         n_fft = 256
     if len(segment) < n_fft:
         segment = np.pad(segment, (0, n_fft - len(segment)))
     fft = np.abs(np.fft.rfft(segment, n=n_fft))
     elif has_low and has_mid and has_high:
         # All three bands → snare (full broadband) or complex hit
         low_rms = low_hit[2] if low_hit else 0
         high_rms = high_hit[2] if high_hit else 0
         if mid_r > 0.15 and flatness > 0.03:
             results.append(("snare", velocity))
             if low_rms > 0.08 and sub_low_r > 0.25:
                 results.append(("kick", min(1.0, low_rms * 10)))
         else:
         if low_mid_r > 0.4:
             results.append(("tom_high", velocity))
         else:
+            results.append(("snare", velocity * 0.6))
     else:
+        # Fallback (only full-band detected, no sub-band): use spectral features
+        if low_r > 0.5 and centroid < 400:
+            results.append(("kick", velocity))
+        elif high_r > 0.35 and centroid > 4000:
+            if flatness > 0.15 and velocity > 0.4:
+                results.append(("crash", velocity))
+            else:
+                results.append(("hihat", velocity))
+        elif mid_r > 0.3 and flatness > 0.05:
+            results.append(("snare", velocity))
+        elif centroid < 500:
             results.append(("kick", velocity))
+        elif centroid > 3000:
             results.append(("hihat", velocity))
         else:
             results.append(("snare", velocity))
     """Transcribe a drums stem to a lane-based drum tab JSON.
     Uses multi-band onset detection: filters the signal into low/mid/high
+    bands, detects onsets independently in each, plus a full-band safety
+    net to catch any hits missed by sub-band detection. Merges and classifies
     based on which bands triggered at each time point.
     Args:
     y_high = _highpass(y, 3000, sr)        # hi-hat, crash, ride
     # ── Step 2: Per-band onset detection ─────────────────────────────────
+    # Sensitivity tuned per band:
+    #   - Low: moderate delta, kick/toms are loud and clear
+    #   - Mid: moderate delta, snare is usually prominent
+    #   - High: LOW delta + low RMS threshold — hi-hats are quiet but frequent
+    #   - Full: catches anything the sub-bands miss
     print("  Drums: detecting per-band onsets...")
+    low_onsets = _detect_band_onsets(
+        y_low, sr, hop_length, delta=0.06, wait=3, rms_threshold=0.005
+    )
+    mid_onsets = _detect_band_onsets(
+        y_mid, sr, hop_length, delta=0.04, wait=2, rms_threshold=0.003
+    )
+    high_onsets = _detect_band_onsets(
+        y_high, sr, hop_length, delta=0.03, wait=2, rms_threshold=0.001
+    )
+    # Full-band safety net — catches hits that sub-band filters miss
+    full_onsets = _detect_band_onsets(
+        y, sr, hop_length, delta=0.04, wait=2, rms_threshold=0.005
+    )
+    print(f"    Low: {len(low_onsets)}, Mid: {len(mid_onsets)}, "
+          f"High: {len(high_onsets)}, Full: {len(full_onsets)}")
     # ── Step 3: Merge onsets across bands ────────────────────────────────
     print("  Drums: merging cross-band onsets...")
     all_times = set()
+    for onsets in [low_onsets, mid_onsets, high_onsets, full_onsets]:
         for t, s, r in onsets:
             all_times.add(t)
                 group = [t]
         merged_times.append(np.mean(group))
+    print(f"    {len(merged_times)} merged onsets (from {len(all_times)} raw)")
     # For each merged onset, find which bands triggered
     def find_band_hit(band_onsets, target_time):
         """Find the band onset closest to target_time within MERGE_WINDOW."""
                 if onset_sample is None or s < onset_sample:
                     onset_sample = s
+        # If no sub-band hit, use the full-band onset
         if onset_sample is None:
+            full_hit = find_band_hit(full_onsets, onset_time)
+            if full_hit is not None:
+                onset_sample = full_hit[1]
+            else:
+                continue
         hits = _classify_from_bands(low_hit, mid_hit, high_hit, y, sr, onset_sample)
         for lane, vel in hits:

transcriber/optimize.py CHANGED Viewed

@@ -803,6 +803,85 @@ def remove_hand_outliers(midi_data, hand_split=60, gap_threshold=7):
     return midi_out, removed
 def consolidate_rhythm(midi_data, y, sr, hop_length=512, max_snap=0.04):
     """Consolidate note onsets onto a dominant rhythmic pattern.
@@ -1640,6 +1719,11 @@ def optimize(original_audio_path, midi_path, output_path=None):
     midi_data, rhythm_snapped, n_dominant = consolidate_rhythm(midi_data, y, sr, hop_length)
     print(f"  Snapped {rhythm_snapped} notes to {n_dominant} dominant subdivisions")
     # Step 8: Fix overlaps and enforce min duration (LAST — after all position changes)
     print("\nStep 8: Fixing overlaps and enforcing min duration...")
     midi_data, notes_trimmed, durations_enforced = fix_note_overlap(midi_data)

     return midi_out, removed
+def merge_repeated_notes(midi_data, y, sr, hop_length=512, min_gap=0.15):
+    """Merge consecutive same-pitch notes that lack a real re-attack.
+
+    Basic-pitch often fragments a single sustained note into multiple short
+    re-strikes. For every pair of consecutive same-pitch notes this step
+    looks at the onset-strength envelope of the audio at the start of the
+    second note. If the envelope there does not exceed 1.2x the median of
+    the positive envelope values, the second note is folded into the first
+    (the first note's end is extended) instead of being kept as a re-strike.
+
+    Args:
+        midi_data: MIDI container exposing ``instruments``, each with a
+            ``notes`` list whose items have ``pitch``, ``start`` and ``end``
+            (presumably a pretty_midi.PrettyMIDI -- confirm against caller).
+            It is deep-copied; the input object is not modified.
+        y: Audio signal handed to ``librosa.onset.onset_strength``.
+        sr: Sample rate of ``y``.
+        hop_length: Hop length used for the onset envelope and for converting
+            note start times to envelope frames.
+        min_gap: If the gap between notes is larger than this (seconds),
+            always keep separate -- the silence itself is musical.
+            Default 150ms.
+
+    Returns:
+        Tuple ``(midi_out, merged_count)``: the modified copy and the number
+        of notes that were absorbed into a preceding note. Each instrument's
+        ``notes`` list is left ordered by (pitch, start).
+    """
+    midi_out = copy.deepcopy(midi_data)
+    merged_count = 0
+
+    # Onset strength envelope used to verify re-attacks.
+    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
+    n_frames = len(onset_env)
+
+    # The reference level is the same for every note pair, so compute it once
+    # rather than re-masking and re-sorting the envelope inside the loop.
+    positive_env = onset_env[onset_env > 0]
+    median_strength = float(np.median(positive_env)) if positive_env.size else 0
+    onset_threshold = median_strength * 1.2
+
+    def _has_reattack(time_s):
+        """Return True if the envelope near ``time_s`` exceeds the threshold."""
+        frame = int(time_s * sr / hop_length)
+        # A start time outside the envelope gives no evidence of an attack,
+        # so it is treated as "no re-attack" (the note will be merged).
+        if not 0 <= frame < n_frames:
+            return False
+        # Peak over a 3-frame window (frame-1 .. frame+1), clipped to bounds.
+        lo = max(0, frame - 1)
+        hi = min(n_frames, frame + 2)
+        return float(np.max(onset_env[lo:hi])) > onset_threshold
+
+    for instrument in midi_out.instruments:
+        # Sort by pitch then start time so same-pitch notes are adjacent.
+        notes = sorted(instrument.notes, key=lambda n: (n.pitch, n.start))
+        kept = []
+        i = 0
+        while i < len(notes):
+            note = notes[i]
+            j = i + 1
+            # Absorb following same-pitch notes until a real gap or a real
+            # re-attack is found.
+            while j < len(notes) and notes[j].pitch == note.pitch:
+                next_note = notes[j]
+                # A real gap (silence) always separates the notes. The gap is
+                # measured from the current, possibly already extended, end.
+                if next_note.start - note.end > min_gap:
+                    break
+                # Genuine attack energy at the re-strike: stop merging.
+                if _has_reattack(next_note.start):
+                    break
+                # Merge: extend the current note to cover the next one.
+                note.end = max(note.end, next_note.end)
+                merged_count += 1
+                j += 1
+            kept.append(note)
+            # notes[i+1:j] were absorbed; resume at the first unmerged note.
+            i = j
+        instrument.notes = kept
+    return midi_out, merged_count
 def consolidate_rhythm(midi_data, y, sr, hop_length=512, max_snap=0.04):
     """Consolidate note onsets onto a dominant rhythmic pattern.
     midi_data, rhythm_snapped, n_dominant = consolidate_rhythm(midi_data, y, sr, hop_length)
     print(f"  Snapped {rhythm_snapped} notes to {n_dominant} dominant subdivisions")
+    # Step 7c: Merge repeated consecutive same-pitch notes without real re-attack
+    print("\nStep 7c: Merging repeated notes without re-attack energy...")
+    midi_data, notes_merged = merge_repeated_notes(midi_data, y, sr, hop_length)
+    print(f"  Merged {notes_merged} repeated notes into sustains")
     # Step 8: Fix overlaps and enforce min duration (LAST — after all position changes)
     print("\nStep 8: Fixing overlaps and enforcing min duration...")
     midi_data, notes_trimmed, durations_enforced = fix_note_overlap(midi_data)

transcriber/optimize_other.py CHANGED Viewed

@@ -34,6 +34,7 @@ from optimize import (
     remove_harmonic_ghosts,
     remove_hand_outliers,
     consolidate_rhythm,
 )
@@ -159,6 +160,11 @@ def optimize_other(original_audio_path, midi_path, output_path=None, mix_audio_p
     midi_data, rhythm_snapped, n_dominant = consolidate_rhythm(midi_data, y, sr, hop_length)
     print(f"  Snapped {rhythm_snapped} notes to {n_dominant} dominant subdivisions")
     # Step 8: Fix overlaps and enforce min duration
     print("\nStep 8: Fixing overlaps...")
     midi_data, notes_trimmed, durations_enforced = fix_note_overlap(midi_data)

     remove_harmonic_ghosts,
     remove_hand_outliers,
     consolidate_rhythm,
+    merge_repeated_notes,
 )
     midi_data, rhythm_snapped, n_dominant = consolidate_rhythm(midi_data, y, sr, hop_length)
     print(f"  Snapped {rhythm_snapped} notes to {n_dominant} dominant subdivisions")
+    # Step 7c: Merge repeated same-pitch notes without real re-attack
+    print("\nStep 7c: Merging repeated notes without re-attack energy...")
+    midi_data, notes_merged = merge_repeated_notes(midi_data, y, sr, hop_length)
+    print(f"  Merged {notes_merged} repeated notes into sustains")
     # Step 8: Fix overlaps and enforce min duration
     print("\nStep 8: Fixing overlaps...")
     midi_data, notes_trimmed, durations_enforced = fix_note_overlap(midi_data)