Spaces:
Sleeping
Sleeping
Ewan Claude Opus 4.6 committed on
Commit ·
1646c97
1
Parent(s): 7dd6b8a
Improve transcription fidelity: trailing notes, sustain pedal, complexity tuning
Browse files- Fix trailing silence threshold (5% → 2% RMS + 3s protection zone) — recovers 7s of cut-off endings
- Fix leading silence threshold (10% → 5% + always protect first note)
- Add spectral masking in harmonic ghost removal for two-hand texture
- Add sustain pedal detection from audio spectral flux analysis
- Add complexity-aware tuning (note density + polyphony estimation)
- Add audio analysis toolkit (spectral comparison, CQT visualization)
- UI: larger transport icons, loop label, 5s skip labels
Jewish Bride spectral MSE: -40.3% overall, -73.8% at 95th percentile
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- .gitignore +2 -0
- app/src/components/Controls.jsx +14 -11
- app/src/index.css +81 -61
- transcriber/audio_analysis.py +425 -0
- transcriber/optimize.py +239 -27
.gitignore
CHANGED
|
@@ -8,3 +8,5 @@ transcriber/diagnose_*.py
|
|
| 8 |
transcriber/simulate_*.py
|
| 9 |
__pycache__
|
| 10 |
*.pyc
|
|
|
|
|
|
|
|
|
| 8 |
transcriber/simulate_*.py
|
| 9 |
__pycache__
|
| 10 |
*.pyc
|
| 11 |
+
transcriber/soundfonts/
|
| 12 |
+
transcriber/benchmarks/
|
app/src/components/Controls.jsx
CHANGED
|
@@ -62,7 +62,7 @@ export default function Controls({
|
|
| 62 |
<div className="controls-main">
|
| 63 |
<div className="controls-left">
|
| 64 |
<div className="brand-mark">
|
| 65 |
-
<OctopusLogo size={
|
| 66 |
<span className="brand-name">Mr. Octopus</span>
|
| 67 |
</div>
|
| 68 |
{fileName && (
|
|
@@ -76,19 +76,20 @@ export default function Controls({
|
|
| 76 |
onClick={() => seekTo(Math.max(0, displayTime - 5))}
|
| 77 |
title="Back 5s"
|
| 78 |
>
|
| 79 |
-
<svg width="
|
| 80 |
<path d="M11 18V6l-8.5 6 8.5 6zm.5-6l8.5 6V6l-8.5 6z" />
|
| 81 |
</svg>
|
|
|
|
| 82 |
</button>
|
| 83 |
|
| 84 |
<button className="play-btn" onClick={togglePlayPause}>
|
| 85 |
{isPlaying ? (
|
| 86 |
-
<svg width="
|
| 87 |
<rect x="6" y="4" width="4" height="16" rx="1" />
|
| 88 |
<rect x="14" y="4" width="4" height="16" rx="1" />
|
| 89 |
</svg>
|
| 90 |
) : (
|
| 91 |
-
<svg width="
|
| 92 |
<path d="M8 5v14l11-7z" />
|
| 93 |
</svg>
|
| 94 |
)}
|
|
@@ -99,15 +100,17 @@ export default function Controls({
|
|
| 99 |
onClick={() => seekTo(Math.min(totalDuration, displayTime + 5))}
|
| 100 |
title="Forward 5s"
|
| 101 |
>
|
| 102 |
-
<svg width="
|
| 103 |
<path d="M4 18l8.5-6L4 6v12zm9-12v12l8.5-6L13 6z" />
|
| 104 |
</svg>
|
|
|
|
| 105 |
</button>
|
| 106 |
</div>
|
| 107 |
|
| 108 |
<div className="controls-right">
|
| 109 |
{/* Loop controls */}
|
| 110 |
<div className="loop-controls">
|
|
|
|
| 111 |
{!isLooping ? (
|
| 112 |
<>
|
| 113 |
<button
|
|
@@ -138,12 +141,6 @@ export default function Controls({
|
|
| 138 |
)}
|
| 139 |
</div>
|
| 140 |
|
| 141 |
-
{onNewSong && (
|
| 142 |
-
<button className="btn btn-new" onClick={onNewSong}>
|
| 143 |
-
+ New Song
|
| 144 |
-
</button>
|
| 145 |
-
)}
|
| 146 |
-
|
| 147 |
<div className="tempo-control">
|
| 148 |
<span className="tempo-label">Speed</span>
|
| 149 |
<input
|
|
@@ -155,6 +152,12 @@ export default function Controls({
|
|
| 155 |
/>
|
| 156 |
<span className="tempo-value">{tempo}%</span>
|
| 157 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
</div>
|
| 159 |
</div>
|
| 160 |
|
|
|
|
| 62 |
<div className="controls-main">
|
| 63 |
<div className="controls-left">
|
| 64 |
<div className="brand-mark">
|
| 65 |
+
<OctopusLogo size={32} />
|
| 66 |
<span className="brand-name">Mr. Octopus</span>
|
| 67 |
</div>
|
| 68 |
{fileName && (
|
|
|
|
| 76 |
onClick={() => seekTo(Math.max(0, displayTime - 5))}
|
| 77 |
title="Back 5s"
|
| 78 |
>
|
| 79 |
+
<svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">
|
| 80 |
<path d="M11 18V6l-8.5 6 8.5 6zm.5-6l8.5 6V6l-8.5 6z" />
|
| 81 |
</svg>
|
| 82 |
+
<span className="transport-label">5s</span>
|
| 83 |
</button>
|
| 84 |
|
| 85 |
<button className="play-btn" onClick={togglePlayPause}>
|
| 86 |
{isPlaying ? (
|
| 87 |
+
<svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor">
|
| 88 |
<rect x="6" y="4" width="4" height="16" rx="1" />
|
| 89 |
<rect x="14" y="4" width="4" height="16" rx="1" />
|
| 90 |
</svg>
|
| 91 |
) : (
|
| 92 |
+
<svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor">
|
| 93 |
<path d="M8 5v14l11-7z" />
|
| 94 |
</svg>
|
| 95 |
)}
|
|
|
|
| 100 |
onClick={() => seekTo(Math.min(totalDuration, displayTime + 5))}
|
| 101 |
title="Forward 5s"
|
| 102 |
>
|
| 103 |
+
<svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">
|
| 104 |
<path d="M4 18l8.5-6L4 6v12zm9-12v12l8.5-6L13 6z" />
|
| 105 |
</svg>
|
| 106 |
+
<span className="transport-label">5s</span>
|
| 107 |
</button>
|
| 108 |
</div>
|
| 109 |
|
| 110 |
<div className="controls-right">
|
| 111 |
{/* Loop controls */}
|
| 112 |
<div className="loop-controls">
|
| 113 |
+
<span className="loop-label">Loop</span>
|
| 114 |
{!isLooping ? (
|
| 115 |
<>
|
| 116 |
<button
|
|
|
|
| 141 |
)}
|
| 142 |
</div>
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
<div className="tempo-control">
|
| 145 |
<span className="tempo-label">Speed</span>
|
| 146 |
<input
|
|
|
|
| 152 |
/>
|
| 153 |
<span className="tempo-value">{tempo}%</span>
|
| 154 |
</div>
|
| 155 |
+
|
| 156 |
+
{onNewSong && (
|
| 157 |
+
<button className="btn btn-new" onClick={onNewSong}>
|
| 158 |
+
+ New Song
|
| 159 |
+
</button>
|
| 160 |
+
)}
|
| 161 |
</div>
|
| 162 |
</div>
|
| 163 |
|
app/src/index.css
CHANGED
|
@@ -228,18 +228,18 @@ body {
|
|
| 228 |
}
|
| 229 |
|
| 230 |
.controls-main {
|
| 231 |
-
height:
|
| 232 |
display: flex;
|
| 233 |
align-items: center;
|
| 234 |
justify-content: space-between;
|
| 235 |
-
padding: 0
|
| 236 |
-
gap:
|
| 237 |
}
|
| 238 |
|
| 239 |
.controls-left {
|
| 240 |
display: flex;
|
| 241 |
align-items: center;
|
| 242 |
-
gap:
|
| 243 |
min-width: 0;
|
| 244 |
flex: 1;
|
| 245 |
}
|
|
@@ -252,7 +252,7 @@ body {
|
|
| 252 |
}
|
| 253 |
|
| 254 |
.brand-name {
|
| 255 |
-
font-size:
|
| 256 |
font-weight: 700;
|
| 257 |
background: linear-gradient(135deg, #a78bfa, #06b6d4);
|
| 258 |
-webkit-background-clip: text;
|
|
@@ -263,13 +263,13 @@ body {
|
|
| 263 |
}
|
| 264 |
|
| 265 |
.file-name {
|
| 266 |
-
font-size:
|
| 267 |
color: var(--text-muted);
|
| 268 |
white-space: nowrap;
|
| 269 |
overflow: hidden;
|
| 270 |
text-overflow: ellipsis;
|
| 271 |
-
max-width:
|
| 272 |
-
padding-left:
|
| 273 |
border-left: 1.5px solid var(--border);
|
| 274 |
font-weight: 500;
|
| 275 |
}
|
|
@@ -277,73 +277,84 @@ body {
|
|
| 277 |
.controls-center {
|
| 278 |
display: flex;
|
| 279 |
align-items: center;
|
| 280 |
-
gap:
|
| 281 |
flex-shrink: 0;
|
| 282 |
}
|
| 283 |
|
| 284 |
.controls-right {
|
| 285 |
display: flex;
|
| 286 |
align-items: center;
|
| 287 |
-
gap:
|
| 288 |
flex: 1;
|
| 289 |
justify-content: flex-end;
|
| 290 |
}
|
| 291 |
|
| 292 |
-
/* Transport buttons */
|
| 293 |
.transport-btn {
|
| 294 |
-
width:
|
| 295 |
-
height:
|
| 296 |
-
border-radius:
|
| 297 |
border: none;
|
| 298 |
background: var(--surface-2);
|
| 299 |
color: var(--text-muted);
|
| 300 |
cursor: pointer;
|
| 301 |
display: flex;
|
|
|
|
| 302 |
align-items: center;
|
| 303 |
justify-content: center;
|
|
|
|
| 304 |
transition: all 0.15s;
|
|
|
|
| 305 |
}
|
| 306 |
|
| 307 |
.transport-btn:hover {
|
| 308 |
background: var(--surface-3);
|
| 309 |
color: var(--text);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
}
|
| 311 |
|
| 312 |
/* Play button — bold and prominent */
|
| 313 |
.play-btn {
|
| 314 |
-
width:
|
| 315 |
-
height:
|
| 316 |
border-radius: 50%;
|
| 317 |
border: none;
|
| 318 |
background: var(--primary);
|
| 319 |
color: white;
|
| 320 |
-
font-size:
|
| 321 |
cursor: pointer;
|
| 322 |
transition: all 0.2s;
|
| 323 |
display: flex;
|
| 324 |
align-items: center;
|
| 325 |
justify-content: center;
|
| 326 |
-
box-shadow: 0 0
|
| 327 |
}
|
| 328 |
|
| 329 |
.play-btn:hover {
|
| 330 |
background: var(--primary-hover);
|
| 331 |
-
box-shadow: 0 0
|
| 332 |
-
transform: scale(1.
|
| 333 |
}
|
| 334 |
|
| 335 |
.play-btn:active {
|
| 336 |
-
transform: scale(0.
|
| 337 |
}
|
| 338 |
|
| 339 |
-
/*
|
| 340 |
.btn {
|
| 341 |
background: var(--surface-2);
|
| 342 |
color: var(--text-muted);
|
| 343 |
border: 1.5px solid var(--border);
|
| 344 |
border-radius: 8px;
|
| 345 |
-
padding:
|
| 346 |
-
font-size:
|
| 347 |
font-weight: 600;
|
| 348 |
font-family: inherit;
|
| 349 |
cursor: pointer;
|
|
@@ -373,33 +384,33 @@ body {
|
|
| 373 |
.tempo-control {
|
| 374 |
display: flex;
|
| 375 |
align-items: center;
|
| 376 |
-
gap:
|
| 377 |
background: var(--surface-2);
|
| 378 |
-
padding:
|
| 379 |
-
border-radius:
|
| 380 |
border: 1px solid var(--border);
|
| 381 |
}
|
| 382 |
|
| 383 |
.tempo-label {
|
| 384 |
-
font-size:
|
| 385 |
-
font-weight:
|
| 386 |
-
color: var(--text-
|
| 387 |
text-transform: uppercase;
|
| 388 |
letter-spacing: 0.5px;
|
| 389 |
white-space: nowrap;
|
| 390 |
}
|
| 391 |
|
| 392 |
.tempo-value {
|
| 393 |
-
font-size:
|
| 394 |
-
font-weight:
|
| 395 |
-
color: var(--text
|
| 396 |
-
min-width:
|
| 397 |
text-align: right;
|
| 398 |
font-variant-numeric: tabular-nums;
|
| 399 |
}
|
| 400 |
|
| 401 |
.tempo-control input[type='range'] {
|
| 402 |
-
width:
|
| 403 |
}
|
| 404 |
|
| 405 |
/* ========================================
|
|
@@ -409,16 +420,16 @@ body {
|
|
| 409 |
.timeline {
|
| 410 |
display: flex;
|
| 411 |
align-items: center;
|
| 412 |
-
gap:
|
| 413 |
-
padding: 0
|
| 414 |
}
|
| 415 |
|
| 416 |
.timeline-time {
|
| 417 |
-
font-size:
|
| 418 |
font-weight: 600;
|
| 419 |
color: var(--text-muted);
|
| 420 |
font-variant-numeric: tabular-nums;
|
| 421 |
-
min-width:
|
| 422 |
}
|
| 423 |
|
| 424 |
.timeline-time:last-child {
|
|
@@ -432,8 +443,8 @@ body {
|
|
| 432 |
|
| 433 |
.timeline-track input[type='range'] {
|
| 434 |
width: 100%;
|
| 435 |
-
height:
|
| 436 |
-
border-radius:
|
| 437 |
-webkit-appearance: none;
|
| 438 |
appearance: none;
|
| 439 |
outline: none;
|
|
@@ -442,19 +453,19 @@ body {
|
|
| 442 |
}
|
| 443 |
|
| 444 |
.timeline-track input[type='range']:hover {
|
| 445 |
-
height:
|
| 446 |
}
|
| 447 |
|
| 448 |
.timeline-track input[type='range']::-webkit-slider-thumb {
|
| 449 |
-webkit-appearance: none;
|
| 450 |
appearance: none;
|
| 451 |
-
width:
|
| 452 |
-
height:
|
| 453 |
border-radius: 50%;
|
| 454 |
background: var(--primary-hover);
|
| 455 |
cursor: pointer;
|
| 456 |
border: 2px solid white;
|
| 457 |
-
box-shadow: 0 0
|
| 458 |
transition: transform 0.1s;
|
| 459 |
}
|
| 460 |
|
|
@@ -463,13 +474,13 @@ body {
|
|
| 463 |
}
|
| 464 |
|
| 465 |
.timeline-track input[type='range']::-moz-range-thumb {
|
| 466 |
-
width:
|
| 467 |
-
height:
|
| 468 |
border-radius: 50%;
|
| 469 |
background: var(--primary-hover);
|
| 470 |
cursor: pointer;
|
| 471 |
border: 2px solid white;
|
| 472 |
-
box-shadow: 0 0
|
| 473 |
}
|
| 474 |
|
| 475 |
/* General range sliders (for tempo) */
|
|
@@ -477,8 +488,8 @@ input[type='range'] {
|
|
| 477 |
-webkit-appearance: none;
|
| 478 |
appearance: none;
|
| 479 |
background: var(--border);
|
| 480 |
-
height:
|
| 481 |
-
border-radius:
|
| 482 |
outline: none;
|
| 483 |
cursor: pointer;
|
| 484 |
}
|
|
@@ -486,8 +497,8 @@ input[type='range'] {
|
|
| 486 |
input[type='range']::-webkit-slider-thumb {
|
| 487 |
-webkit-appearance: none;
|
| 488 |
appearance: none;
|
| 489 |
-
width:
|
| 490 |
-
height:
|
| 491 |
border-radius: 50%;
|
| 492 |
background: var(--primary);
|
| 493 |
cursor: pointer;
|
|
@@ -500,8 +511,8 @@ input[type='range']::-webkit-slider-thumb:hover {
|
|
| 500 |
}
|
| 501 |
|
| 502 |
input[type='range']::-moz-range-thumb {
|
| 503 |
-
width:
|
| 504 |
-
height:
|
| 505 |
border-radius: 50%;
|
| 506 |
background: var(--primary);
|
| 507 |
cursor: pointer;
|
|
@@ -512,16 +523,25 @@ input[type='range']::-moz-range-thumb {
|
|
| 512 |
.loop-controls {
|
| 513 |
display: flex;
|
| 514 |
align-items: center;
|
| 515 |
-
gap:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
}
|
| 517 |
|
| 518 |
.btn-loop {
|
| 519 |
-
min-width:
|
| 520 |
text-align: center;
|
| 521 |
font-weight: 700;
|
| 522 |
-
font-size:
|
| 523 |
-
padding:
|
| 524 |
-
border-radius:
|
| 525 |
font-family: inherit;
|
| 526 |
letter-spacing: 0.3px;
|
| 527 |
}
|
|
@@ -538,8 +558,8 @@ input[type='range']::-moz-range-thumb {
|
|
| 538 |
}
|
| 539 |
|
| 540 |
.loop-x {
|
| 541 |
-
margin-left:
|
| 542 |
-
font-size:
|
| 543 |
opacity: 0.6;
|
| 544 |
}
|
| 545 |
|
|
|
|
| 228 |
}
|
| 229 |
|
| 230 |
.controls-main {
|
| 231 |
+
height: 72px;
|
| 232 |
display: flex;
|
| 233 |
align-items: center;
|
| 234 |
justify-content: space-between;
|
| 235 |
+
padding: 0 24px;
|
| 236 |
+
gap: 20px;
|
| 237 |
}
|
| 238 |
|
| 239 |
.controls-left {
|
| 240 |
display: flex;
|
| 241 |
align-items: center;
|
| 242 |
+
gap: 16px;
|
| 243 |
min-width: 0;
|
| 244 |
flex: 1;
|
| 245 |
}
|
|
|
|
| 252 |
}
|
| 253 |
|
| 254 |
.brand-name {
|
| 255 |
+
font-size: 16px;
|
| 256 |
font-weight: 700;
|
| 257 |
background: linear-gradient(135deg, #a78bfa, #06b6d4);
|
| 258 |
-webkit-background-clip: text;
|
|
|
|
| 263 |
}
|
| 264 |
|
| 265 |
.file-name {
|
| 266 |
+
font-size: 14px;
|
| 267 |
color: var(--text-muted);
|
| 268 |
white-space: nowrap;
|
| 269 |
overflow: hidden;
|
| 270 |
text-overflow: ellipsis;
|
| 271 |
+
max-width: 240px;
|
| 272 |
+
padding-left: 16px;
|
| 273 |
border-left: 1.5px solid var(--border);
|
| 274 |
font-weight: 500;
|
| 275 |
}
|
|
|
|
| 277 |
.controls-center {
|
| 278 |
display: flex;
|
| 279 |
align-items: center;
|
| 280 |
+
gap: 10px;
|
| 281 |
flex-shrink: 0;
|
| 282 |
}
|
| 283 |
|
| 284 |
.controls-right {
|
| 285 |
display: flex;
|
| 286 |
align-items: center;
|
| 287 |
+
gap: 20px;
|
| 288 |
flex: 1;
|
| 289 |
justify-content: flex-end;
|
| 290 |
}
|
| 291 |
|
| 292 |
+
/* Transport buttons (skip back/forward) */
|
| 293 |
.transport-btn {
|
| 294 |
+
width: 48px;
|
| 295 |
+
height: 48px;
|
| 296 |
+
border-radius: 10px;
|
| 297 |
border: none;
|
| 298 |
background: var(--surface-2);
|
| 299 |
color: var(--text-muted);
|
| 300 |
cursor: pointer;
|
| 301 |
display: flex;
|
| 302 |
+
flex-direction: column;
|
| 303 |
align-items: center;
|
| 304 |
justify-content: center;
|
| 305 |
+
gap: 2px;
|
| 306 |
transition: all 0.15s;
|
| 307 |
+
border: 1px solid var(--border);
|
| 308 |
}
|
| 309 |
|
| 310 |
.transport-btn:hover {
|
| 311 |
background: var(--surface-3);
|
| 312 |
color: var(--text);
|
| 313 |
+
border-color: var(--border-hover);
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
.transport-label {
|
| 317 |
+
font-size: 10px;
|
| 318 |
+
font-weight: 600;
|
| 319 |
+
letter-spacing: 0.3px;
|
| 320 |
+
opacity: 0.7;
|
| 321 |
}
|
| 322 |
|
| 323 |
/* Play button — bold and prominent */
|
| 324 |
.play-btn {
|
| 325 |
+
width: 56px;
|
| 326 |
+
height: 56px;
|
| 327 |
border-radius: 50%;
|
| 328 |
border: none;
|
| 329 |
background: var(--primary);
|
| 330 |
color: white;
|
| 331 |
+
font-size: 20px;
|
| 332 |
cursor: pointer;
|
| 333 |
transition: all 0.2s;
|
| 334 |
display: flex;
|
| 335 |
align-items: center;
|
| 336 |
justify-content: center;
|
| 337 |
+
box-shadow: 0 0 24px var(--primary-glow);
|
| 338 |
}
|
| 339 |
|
| 340 |
.play-btn:hover {
|
| 341 |
background: var(--primary-hover);
|
| 342 |
+
box-shadow: 0 0 36px var(--primary-glow);
|
| 343 |
+
transform: scale(1.06);
|
| 344 |
}
|
| 345 |
|
| 346 |
.play-btn:active {
|
| 347 |
+
transform: scale(0.96);
|
| 348 |
}
|
| 349 |
|
| 350 |
+
/* General button */
|
| 351 |
.btn {
|
| 352 |
background: var(--surface-2);
|
| 353 |
color: var(--text-muted);
|
| 354 |
border: 1.5px solid var(--border);
|
| 355 |
border-radius: 8px;
|
| 356 |
+
padding: 8px 18px;
|
| 357 |
+
font-size: 13px;
|
| 358 |
font-weight: 600;
|
| 359 |
font-family: inherit;
|
| 360 |
cursor: pointer;
|
|
|
|
| 384 |
.tempo-control {
|
| 385 |
display: flex;
|
| 386 |
align-items: center;
|
| 387 |
+
gap: 10px;
|
| 388 |
background: var(--surface-2);
|
| 389 |
+
padding: 8px 16px;
|
| 390 |
+
border-radius: 10px;
|
| 391 |
border: 1px solid var(--border);
|
| 392 |
}
|
| 393 |
|
| 394 |
.tempo-label {
|
| 395 |
+
font-size: 12px;
|
| 396 |
+
font-weight: 700;
|
| 397 |
+
color: var(--text-muted);
|
| 398 |
text-transform: uppercase;
|
| 399 |
letter-spacing: 0.5px;
|
| 400 |
white-space: nowrap;
|
| 401 |
}
|
| 402 |
|
| 403 |
.tempo-value {
|
| 404 |
+
font-size: 14px;
|
| 405 |
+
font-weight: 700;
|
| 406 |
+
color: var(--text);
|
| 407 |
+
min-width: 40px;
|
| 408 |
text-align: right;
|
| 409 |
font-variant-numeric: tabular-nums;
|
| 410 |
}
|
| 411 |
|
| 412 |
.tempo-control input[type='range'] {
|
| 413 |
+
width: 100px;
|
| 414 |
}
|
| 415 |
|
| 416 |
/* ========================================
|
|
|
|
| 420 |
.timeline {
|
| 421 |
display: flex;
|
| 422 |
align-items: center;
|
| 423 |
+
gap: 14px;
|
| 424 |
+
padding: 0 24px 12px;
|
| 425 |
}
|
| 426 |
|
| 427 |
.timeline-time {
|
| 428 |
+
font-size: 13px;
|
| 429 |
font-weight: 600;
|
| 430 |
color: var(--text-muted);
|
| 431 |
font-variant-numeric: tabular-nums;
|
| 432 |
+
min-width: 40px;
|
| 433 |
}
|
| 434 |
|
| 435 |
.timeline-time:last-child {
|
|
|
|
| 443 |
|
| 444 |
.timeline-track input[type='range'] {
|
| 445 |
width: 100%;
|
| 446 |
+
height: 8px;
|
| 447 |
+
border-radius: 4px;
|
| 448 |
-webkit-appearance: none;
|
| 449 |
appearance: none;
|
| 450 |
outline: none;
|
|
|
|
| 453 |
}
|
| 454 |
|
| 455 |
.timeline-track input[type='range']:hover {
|
| 456 |
+
height: 10px;
|
| 457 |
}
|
| 458 |
|
| 459 |
.timeline-track input[type='range']::-webkit-slider-thumb {
|
| 460 |
-webkit-appearance: none;
|
| 461 |
appearance: none;
|
| 462 |
+
width: 16px;
|
| 463 |
+
height: 16px;
|
| 464 |
border-radius: 50%;
|
| 465 |
background: var(--primary-hover);
|
| 466 |
cursor: pointer;
|
| 467 |
border: 2px solid white;
|
| 468 |
+
box-shadow: 0 0 10px var(--primary-glow);
|
| 469 |
transition: transform 0.1s;
|
| 470 |
}
|
| 471 |
|
|
|
|
| 474 |
}
|
| 475 |
|
| 476 |
.timeline-track input[type='range']::-moz-range-thumb {
|
| 477 |
+
width: 16px;
|
| 478 |
+
height: 16px;
|
| 479 |
border-radius: 50%;
|
| 480 |
background: var(--primary-hover);
|
| 481 |
cursor: pointer;
|
| 482 |
border: 2px solid white;
|
| 483 |
+
box-shadow: 0 0 10px var(--primary-glow);
|
| 484 |
}
|
| 485 |
|
| 486 |
/* General range sliders (for tempo) */
|
|
|
|
| 488 |
-webkit-appearance: none;
|
| 489 |
appearance: none;
|
| 490 |
background: var(--border);
|
| 491 |
+
height: 5px;
|
| 492 |
+
border-radius: 3px;
|
| 493 |
outline: none;
|
| 494 |
cursor: pointer;
|
| 495 |
}
|
|
|
|
| 497 |
input[type='range']::-webkit-slider-thumb {
|
| 498 |
-webkit-appearance: none;
|
| 499 |
appearance: none;
|
| 500 |
+
width: 16px;
|
| 501 |
+
height: 16px;
|
| 502 |
border-radius: 50%;
|
| 503 |
background: var(--primary);
|
| 504 |
cursor: pointer;
|
|
|
|
| 511 |
}
|
| 512 |
|
| 513 |
input[type='range']::-moz-range-thumb {
|
| 514 |
+
width: 16px;
|
| 515 |
+
height: 16px;
|
| 516 |
border-radius: 50%;
|
| 517 |
background: var(--primary);
|
| 518 |
cursor: pointer;
|
|
|
|
| 523 |
.loop-controls {
|
| 524 |
display: flex;
|
| 525 |
align-items: center;
|
| 526 |
+
gap: 6px;
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
.loop-label {
|
| 530 |
+
font-size: 12px;
|
| 531 |
+
font-weight: 700;
|
| 532 |
+
color: var(--text-muted);
|
| 533 |
+
text-transform: uppercase;
|
| 534 |
+
letter-spacing: 0.5px;
|
| 535 |
+
margin-right: 2px;
|
| 536 |
}
|
| 537 |
|
| 538 |
.btn-loop {
|
| 539 |
+
min-width: 36px;
|
| 540 |
text-align: center;
|
| 541 |
font-weight: 700;
|
| 542 |
+
font-size: 13px;
|
| 543 |
+
padding: 7px 12px;
|
| 544 |
+
border-radius: 8px;
|
| 545 |
font-family: inherit;
|
| 546 |
letter-spacing: 0.3px;
|
| 547 |
}
|
|
|
|
| 558 |
}
|
| 559 |
|
| 560 |
.loop-x {
|
| 561 |
+
margin-left: 8px;
|
| 562 |
+
font-size: 15px;
|
| 563 |
opacity: 0.6;
|
| 564 |
}
|
| 565 |
|
transcriber/audio_analysis.py
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio Analysis Toolkit for Mr. Octopus Piano Transcription.
|
| 3 |
+
|
| 4 |
+
Three analysis modes:
|
| 5 |
+
1. Spectral comparison: Renders MIDI→audio via FluidSynth, compares spectrograms
|
| 6 |
+
2. Visual spectrogram: Generates PNG images for AI/human visual inspection
|
| 7 |
+
3. Audio playback: Plays original, rendered MIDI, or both side-by-side
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python audio_analysis.py compare <original_audio> <midi_file> [--output-dir ./analysis]
|
| 11 |
+
python audio_analysis.py visualize <original_audio> <midi_file> [--output-dir ./analysis]
|
| 12 |
+
python audio_analysis.py play <audio_file> [--start 10.0] [--duration 5.0]
|
| 13 |
+
python audio_analysis.py play-both <original_audio> <midi_file> [--start 10.0] [--duration 5.0]
|
| 14 |
+
python audio_analysis.py full <original_audio> <midi_file> [--output-dir ./analysis]
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import argparse
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
import subprocess
|
| 21 |
+
import tempfile
|
| 22 |
+
import numpy as np
|
| 23 |
+
|
| 24 |
+
SOUNDFONT_PATH = os.path.join(os.path.dirname(__file__), "soundfonts", "FluidR3_GM.sf2")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def render_midi_to_audio(midi_path, output_wav, sample_rate=44100):
    """Render a MIDI file to a WAV file using the FluidSynth CLI.

    Args:
        midi_path: Path to the input MIDI file.
        output_wav: Path where the rendered WAV will be written.
        sample_rate: Output sample rate in Hz.

    Returns:
        ``output_wav``, so the call can be chained.

    Raises:
        RuntimeError: If FluidSynth exits non-zero or produces no output file.
        subprocess.TimeoutExpired: If rendering exceeds 120 seconds.
        FileNotFoundError: If the ``fluidsynth`` binary is not installed.
    """
    cmd = [
        "fluidsynth",
        f"--fast-render={output_wav}",
        f"--sample-rate={sample_rate}",
        "--gain=0.5",
        "-n", "-i",  # -n: no MIDI input driver; -i: non-interactive (no shell)
        SOUNDFONT_PATH,
        midi_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    # Fix: the original only checked that the output file exists, which would
    # silently accept a partially written WAV from a renderer that crashed
    # mid-run. Treat a non-zero exit code as a failure as well.
    if result.returncode != 0 or not os.path.exists(output_wav):
        print(f"FluidSynth error: {result.stderr}")
        raise RuntimeError("FluidSynth failed to render MIDI")
    return output_wav
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def load_audio(path, sr=22050, duration=None):
    """Read an audio file and return it as a mono signal.

    Args:
        path: Audio file to read (any format librosa can decode).
        sr: Target sample rate the signal is resampled to.
        duration: Optional cap, in seconds, on how much audio to load.

    Returns:
        Tuple ``(samples, sample_rate)``.
    """
    import librosa

    signal, rate = librosa.load(path, sr=sr, mono=True, duration=duration)
    return signal, rate
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def compute_spectrogram(y, sr, hop_length=512, n_fft=2048):
    """Compute a log-magnitude (dB) mel spectrogram of *y*.

    Uses 128 mel bands; the dB scale is referenced to the spectrogram's
    own maximum, so values are <= 0.
    """
    import librosa

    mel_power = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=128
    )
    return librosa.power_to_db(mel_power, ref=np.max)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def compute_cqt(y, sr, hop_length=512):
    """Compute a constant-Q transform in dB (better pitch resolution for music).

    84 bins at 12 bins per octave give one bin per semitone over 7 octaves.
    The dB scale is referenced to the transform's own maximum.
    """
    import librosa

    magnitudes = np.abs(
        librosa.cqt(y=y, sr=sr, hop_length=hop_length, n_bins=84, bins_per_octave=12)
    )
    return librosa.amplitude_to_db(magnitudes, ref=np.max)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def align_lengths(spec_a, spec_b):
    """Trim both spectrograms to a common frame count (the shorter of the two).

    Time frames are axis 1; frequency bins (axis 0) are left untouched.
    Returns the pair of trimmed views in the same order they were given.
    """
    frames_a = spec_a.shape[1]
    frames_b = spec_b.shape[1]
    cut = frames_a if frames_a < frames_b else frames_b
    return spec_a[:, :cut], spec_b[:, :cut]
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def spectral_comparison(original_audio, midi_path, output_dir, sr=22050):
    """
    Full spectral comparison: renders MIDI to audio, computes spectrograms,
    calculates frame-by-frame divergence, and identifies problem regions.

    Side effects: creates `output_dir` if needed and writes three artifacts
    into it — `midi_rendered.wav`, `spectral_report.txt`, and
    `spectral_data.npz` (raw per-frame MSE curve for later analysis).

    Returns:
        (frame_mse, times, regions) where `frame_mse` is the per-frame
        normalized-spectrogram MSE, `times` maps frame index -> seconds,
        and `regions` is a list of (start_s, end_s, mean_mse) tuples
        sorted worst-first.
    """
    import librosa

    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Render MIDI to audio
    rendered_wav = os.path.join(output_dir, "midi_rendered.wav")
    print("Rendering MIDI to audio via FluidSynth...")
    # Render at 44.1 kHz regardless of the analysis rate `sr`; both signals
    # are resampled to `sr` on load below, so the comparison stays apples-to-apples.
    render_midi_to_audio(midi_path, rendered_wav, sample_rate=44100)

    # Step 2: Load both audio files
    print("Loading original audio...")
    y_orig, _ = load_audio(original_audio, sr=sr)
    print("Loading rendered MIDI audio...")
    y_midi, _ = load_audio(rendered_wav, sr=sr)

    # Step 3: Compute spectrograms
    hop = 512
    print("Computing spectrograms...")
    spec_orig = compute_spectrogram(y_orig, sr, hop_length=hop)
    spec_midi = compute_spectrogram(y_midi, sr, hop_length=hop)

    # Align lengths (the rendered MIDI rarely matches the original exactly)
    spec_orig, spec_midi = align_lengths(spec_orig, spec_midi)

    # Step 4: Compute frame-by-frame divergence
    # Normalize to 0-1 range for comparison; +1e-8 guards against a flat
    # (constant) spectrogram producing a divide-by-zero.
    spec_orig_norm = (spec_orig - spec_orig.min()) / (spec_orig.max() - spec_orig.min() + 1e-8)
    spec_midi_norm = (spec_midi - spec_midi.min()) / (spec_midi.max() - spec_midi.min() + 1e-8)

    # Mean squared error per frame (across frequency bins)
    frame_mse = np.mean((spec_orig_norm - spec_midi_norm) ** 2, axis=0)

    # Convert frame indices to time
    n_frames = len(frame_mse)
    times = librosa.frames_to_time(np.arange(n_frames), sr=sr, hop_length=hop)

    # Step 5: Identify problem regions (frames with high divergence)
    threshold = np.percentile(frame_mse, 90)  # top 10% divergence
    problem_mask = frame_mse > threshold

    # Group consecutive problem frames into regions.
    # NOTE(review): a region still open when the loop ends is appended
    # without the 300 ms minimum-length check — presumably intentional so
    # trailing divergence is never dropped, but confirm.
    regions = []
    in_region = False
    start = 0
    for i, is_problem in enumerate(problem_mask):
        if is_problem and not in_region:
            start = i
            in_region = True
        elif not is_problem and in_region:
            if times[i] - times[start] > 0.3:  # min 300ms regions
                regions.append((times[start], times[i - 1], np.mean(frame_mse[start:i])))
            in_region = False
    if in_region:
        regions.append((times[start], times[-1], np.mean(frame_mse[start:])))

    # Sort by divergence score (worst first)
    regions.sort(key=lambda r: r[2], reverse=True)

    # Step 6: Report
    report_path = os.path.join(output_dir, "spectral_report.txt")
    with open(report_path, "w") as f:
        f.write("SPECTRAL COMPARISON REPORT\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"Original: {original_audio}\n")
        f.write(f"MIDI: {midi_path}\n")
        f.write(f"Duration: {times[-1]:.1f}s ({n_frames} frames)\n\n")

        overall_mse = np.mean(frame_mse)
        f.write(f"Overall MSE: {overall_mse:.6f}\n")
        f.write(f"Median MSE: {np.median(frame_mse):.6f}\n")
        f.write(f"90th percentile: {threshold:.6f}\n\n")

        # Only the 20 worst regions go into the report; the full list is returned.
        f.write(f"TOP DIVERGENT REGIONS ({len(regions)} found):\n")
        f.write("-" * 60 + "\n")
        for i, (t_start, t_end, score) in enumerate(regions[:20]):
            f.write(f" {i+1:2d}. {t_start:6.1f}s - {t_end:6.1f}s "
                    f"(duration: {t_end - t_start:.1f}s) MSE: {score:.6f}\n")

    print(f"Report written to {report_path}")

    # Save raw data for further analysis
    np.savez(os.path.join(output_dir, "spectral_data.npz"),
             frame_mse=frame_mse, times=times, threshold=threshold)

    return frame_mse, times, regions
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def generate_spectrograms(original_audio, midi_path, output_dir, sr=22050):
    """
    Generate side-by-side spectrogram images for visual inspection.
    Creates: overview, difference map, and zoomed segments.

    Args:
        original_audio: Path to the reference recording.
        midi_path: Path to the MIDI transcription to compare against.
        output_dir: Directory where PNG images (and the rendered wav) are written.
        sr: Analysis sample rate for loading both signals.

    Returns:
        A list containing the path of the overview image.
    """
    # Lazy imports so the heavy plotting stack is only loaded when needed.
    import librosa
    import librosa.display
    import matplotlib
    matplotlib.use('Agg')  # headless backend: no display required
    import matplotlib.pyplot as plt

    os.makedirs(output_dir, exist_ok=True)

    # Render MIDI (reuse a previously rendered wav if present — rendering is slow)
    rendered_wav = os.path.join(output_dir, "midi_rendered.wav")
    if not os.path.exists(rendered_wav):
        print("Rendering MIDI to audio...")
        render_midi_to_audio(midi_path, rendered_wav, sample_rate=44100)

    # Load both signals at the same analysis rate
    print("Loading audio files...")
    y_orig, _ = load_audio(original_audio, sr=sr)
    y_midi, _ = load_audio(rendered_wav, sr=sr)

    hop = 512

    # CQT spectrograms (better for music than mel: log-frequency bins align
    # with semitones, so piano notes show as horizontal lines)
    print("Computing CQT spectrograms...")
    cqt_orig = compute_cqt(y_orig, sr, hop_length=hop)
    cqt_midi = compute_cqt(y_midi, sr, hop_length=hop)
    cqt_orig, cqt_midi = align_lengths(cqt_orig, cqt_midi)

    duration = min(len(y_orig), len(y_midi)) / sr

    # ===== Figure 1: Full overview side-by-side =====
    fig, axes = plt.subplots(3, 1, figsize=(20, 12), constrained_layout=True)
    fig.suptitle("Spectral Comparison: Original vs MIDI Transcription", fontsize=16, fontweight='bold')

    # Original
    img0 = axes[0].imshow(cqt_orig, aspect='auto', origin='lower',
                          extent=[0, duration, 0, 84], cmap='magma',
                          vmin=-60, vmax=0)
    axes[0].set_title("Original Audio", fontsize=13)
    axes[0].set_ylabel("CQT Bin (semitone)")
    plt.colorbar(img0, ax=axes[0], label='dB')

    # MIDI rendered
    img1 = axes[1].imshow(cqt_midi, aspect='auto', origin='lower',
                          extent=[0, duration, 0, 84], cmap='magma',
                          vmin=-60, vmax=0)
    axes[1].set_title("MIDI Transcription (rendered)", fontsize=13)
    axes[1].set_ylabel("CQT Bin (semitone)")
    plt.colorbar(img1, ax=axes[1], label='dB')

    # Difference map: signed dB difference highlights where the transcription
    # is missing energy (red) or hallucinating it (blue)
    diff = cqt_orig - cqt_midi
    img2 = axes[2].imshow(diff, aspect='auto', origin='lower',
                          extent=[0, duration, 0, 84], cmap='RdBu_r',
                          vmin=-30, vmax=30)
    axes[2].set_title("Difference (Original − MIDI): Red=missing, Blue=extra", fontsize=13)
    axes[2].set_ylabel("CQT Bin (semitone)")
    axes[2].set_xlabel("Time (seconds)")
    plt.colorbar(img2, ax=axes[2], label='dB difference')

    overview_path = os.path.join(output_dir, "spectrogram_overview.png")
    plt.savefig(overview_path, dpi=150)
    plt.close()
    print(f"Saved: {overview_path}")

    # ===== Figure 2: Zoomed segments (first 30s, middle, last 30s) =====
    segments = [
        ("Opening (0-30s)", 0, 30),
        ("Middle", max(0, duration / 2 - 15), min(duration, duration / 2 + 15)),
        ("Ending", max(0, duration - 30), duration),
    ]

    for label, t_start, t_end in segments:
        # Convert the time window to CQT frame indices, clamped to the data
        frame_start = int(t_start * sr / hop)
        frame_end = int(t_end * sr / hop)
        frame_end = min(frame_end, cqt_orig.shape[1])

        if frame_end <= frame_start:
            # Piece shorter than the segment window — nothing to plot
            continue

        fig, axes = plt.subplots(3, 1, figsize=(18, 10), constrained_layout=True)
        fig.suptitle(f"Zoomed: {label} ({t_start:.0f}s - {t_end:.0f}s)", fontsize=14, fontweight='bold')

        seg_orig = cqt_orig[:, frame_start:frame_end]
        seg_midi = cqt_midi[:, frame_start:frame_end]

        img0 = axes[0].imshow(seg_orig, aspect='auto', origin='lower',
                              extent=[t_start, t_end, 0, 84], cmap='magma',
                              vmin=-60, vmax=0)
        axes[0].set_title("Original")
        axes[0].set_ylabel("CQT Bin")
        plt.colorbar(img0, ax=axes[0])

        img1 = axes[1].imshow(seg_midi, aspect='auto', origin='lower',
                              extent=[t_start, t_end, 0, 84], cmap='magma',
                              vmin=-60, vmax=0)
        axes[1].set_title("MIDI Transcription")
        axes[1].set_ylabel("CQT Bin")
        plt.colorbar(img1, ax=axes[1])

        seg_diff = seg_orig - seg_midi
        img2 = axes[2].imshow(seg_diff, aspect='auto', origin='lower',
                              extent=[t_start, t_end, 0, 84], cmap='RdBu_r',
                              vmin=-30, vmax=30)
        axes[2].set_title("Difference (Red=missing in MIDI, Blue=extra in MIDI)")
        axes[2].set_ylabel("CQT Bin")
        axes[2].set_xlabel("Time (seconds)")
        plt.colorbar(img2, ax=axes[2])

        # Make the label filesystem-safe for use in the output filename
        safe_label = label.replace(" ", "_").replace("(", "").replace(")", "").replace("-", "_")
        seg_path = os.path.join(output_dir, f"spectrogram_{safe_label}.png")
        plt.savefig(seg_path, dpi=150)
        plt.close()
        print(f"Saved: {seg_path}")

    # ===== Figure 3: Energy envelope comparison =====
    # Mean CQT energy per frame traces overall loudness over time for both signals
    fig, ax = plt.subplots(figsize=(18, 4), constrained_layout=True)
    energy_orig = np.mean(cqt_orig, axis=0)
    energy_midi = np.mean(cqt_midi, axis=0)
    n_frames = min(len(energy_orig), len(energy_midi))
    times = librosa.frames_to_time(np.arange(n_frames), sr=sr, hop_length=hop)
    ax.plot(times, energy_orig[:n_frames], label='Original', alpha=0.8, linewidth=0.5)
    ax.plot(times, energy_midi[:n_frames], label='MIDI Transcription', alpha=0.8, linewidth=0.5)
    ax.set_xlabel("Time (seconds)")
    ax.set_ylabel("Mean CQT Energy (dB)")
    ax.set_title("Energy Envelope Comparison")
    ax.legend()

    energy_path = os.path.join(output_dir, "energy_comparison.png")
    plt.savefig(energy_path, dpi=150)
    plt.close()
    print(f"Saved: {energy_path}")

    return [overview_path]
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def play_audio(audio_path, start=None, duration=None):
    """Play audio through system speakers using afplay (macOS).

    Args:
        audio_path: Path to the audio file to play.
        start: Optional start offset in seconds. afplay has no native seek,
            so the file is trimmed to a temporary wav first.
        duration: Optional playback length in seconds (used with ``start``).

    Fixes over previous version: the temporary trimmed wav is now always
    deleted (it was leaked before), the NamedTemporaryFile handle is closed
    before writing by path, and the printed time-range label now appears for
    ``start=0.0`` (the old ``if start and duration`` truthiness check hid it).
    """
    cmd = ["afplay", audio_path]
    tmp_path = None
    try:
        if start is not None:
            # afplay doesn't support start offset natively, so we trim with python
            import soundfile as sf
            data, sr = sf.read(audio_path)
            start_sample = int(start * sr)
            if duration:
                end_sample = start_sample + int(duration * sr)
            else:
                end_sample = len(data)
            # Clamp to the valid sample range so out-of-bounds requests are safe
            start_sample = max(0, min(start_sample, len(data)))
            end_sample = max(start_sample, min(end_sample, len(data)))
            segment = data[start_sample:end_sample]

            tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            tmp.close()  # close the open handle; sf.write reopens by path
            tmp_path = tmp.name
            sf.write(tmp_path, segment, sr)
            cmd = ["afplay", tmp_path]

        # `is not None` (not truthiness) so start=0.0 still prints the range
        if start is not None and duration is not None:
            range_label = f" [{start:.1f}s - {start + duration:.1f}s]"
        else:
            range_label = ""
        print(f"Playing: {audio_path}" + range_label)
        subprocess.run(cmd)
        print("Playback finished.")
    finally:
        # Always remove the temporary trimmed file, even if playback fails
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def play_comparison(original_audio, midi_path, start=None, duration=None):
    """Play original then MIDI rendering back-to-back for comparison.

    Args:
        original_audio: Path to the reference recording.
        midi_path: Path to the MIDI transcription.
        start: Optional start offset in seconds, forwarded to play_audio.
        duration: Optional playback length in seconds, forwarded to play_audio.

    Fixes over previous version: the rendered temp wav is cleaned up in a
    ``finally`` block (it leaked if playback raised), the NamedTemporaryFile
    handle is closed before rendering writes by path, and the unused
    ``soundfile`` import is gone.
    """
    # Render MIDI to a temporary wav; ensure cleanup even if playback fails
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()  # close the handle so the renderer can write by path
    rendered_wav = tmp.name
    try:
        print("Rendering MIDI to audio...")
        render_midi_to_audio(midi_path, rendered_wav, sample_rate=44100)

        print("\n--- Playing ORIGINAL ---")
        play_audio(original_audio, start=start, duration=duration)

        print("\n--- Playing MIDI TRANSCRIPTION ---")
        play_audio(rendered_wav, start=start, duration=duration)
    finally:
        if os.path.exists(rendered_wav):
            os.unlink(rendered_wav)
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def full_analysis(original_audio, midi_path, output_dir):
    """Run all analyses: spectral comparison + visual spectrograms.

    Returns the list of divergent regions found by the spectral comparison.
    """
    banner = "=" * 60
    print(banner)
    print("FULL AUDIO ANALYSIS")
    print(banner)

    # 1. Spectral comparison (metrics + report)
    print("\n[1/2] Running spectral comparison...")
    frame_mse, times, regions = spectral_comparison(original_audio, midi_path, output_dir)

    # 2. Visual spectrograms
    print("\n[2/2] Generating visual spectrograms...")
    images = generate_spectrograms(original_audio, midi_path, output_dir)

    # Summary of everything written to output_dir
    summary_lines = [
        "\n" + banner,
        f"Analysis complete! Results in: {output_dir}",
        " - spectral_report.txt (divergence metrics + problem regions)",
        " - spectrogram_overview.png (full comparison)",
        " - spectrogram_*.png (zoomed segments)",
        " - energy_comparison.png (energy envelopes)",
        " - midi_rendered.wav (MIDI rendered to audio for listening)",
        banner,
    ]
    for line in summary_lines:
        print(line)

    return regions
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def main():
    """CLI entry point: dispatch to the analysis sub-commands."""
    parser = argparse.ArgumentParser(description="Audio analysis toolkit for piano transcription")
    subparsers = parser.add_subparsers(dest="command", required=True)

    def _add_pair_command(name, help_text):
        # Shared shape: original + midi + --output-dir
        sub = subparsers.add_parser(name, help=help_text)
        sub.add_argument("original", help="Original audio file")
        sub.add_argument("midi", help="MIDI transcription file")
        sub.add_argument("--output-dir", default="./analysis", help="Output directory")
        return sub

    def _add_time_window(sub):
        # Shared optional playback window
        sub.add_argument("--start", type=float, default=None, help="Start time in seconds")
        sub.add_argument("--duration", type=float, default=None, help="Duration in seconds")

    # compare
    _add_pair_command("compare", "Spectral comparison")

    # visualize
    _add_pair_command("visualize", "Generate spectrogram images")

    # play
    p_play = subparsers.add_parser("play", help="Play an audio file")
    p_play.add_argument("audio", help="Audio file to play")
    _add_time_window(p_play)

    # play-both
    p_both = subparsers.add_parser("play-both", help="Play original then MIDI back-to-back")
    p_both.add_argument("original", help="Original audio file")
    p_both.add_argument("midi", help="MIDI transcription file")
    _add_time_window(p_both)

    # full
    _add_pair_command("full", "Run all analyses")

    args = parser.parse_args()

    command = args.command
    if command == "compare":
        spectral_comparison(args.original, args.midi, args.output_dir)
    elif command == "visualize":
        generate_spectrograms(args.original, args.midi, args.output_dir)
    elif command == "play":
        play_audio(args.audio, start=args.start, duration=args.duration)
    elif command == "play-both":
        play_comparison(args.original, args.midi, start=args.start, duration=args.duration)
    elif command == "full":
        full_analysis(args.original, args.midi, args.output_dir)


if __name__ == "__main__":
    main()
|
transcriber/optimize.py
CHANGED
|
@@ -15,6 +15,7 @@ def remove_leading_silence_notes(midi_data, y, sr):
|
|
| 15 |
|
| 16 |
Finds the first moment of real musical energy and removes any MIDI notes
|
| 17 |
before that point (typically microphone rumble / low-freq noise artifacts).
|
|
|
|
| 18 |
"""
|
| 19 |
midi_out = copy.deepcopy(midi_data)
|
| 20 |
|
|
@@ -28,17 +29,32 @@ def remove_leading_silence_notes(midi_data, y, sr):
|
|
| 28 |
if len(rms) == 0:
|
| 29 |
return midi_out, 0, 0.0
|
| 30 |
|
| 31 |
-
# Music starts when RMS first exceeds
|
|
|
|
| 32 |
max_rms = np.max(rms)
|
| 33 |
music_start = 0.0
|
| 34 |
for i, r in enumerate(rms):
|
| 35 |
-
if r > max_rms * 0.
|
| 36 |
music_start = i * 0.05
|
| 37 |
break
|
| 38 |
|
| 39 |
if music_start < 0.1:
|
| 40 |
return midi_out, 0, music_start
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
removed = 0
|
| 43 |
for instrument in midi_out.instruments:
|
| 44 |
filtered = []
|
|
@@ -53,7 +69,11 @@ def remove_leading_silence_notes(midi_data, y, sr):
|
|
| 53 |
|
| 54 |
|
| 55 |
def remove_trailing_silence_notes(midi_data, y, sr):
|
| 56 |
-
"""Remove notes that appear during the audio fade-out/silence at the end.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
midi_out = copy.deepcopy(midi_data)
|
| 58 |
|
| 59 |
hop = int(0.05 * sr)
|
|
@@ -66,13 +86,18 @@ def remove_trailing_silence_notes(midi_data, y, sr):
|
|
| 66 |
|
| 67 |
max_rms = np.max(rms)
|
| 68 |
|
| 69 |
-
# Find the last moment where RMS exceeds
|
|
|
|
| 70 |
music_end = len(y) / sr
|
| 71 |
for i in range(len(rms) - 1, -1, -1):
|
| 72 |
-
if rms[i] > max_rms * 0.
|
| 73 |
-
|
|
|
|
| 74 |
break
|
| 75 |
|
|
|
|
|
|
|
|
|
|
| 76 |
removed = 0
|
| 77 |
for instrument in midi_out.instruments:
|
| 78 |
filtered = []
|
|
@@ -150,13 +175,17 @@ def remove_low_energy_notes(midi_data, y, sr, hop_length=512):
|
|
| 150 |
def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
|
| 151 |
"""Remove notes that are harmonic doublings of louder lower notes.
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
Uses CQT energy to protect strong notes: if the CQT shows the note
|
| 157 |
-
has strong energy
|
| 158 |
-
|
| 159 |
-
co-occur with C5 but are genuinely played.
|
| 160 |
"""
|
| 161 |
midi_out = copy.deepcopy(midi_data)
|
| 162 |
removed = 0
|
|
@@ -165,6 +194,7 @@ def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
|
|
| 165 |
|
| 166 |
# Compute CQT for energy verification if audio provided
|
| 167 |
C_db = None
|
|
|
|
| 168 |
if y is not None:
|
| 169 |
N_BINS = 88 * 3
|
| 170 |
FMIN = librosa.note_to_hz('A0')
|
|
@@ -222,6 +252,47 @@ def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
|
|
| 222 |
to_remove.add(i)
|
| 223 |
break
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
instrument.notes = [n for k, n in enumerate(notes) if k not in to_remove]
|
| 226 |
removed += len(to_remove)
|
| 227 |
|
|
@@ -288,7 +359,7 @@ def remove_phantom_notes(midi_data, max_pitch=None):
|
|
| 288 |
return midi_out, removed
|
| 289 |
|
| 290 |
|
| 291 |
-
def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
|
| 292 |
"""Remove MIDI notes that form false-positive onsets not backed by audio.
|
| 293 |
|
| 294 |
Analysis shows 37 extra MIDI onsets cause the biggest F1 drag (precision=0.918).
|
|
@@ -302,12 +373,27 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
|
|
| 302 |
3. Short+quiet artifacts: onsets where every note is both short (<200ms)
|
| 303 |
and quiet (velocity < 50).
|
| 304 |
|
|
|
|
|
|
|
|
|
|
| 305 |
The filter first identifies which MIDI onsets already match audio onsets,
|
| 306 |
then only removes unmatched onsets meeting the above criteria.
|
| 307 |
"""
|
| 308 |
midi_out = copy.deepcopy(midi_data)
|
| 309 |
tolerance = 0.05
|
| 310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
|
| 312 |
onset_times = librosa.frames_to_time(
|
| 313 |
np.arange(len(onset_env)), sr=sr, hop_length=hop_length
|
|
@@ -359,12 +445,12 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
|
|
| 359 |
# Category 1: Chord fragment -- near a matched onset, but only if
|
| 360 |
# the onset has weak audio energy. Strong onsets near chords may be
|
| 361 |
# real grace notes or arpeggios.
|
| 362 |
-
if near_matched and strength < 2.0:
|
| 363 |
onsets_to_remove.add(j)
|
| 364 |
continue
|
| 365 |
|
| 366 |
# Category 2: Isolated ghost -- single note, low strength or far from audio
|
| 367 |
-
if len(onset_notes) == 1 and (strength < 1.5 or nearest_audio_ms > 100):
|
| 368 |
onsets_to_remove.add(j)
|
| 369 |
continue
|
| 370 |
|
|
@@ -377,14 +463,14 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
|
|
| 377 |
# low velocity (< 35), far from audio onset. These are rumble artifacts
|
| 378 |
# that survive the energy filter.
|
| 379 |
if (len(onset_notes) == 1 and onset_notes[0].pitch < 40
|
| 380 |
-
and onset_notes[0].velocity < 35 and nearest_audio_ms > 60):
|
| 381 |
onsets_to_remove.add(j)
|
| 382 |
continue
|
| 383 |
|
| 384 |
# Category 5: Multi-note onset far from any audio onset (> 120ms)
|
| 385 |
# with weak-to-moderate onset strength. These are chord-split artifacts
|
| 386 |
# or hallucinated events with no audio support.
|
| 387 |
-
if nearest_audio_ms > 120 and strength < 3.0:
|
| 388 |
onsets_to_remove.add(j)
|
| 389 |
continue
|
| 390 |
|
|
@@ -397,7 +483,7 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
|
|
| 397 |
# Category 7: Moderate distance from audio (> 70ms) with weak
|
| 398 |
# onset strength — catches near-miss hallucinations that are
|
| 399 |
# just outside the 50ms matching window.
|
| 400 |
-
if nearest_audio_ms > 70 and strength < 2.5:
|
| 401 |
onsets_to_remove.add(j)
|
| 402 |
continue
|
| 403 |
|
|
@@ -633,6 +719,66 @@ def limit_total_concurrent(midi_data, max_per_hand=4, hand_split=60):
|
|
| 633 |
return midi_out, trimmed
|
| 634 |
|
| 635 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand_split=60):
|
| 637 |
"""Extend MIDI note durations to match audio CQT energy decay.
|
| 638 |
|
|
@@ -658,6 +804,14 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
|
|
| 658 |
C_norm = (C_norm + 80.0) / 80.0
|
| 659 |
n_frames = C.shape[1]
|
| 660 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
# Pre-compute per-frame concurrent counts per hand (fast O(1) lookup)
|
| 662 |
right_count = np.zeros(n_frames, dtype=int)
|
| 663 |
left_count = np.zeros(n_frames, dtype=int)
|
|
@@ -671,6 +825,7 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
|
|
| 671 |
left_count[sf:ef] += 1
|
| 672 |
|
| 673 |
extended = 0
|
|
|
|
| 674 |
for inst in midi_out.instruments:
|
| 675 |
# Sort notes by start time for overlap checking
|
| 676 |
notes_sorted = sorted(inst.notes, key=lambda n: (n.pitch, n.start))
|
|
@@ -681,8 +836,13 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
|
|
| 681 |
continue
|
| 682 |
|
| 683 |
end_frame = min(n_frames, int(note.end * sr / hop_length))
|
| 684 |
-
|
| 685 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
|
| 687 |
# Don't extend into the next note at the same pitch
|
| 688 |
next_start_frame = max_extend
|
|
@@ -698,7 +858,7 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
|
|
| 698 |
for f in range(end_frame, min(max_extend, next_start_frame)):
|
| 699 |
lo = max(0, fund_bin - 1)
|
| 700 |
hi = min(N_BINS, fund_bin + 2)
|
| 701 |
-
if np.mean(C_norm[lo:hi, f]) >
|
| 702 |
# Check concurrent: this note isn't counted in hand_count
|
| 703 |
# beyond end_frame, so hand_count[f] >= max_per_hand means
|
| 704 |
# extending here would create max_per_hand + 1 concurrent
|
|
@@ -717,6 +877,8 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
|
|
| 717 |
hand_count[old_end_frame:new_end_frame] += 1
|
| 718 |
note.end = new_end
|
| 719 |
extended += 1
|
|
|
|
|
|
|
| 720 |
|
| 721 |
return midi_out, extended
|
| 722 |
|
|
@@ -1084,6 +1246,48 @@ def recover_missing_notes(midi_data, y, sr, hop_length=512, snap_onsets=None):
|
|
| 1084 |
return midi_out, recovered
|
| 1085 |
|
| 1086 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1087 |
def optimize(original_audio_path, midi_path, output_path=None):
|
| 1088 |
"""Full optimization pipeline."""
|
| 1089 |
if output_path is None:
|
|
@@ -1114,6 +1318,12 @@ def optimize(original_audio_path, midi_path, output_path=None):
|
|
| 1114 |
total_notes = sum(len(inst.notes) for inst in midi_data.instruments)
|
| 1115 |
print(f" {total_notes} MIDI notes")
|
| 1116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1117 |
# Step 0: Remove notes in leading silence (mic rumble artifacts)
|
| 1118 |
print("\nStep 0: Removing notes in leading silence...")
|
| 1119 |
midi_data, silence_removed, music_start = remove_leading_silence_notes(midi_data, y, sr)
|
|
@@ -1190,7 +1400,7 @@ def optimize(original_audio_path, midi_path, output_path=None):
|
|
| 1190 |
# Step 6b: Remove spurious false-positive onsets
|
| 1191 |
print("\nStep 6b: Removing spurious onsets (false positive cleanup)...")
|
| 1192 |
midi_data, spurious_notes, spurious_onsets = remove_spurious_onsets(
|
| 1193 |
-
midi_data, y, sr, ref_onsets, hop_length
|
| 1194 |
)
|
| 1195 |
print(f" Removed {spurious_notes} notes across {spurious_onsets} spurious onsets")
|
| 1196 |
|
|
@@ -1246,14 +1456,16 @@ def optimize(original_audio_path, midi_path, output_path=None):
|
|
| 1246 |
)
|
| 1247 |
print(f" Recovered {notes_recovered} notes from CQT energy")
|
| 1248 |
|
| 1249 |
-
# Step 8f: Playability filter — limit per-onset chord size
|
| 1250 |
-
|
| 1251 |
-
|
|
|
|
|
|
|
| 1252 |
print(f" Removed {playability_removed} excess chord notes")
|
| 1253 |
|
| 1254 |
-
# Step 8g: Limit total concurrent sounding notes
|
| 1255 |
-
print("\nStep 8g: Concurrent sounding limit (max
|
| 1256 |
-
midi_data, sustain_trimmed = limit_total_concurrent(midi_data, max_per_hand=
|
| 1257 |
print(f" Trimmed {sustain_trimmed} sustained notes to reduce pileup")
|
| 1258 |
|
| 1259 |
# Final metrics
|
|
|
|
| 15 |
|
| 16 |
Finds the first moment of real musical energy and removes any MIDI notes
|
| 17 |
before that point (typically microphone rumble / low-freq noise artifacts).
|
| 18 |
+
Always preserves the first detected MIDI note to prevent eating the opening.
|
| 19 |
"""
|
| 20 |
midi_out = copy.deepcopy(midi_data)
|
| 21 |
|
|
|
|
| 29 |
if len(rms) == 0:
|
| 30 |
return midi_out, 0, 0.0
|
| 31 |
|
| 32 |
+
# Music starts when RMS first exceeds 5% of the peak energy
|
| 33 |
+
# (reduced from 10% to avoid eating quiet openings)
|
| 34 |
max_rms = np.max(rms)
|
| 35 |
music_start = 0.0
|
| 36 |
for i, r in enumerate(rms):
|
| 37 |
+
if r > max_rms * 0.05:
|
| 38 |
music_start = i * 0.05
|
| 39 |
break
|
| 40 |
|
| 41 |
if music_start < 0.1:
|
| 42 |
return midi_out, 0, music_start
|
| 43 |
|
| 44 |
+
# Find the earliest MIDI note onset — always protect it
|
| 45 |
+
all_notes = sorted(
|
| 46 |
+
[n for inst in midi_out.instruments for n in inst.notes],
|
| 47 |
+
key=lambda n: n.start
|
| 48 |
+
)
|
| 49 |
+
earliest_onset = all_notes[0].start if all_notes else 0.0
|
| 50 |
+
|
| 51 |
+
# If the "silence" region would eat the first note, clamp music_start
|
| 52 |
+
if music_start > earliest_onset:
|
| 53 |
+
music_start = earliest_onset
|
| 54 |
+
|
| 55 |
+
if music_start < 0.1:
|
| 56 |
+
return midi_out, 0, music_start
|
| 57 |
+
|
| 58 |
removed = 0
|
| 59 |
for instrument in midi_out.instruments:
|
| 60 |
filtered = []
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
def remove_trailing_silence_notes(midi_data, y, sr):
|
| 72 |
+
"""Remove notes that appear during the audio fade-out/silence at the end.
|
| 73 |
+
|
| 74 |
+
Uses a 2% RMS threshold (reduced from 5%) and adds a 3-second protection
|
| 75 |
+
zone after the detected music end to preserve natural piano decay/sustain.
|
| 76 |
+
"""
|
| 77 |
midi_out = copy.deepcopy(midi_data)
|
| 78 |
|
| 79 |
hop = int(0.05 * sr)
|
|
|
|
| 86 |
|
| 87 |
max_rms = np.max(rms)
|
| 88 |
|
| 89 |
+
# Find the last moment where RMS exceeds 2% of peak (searching backwards)
|
| 90 |
+
# Reduced from 5% to preserve quiet endings, fade-outs, and final sustain
|
| 91 |
music_end = len(y) / sr
|
| 92 |
for i in range(len(rms) - 1, -1, -1):
|
| 93 |
+
if rms[i] > max_rms * 0.02:
|
| 94 |
+
# Add 3-second protection zone for natural piano decay
|
| 95 |
+
music_end = (i + 1) * 0.05 + 3.0
|
| 96 |
break
|
| 97 |
|
| 98 |
+
# Clamp to actual audio duration
|
| 99 |
+
music_end = min(music_end, len(y) / sr)
|
| 100 |
+
|
| 101 |
removed = 0
|
| 102 |
for instrument in midi_out.instruments:
|
| 103 |
filtered = []
|
|
|
|
| 175 |
def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
|
| 176 |
"""Remove notes that are harmonic doublings of louder lower notes.
|
| 177 |
|
| 178 |
+
Two-stage detector:
|
| 179 |
+
1. Pairwise: for notes at harmonic intervals (7, 12, 19, 24 semitones),
|
| 180 |
+
remove the upper note if it's clearly a harmonic ghost.
|
| 181 |
+
2. Spectral masking: when bass + melody overlap (two-hand texture),
|
| 182 |
+
check if upper notes can be explained by the harmonic series of
|
| 183 |
+
strong lower notes. This catches ghost notes that the pairwise
|
| 184 |
+
detector misses because they're at non-standard intervals.
|
| 185 |
|
| 186 |
Uses CQT energy to protect strong notes: if the CQT shows the note
|
| 187 |
+
has strong independent energy distinct from what the lower note's
|
| 188 |
+
harmonics would produce, it's a real played note.
|
|
|
|
| 189 |
"""
|
| 190 |
midi_out = copy.deepcopy(midi_data)
|
| 191 |
removed = 0
|
|
|
|
| 194 |
|
| 195 |
# Compute CQT for energy verification if audio provided
|
| 196 |
C_db = None
|
| 197 |
+
N_BINS = 0
|
| 198 |
if y is not None:
|
| 199 |
N_BINS = 88 * 3
|
| 200 |
FMIN = librosa.note_to_hz('A0')
|
|
|
|
| 252 |
to_remove.add(i)
|
| 253 |
break
|
| 254 |
|
| 255 |
+
# Stage 2: Spectral masking for two-hand texture
|
| 256 |
+
# When bass (< MIDI 55) and melody (>= MIDI 60) overlap, bass harmonics
|
| 257 |
+
# can produce ghost notes in the melody range. Check if a mid-range note
|
| 258 |
+
# is explainable as a harmonic partial of a concurrent bass note.
|
| 259 |
+
if C_db is not None:
|
| 260 |
+
remaining = [(k, n) for k, n in enumerate(notes) if k not in to_remove]
|
| 261 |
+
bass_notes = [(k, n) for k, n in remaining if n.pitch < 55]
|
| 262 |
+
mid_notes = [(k, n) for k, n in remaining if 55 <= n.pitch < 72]
|
| 263 |
+
|
| 264 |
+
for mid_k, mid_n in mid_notes:
|
| 265 |
+
if mid_k in to_remove:
|
| 266 |
+
continue
|
| 267 |
+
for bass_k, bass_n in bass_notes:
|
| 268 |
+
if abs(bass_n.start - mid_n.start) > 0.05:
|
| 269 |
+
continue
|
| 270 |
+
# Check if mid_n.pitch matches any harmonic partial of bass_n
|
| 271 |
+
# Harmonics: 2nd (+12), 3rd (+19), 4th (+24), 5th (+28), 6th (+31)
|
| 272 |
+
bass_pitch = bass_n.pitch
|
| 273 |
+
harmonic_pitches = {
|
| 274 |
+
bass_pitch + 12, # 2nd harmonic (octave)
|
| 275 |
+
bass_pitch + 19, # 3rd (octave + fifth)
|
| 276 |
+
bass_pitch + 24, # 4th (2 octaves)
|
| 277 |
+
bass_pitch + 28, # 5th (2 oct + major 3rd)
|
| 278 |
+
bass_pitch + 31, # 6th (2 oct + fifth)
|
| 279 |
+
}
|
| 280 |
+
if mid_n.pitch in harmonic_pitches:
|
| 281 |
+
# This mid note matches a bass harmonic — check if
|
| 282 |
+
# it has independent CQT energy above the harmonic level
|
| 283 |
+
mid_bin = (mid_n.pitch - 21) * 3 + 1
|
| 284 |
+
bass_bin = (bass_pitch - 21) * 3 + 1
|
| 285 |
+
if 0 <= mid_bin < N_BINS and 0 <= bass_bin < N_BINS:
|
| 286 |
+
sf = max(0, int(mid_n.start * sr / hop_length))
|
| 287 |
+
ef = min(C_db.shape[1], sf + max(1, int(0.15 * sr / hop_length)))
|
| 288 |
+
mid_energy = float(np.max(C_db[max(0, mid_bin-1):min(N_BINS, mid_bin+2), sf:ef]))
|
| 289 |
+
bass_energy = float(np.max(C_db[max(0, bass_bin-1):min(N_BINS, bass_bin+2), sf:ef]))
|
| 290 |
+
# If bass is much louder (>8dB) and mid note is quiet,
|
| 291 |
+
# it's likely a harmonic ghost
|
| 292 |
+
if bass_energy - mid_energy > 8.0 and mid_n.velocity < bass_n.velocity * 0.7:
|
| 293 |
+
to_remove.add(mid_k)
|
| 294 |
+
break
|
| 295 |
+
|
| 296 |
instrument.notes = [n for k, n in enumerate(notes) if k not in to_remove]
|
| 297 |
removed += len(to_remove)
|
| 298 |
|
|
|
|
| 359 |
return midi_out, removed
|
| 360 |
|
| 361 |
|
| 362 |
+
def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512, complexity='simple'):
|
| 363 |
"""Remove MIDI notes that form false-positive onsets not backed by audio.
|
| 364 |
|
| 365 |
Analysis shows 37 extra MIDI onsets cause the biggest F1 drag (precision=0.918).
|
|
|
|
| 373 |
3. Short+quiet artifacts: onsets where every note is both short (<200ms)
|
| 374 |
and quiet (velocity < 50).
|
| 375 |
|
| 376 |
+
For complex pieces, thresholds are relaxed to preserve legitimate dense
|
| 377 |
+
textures that might otherwise be classified as spurious.
|
| 378 |
+
|
| 379 |
The filter first identifies which MIDI onsets already match audio onsets,
|
| 380 |
then only removes unmatched onsets meeting the above criteria.
|
| 381 |
"""
|
| 382 |
midi_out = copy.deepcopy(midi_data)
|
| 383 |
tolerance = 0.05
|
| 384 |
|
| 385 |
+
# Complexity-adjusted thresholds: complex pieces are more permissive
|
| 386 |
+
# to preserve legitimate dense textures
|
| 387 |
+
if complexity == 'complex':
|
| 388 |
+
strength_scale = 1.5 # require stronger evidence to remove
|
| 389 |
+
dist_scale = 1.4 # require further from audio onset to remove
|
| 390 |
+
elif complexity == 'moderate':
|
| 391 |
+
strength_scale = 1.2
|
| 392 |
+
dist_scale = 1.2
|
| 393 |
+
else:
|
| 394 |
+
strength_scale = 1.0
|
| 395 |
+
dist_scale = 1.0
|
| 396 |
+
|
| 397 |
onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
|
| 398 |
onset_times = librosa.frames_to_time(
|
| 399 |
np.arange(len(onset_env)), sr=sr, hop_length=hop_length
|
|
|
|
| 445 |
# Category 1: Chord fragment -- near a matched onset, but only if
|
| 446 |
# the onset has weak audio energy. Strong onsets near chords may be
|
| 447 |
# real grace notes or arpeggios.
|
| 448 |
+
if near_matched and strength < 2.0 * strength_scale:
|
| 449 |
onsets_to_remove.add(j)
|
| 450 |
continue
|
| 451 |
|
| 452 |
# Category 2: Isolated ghost -- single note, low strength or far from audio
|
| 453 |
+
if len(onset_notes) == 1 and (strength < 1.5 * strength_scale or nearest_audio_ms > 100 * dist_scale):
|
| 454 |
onsets_to_remove.add(j)
|
| 455 |
continue
|
| 456 |
|
|
|
|
| 463 |
# low velocity (< 35), far from audio onset. These are rumble artifacts
|
| 464 |
# that survive the energy filter.
|
| 465 |
if (len(onset_notes) == 1 and onset_notes[0].pitch < 40
|
| 466 |
+
and onset_notes[0].velocity < 35 and nearest_audio_ms > 60 * dist_scale):
|
| 467 |
onsets_to_remove.add(j)
|
| 468 |
continue
|
| 469 |
|
| 470 |
# Category 5: Multi-note onset far from any audio onset (> 120ms)
|
| 471 |
# with weak-to-moderate onset strength. These are chord-split artifacts
|
| 472 |
# or hallucinated events with no audio support.
|
| 473 |
+
if nearest_audio_ms > 120 * dist_scale and strength < 3.0 * strength_scale:
|
| 474 |
onsets_to_remove.add(j)
|
| 475 |
continue
|
| 476 |
|
|
|
|
| 483 |
# Category 7: Moderate distance from audio (> 70ms) with weak
|
| 484 |
# onset strength — catches near-miss hallucinations that are
|
| 485 |
# just outside the 50ms matching window.
|
| 486 |
+
if nearest_audio_ms > 70 * dist_scale and strength < 2.5 * strength_scale:
|
| 487 |
onsets_to_remove.add(j)
|
| 488 |
continue
|
| 489 |
|
|
|
|
| 719 |
return midi_out, trimmed
|
| 720 |
|
| 721 |
|
| 722 |
+
def detect_sustain_regions(y, sr, hop_length=512):
    """Detect regions where the sustain pedal is likely engaged.

    Analyzes spectral flux and broadband energy decay. When the sustain pedal
    is held, notes ring longer and the spectral energy decays slowly instead
    of dropping abruptly at note release. Detects this by looking for:
    1. Low spectral flux (sustained timbre, no new attacks)
    2. Slow energy decay (notes ringing through pedal)

    Parameters
    ----------
    y : np.ndarray
        Mono audio signal.
    sr : int
        Sample rate of ``y``.
    hop_length : int
        Analysis hop length in samples (frame rate = sr / hop_length).

    Returns a boolean array (per frame) indicating sustained regions.
    """
    # Spectral flux: frame-to-frame rate of spectral change. Low flux means
    # the timbre is sustained with no new note attacks.
    S = np.abs(librosa.stft(y, hop_length=hop_length))
    flux = np.sqrt(np.mean(np.diff(S, axis=1) ** 2, axis=0))
    flux = np.concatenate([[0], flux])  # pad to match frame count

    # Broadband RMS energy per frame.
    rms = librosa.feature.rms(y=y, hop_length=hop_length)[0]

    # Normalize both toward [0, 1]. The 95th percentile (rather than the max)
    # keeps flux normalization robust to one outlier attack.
    flux_norm = flux / (np.percentile(flux, 95) + 1e-8)
    rms_norm = rms / (np.max(rms) + 1e-8)

    # STFT and RMS framing can differ by a frame or two due to padding; align.
    n_frames = min(len(flux_norm), len(rms_norm))
    flux_norm = flux_norm[:n_frames]
    rms_norm = rms_norm[:n_frames]

    # Sustain pedal indicators:
    # - Low spectral flux (< 30th percentile) = sustained sound, not new attacks
    # - Moderate+ energy (> 10% of peak) = notes are still ringing
    flux_thresh = np.percentile(flux_norm, 30)
    sustain_mask = (flux_norm < flux_thresh) & (rms_norm > 0.10)

    # Smooth: close 200ms gaps, remove blips shorter than 300ms
    close_frames = max(1, int(0.2 * sr / hop_length))
    min_region = max(1, int(0.3 * sr / hop_length))

    # Morphological closing: fill a gap frame when sustained frames exist on
    # both sides within close_frames.
    for i in range(1, n_frames - 1):
        if not sustain_mask[i]:
            before = any(sustain_mask[max(0, i - close_frames):i])
            after = any(sustain_mask[i + 1:min(n_frames, i + close_frames + 1)])
            if before and after:
                sustain_mask[i] = True

    # Remove short blips (regions shorter than min_region frames).
    in_region = False
    start = 0
    for i in range(n_frames):
        if sustain_mask[i] and not in_region:
            start = i
            in_region = True
        elif not sustain_mask[i] and in_region:
            if i - start < min_region:
                sustain_mask[start:i] = False
            in_region = False
    # Bug fix: a region still open at the final frame was previously never
    # length-checked, so a sub-300ms blip at the end of the audio survived.
    if in_region and n_frames - start < min_region:
        sustain_mask[start:n_frames] = False

    return sustain_mask
|
| 780 |
+
|
| 781 |
+
|
| 782 |
def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand_split=60):
|
| 783 |
"""Extend MIDI note durations to match audio CQT energy decay.
|
| 784 |
|
|
|
|
| 804 |
C_norm = (C_norm + 80.0) / 80.0
|
| 805 |
n_frames = C.shape[1]
|
| 806 |
|
| 807 |
+
# Detect sustain pedal regions for longer extension allowance
|
| 808 |
+
sustain_mask = detect_sustain_regions(y, sr, hop_length)
|
| 809 |
+
# Pad/trim to match CQT frame count
|
| 810 |
+
if len(sustain_mask) < n_frames:
|
| 811 |
+
sustain_mask = np.concatenate([sustain_mask, np.zeros(n_frames - len(sustain_mask), dtype=bool)])
|
| 812 |
+
else:
|
| 813 |
+
sustain_mask = sustain_mask[:n_frames]
|
| 814 |
+
|
| 815 |
# Pre-compute per-frame concurrent counts per hand (fast O(1) lookup)
|
| 816 |
right_count = np.zeros(n_frames, dtype=int)
|
| 817 |
left_count = np.zeros(n_frames, dtype=int)
|
|
|
|
| 825 |
left_count[sf:ef] += 1
|
| 826 |
|
| 827 |
extended = 0
|
| 828 |
+
sustain_extended = 0
|
| 829 |
for inst in midi_out.instruments:
|
| 830 |
# Sort notes by start time for overlap checking
|
| 831 |
notes_sorted = sorted(inst.notes, key=lambda n: (n.pitch, n.start))
|
|
|
|
| 836 |
continue
|
| 837 |
|
| 838 |
end_frame = min(n_frames, int(note.end * sr / hop_length))
|
| 839 |
+
|
| 840 |
+
# In sustain regions, allow longer extension (4s) and lower threshold
|
| 841 |
+
in_sustain = end_frame < n_frames and sustain_mask[min(end_frame, n_frames - 1)]
|
| 842 |
+
max_ext_seconds = 4.0 if in_sustain else 2.0
|
| 843 |
+
energy_thresh = 0.15 if in_sustain else 0.20
|
| 844 |
+
|
| 845 |
+
max_extend = min(n_frames, end_frame + int(max_ext_seconds * sr / hop_length))
|
| 846 |
|
| 847 |
# Don't extend into the next note at the same pitch
|
| 848 |
next_start_frame = max_extend
|
|
|
|
| 858 |
for f in range(end_frame, min(max_extend, next_start_frame)):
|
| 859 |
lo = max(0, fund_bin - 1)
|
| 860 |
hi = min(N_BINS, fund_bin + 2)
|
| 861 |
+
if np.mean(C_norm[lo:hi, f]) > energy_thresh:
|
| 862 |
# Check concurrent: this note isn't counted in hand_count
|
| 863 |
# beyond end_frame, so hand_count[f] >= max_per_hand means
|
| 864 |
# extending here would create max_per_hand + 1 concurrent
|
|
|
|
| 877 |
hand_count[old_end_frame:new_end_frame] += 1
|
| 878 |
note.end = new_end
|
| 879 |
extended += 1
|
| 880 |
+
if in_sustain:
|
| 881 |
+
sustain_extended += 1
|
| 882 |
|
| 883 |
return midi_out, extended
|
| 884 |
|
|
|
|
| 1246 |
return midi_out, recovered
|
| 1247 |
|
| 1248 |
|
| 1249 |
+
def estimate_complexity(midi_data, audio_duration):
    """Estimate piece complexity to adjust filter aggressiveness.

    Returns a dict with:
    - note_density: notes per second
    - avg_polyphony: average concurrent notes at any onset
    - complexity: 'simple' (<4 n/s), 'moderate' (4-8), 'complex' (>8)

    Complex pieces need less aggressive ghost removal and wider tolerance
    for concurrent notes, since dense textures are intentional.
    """
    from bisect import bisect_left, bisect_right

    all_notes = sorted(
        (n for inst in midi_data.instruments for n in inst.notes),
        key=lambda n: n.start,
    )
    if not all_notes or audio_duration < 1:
        return {'note_density': 0, 'avg_polyphony': 1, 'complexity': 'simple'}

    note_density = len(all_notes) / audio_duration

    # Average polyphony: notes starting within a strict +/-30ms window of each
    # distinct onset. Binary search over the sorted start times replaces the
    # previous O(onsets * notes) scan with O(n log n) for dense pieces.
    starts = [n.start for n in all_notes]
    onsets = sorted(set(round(s, 3) for s in starts))
    polyphonies = []
    for onset in onsets:
        # Strict bounds match the original |start - onset| < 0.03 test:
        # bisect_right excludes starts equal to onset - 0.03, bisect_left
        # excludes starts equal to onset + 0.03.
        lo = bisect_right(starts, onset - 0.03)
        hi = bisect_left(starts, onset + 0.03)
        polyphonies.append(hi - lo)
    avg_polyphony = np.mean(polyphonies) if polyphonies else 1

    if note_density > 8 or avg_polyphony > 3.5:
        complexity = 'complex'
    elif note_density > 4 or avg_polyphony > 2.5:
        complexity = 'moderate'
    else:
        complexity = 'simple'

    return {
        'note_density': note_density,
        'avg_polyphony': avg_polyphony,
        'complexity': complexity,
    }
|
| 1289 |
+
|
| 1290 |
+
|
| 1291 |
def optimize(original_audio_path, midi_path, output_path=None):
|
| 1292 |
"""Full optimization pipeline."""
|
| 1293 |
if output_path is None:
|
|
|
|
| 1318 |
total_notes = sum(len(inst.notes) for inst in midi_data.instruments)
|
| 1319 |
print(f" {total_notes} MIDI notes")
|
| 1320 |
|
| 1321 |
+
# Estimate complexity to adjust filter thresholds
|
| 1322 |
+
complexity_info = estimate_complexity(midi_data, audio_duration)
|
| 1323 |
+
complexity = complexity_info['complexity']
|
| 1324 |
+
print(f" Complexity: {complexity} (density={complexity_info['note_density']:.1f} n/s, "
|
| 1325 |
+
f"polyphony={complexity_info['avg_polyphony']:.1f})")
|
| 1326 |
+
|
| 1327 |
# Step 0: Remove notes in leading silence (mic rumble artifacts)
|
| 1328 |
print("\nStep 0: Removing notes in leading silence...")
|
| 1329 |
midi_data, silence_removed, music_start = remove_leading_silence_notes(midi_data, y, sr)
|
|
|
|
| 1400 |
# Step 6b: Remove spurious false-positive onsets
|
| 1401 |
print("\nStep 6b: Removing spurious onsets (false positive cleanup)...")
|
| 1402 |
midi_data, spurious_notes, spurious_onsets = remove_spurious_onsets(
|
| 1403 |
+
midi_data, y, sr, ref_onsets, hop_length, complexity=complexity
|
| 1404 |
)
|
| 1405 |
print(f" Removed {spurious_notes} notes across {spurious_onsets} spurious onsets")
|
| 1406 |
|
|
|
|
| 1456 |
)
|
| 1457 |
print(f" Recovered {notes_recovered} notes from CQT energy")
|
| 1458 |
|
| 1459 |
+
# Step 8f: Playability filter — limit per-onset chord size
|
| 1460 |
+
# Complex pieces get 5 notes/hand to preserve dense voicings
|
| 1461 |
+
max_hand = 5 if complexity == 'complex' else 4
|
| 1462 |
+
print(f"\nStep 8f: Playability filter (max {max_hand} notes per hand per chord)...")
|
| 1463 |
+
midi_data, playability_removed = limit_concurrent_notes(midi_data, max_per_hand=max_hand)
|
| 1464 |
print(f" Removed {playability_removed} excess chord notes")
|
| 1465 |
|
| 1466 |
+
# Step 8g: Limit total concurrent sounding notes
|
| 1467 |
+
print(f"\nStep 8g: Concurrent sounding limit (max {max_hand} per hand)...")
|
| 1468 |
+
midi_data, sustain_trimmed = limit_total_concurrent(midi_data, max_per_hand=max_hand)
|
| 1469 |
print(f" Trimmed {sustain_trimmed} sustained notes to reduce pileup")
|
| 1470 |
|
| 1471 |
# Final metrics
|