Ewan Claude Opus 4.6 committed on
Commit
1646c97
·
1 Parent(s): 7dd6b8a

Improve transcription fidelity: trailing notes, sustain pedal, complexity tuning

Browse files

- Fix trailing silence threshold (5% → 2% RMS + 3s protection zone) — recovers 7s of cut-off endings
- Fix leading silence threshold (10% → 5% + always protect first note)
- Add spectral masking in harmonic ghost removal for two-hand texture
- Add sustain pedal detection from audio spectral flux analysis
- Add complexity-aware tuning (note density + polyphony estimation)
- Add audio analysis toolkit (spectral comparison, CQT visualization)
- UI: larger transport icons, loop label, 5s skip labels

Jewish Bride spectral MSE: -40.3% overall, -73.8% at 95th percentile

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

.gitignore CHANGED
@@ -8,3 +8,5 @@ transcriber/diagnose_*.py
8
  transcriber/simulate_*.py
9
  __pycache__
10
  *.pyc
 
 
 
8
  transcriber/simulate_*.py
9
  __pycache__
10
  *.pyc
11
+ transcriber/soundfonts/
12
+ transcriber/benchmarks/
app/src/components/Controls.jsx CHANGED
@@ -62,7 +62,7 @@ export default function Controls({
62
  <div className="controls-main">
63
  <div className="controls-left">
64
  <div className="brand-mark">
65
- <OctopusLogo size={28} />
66
  <span className="brand-name">Mr. Octopus</span>
67
  </div>
68
  {fileName && (
@@ -76,19 +76,20 @@ export default function Controls({
76
  onClick={() => seekTo(Math.max(0, displayTime - 5))}
77
  title="Back 5s"
78
  >
79
- <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor">
80
  <path d="M11 18V6l-8.5 6 8.5 6zm.5-6l8.5 6V6l-8.5 6z" />
81
  </svg>
 
82
  </button>
83
 
84
  <button className="play-btn" onClick={togglePlayPause}>
85
  {isPlaying ? (
86
- <svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor">
87
  <rect x="6" y="4" width="4" height="16" rx="1" />
88
  <rect x="14" y="4" width="4" height="16" rx="1" />
89
  </svg>
90
  ) : (
91
- <svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor">
92
  <path d="M8 5v14l11-7z" />
93
  </svg>
94
  )}
@@ -99,15 +100,17 @@ export default function Controls({
99
  onClick={() => seekTo(Math.min(totalDuration, displayTime + 5))}
100
  title="Forward 5s"
101
  >
102
- <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor">
103
  <path d="M4 18l8.5-6L4 6v12zm9-12v12l8.5-6L13 6z" />
104
  </svg>
 
105
  </button>
106
  </div>
107
 
108
  <div className="controls-right">
109
  {/* Loop controls */}
110
  <div className="loop-controls">
 
111
  {!isLooping ? (
112
  <>
113
  <button
@@ -138,12 +141,6 @@ export default function Controls({
138
  )}
139
  </div>
140
 
141
- {onNewSong && (
142
- <button className="btn btn-new" onClick={onNewSong}>
143
- + New Song
144
- </button>
145
- )}
146
-
147
  <div className="tempo-control">
148
  <span className="tempo-label">Speed</span>
149
  <input
@@ -155,6 +152,12 @@ export default function Controls({
155
  />
156
  <span className="tempo-value">{tempo}%</span>
157
  </div>
 
 
 
 
 
 
158
  </div>
159
  </div>
160
 
 
62
  <div className="controls-main">
63
  <div className="controls-left">
64
  <div className="brand-mark">
65
+ <OctopusLogo size={32} />
66
  <span className="brand-name">Mr. Octopus</span>
67
  </div>
68
  {fileName && (
 
76
  onClick={() => seekTo(Math.max(0, displayTime - 5))}
77
  title="Back 5s"
78
  >
79
+ <svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">
80
  <path d="M11 18V6l-8.5 6 8.5 6zm.5-6l8.5 6V6l-8.5 6z" />
81
  </svg>
82
+ <span className="transport-label">5s</span>
83
  </button>
84
 
85
  <button className="play-btn" onClick={togglePlayPause}>
86
  {isPlaying ? (
87
+ <svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor">
88
  <rect x="6" y="4" width="4" height="16" rx="1" />
89
  <rect x="14" y="4" width="4" height="16" rx="1" />
90
  </svg>
91
  ) : (
92
+ <svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor">
93
  <path d="M8 5v14l11-7z" />
94
  </svg>
95
  )}
 
100
  onClick={() => seekTo(Math.min(totalDuration, displayTime + 5))}
101
  title="Forward 5s"
102
  >
103
+ <svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">
104
  <path d="M4 18l8.5-6L4 6v12zm9-12v12l8.5-6L13 6z" />
105
  </svg>
106
+ <span className="transport-label">5s</span>
107
  </button>
108
  </div>
109
 
110
  <div className="controls-right">
111
  {/* Loop controls */}
112
  <div className="loop-controls">
113
+ <span className="loop-label">Loop</span>
114
  {!isLooping ? (
115
  <>
116
  <button
 
141
  )}
142
  </div>
143
 
 
 
 
 
 
 
144
  <div className="tempo-control">
145
  <span className="tempo-label">Speed</span>
146
  <input
 
152
  />
153
  <span className="tempo-value">{tempo}%</span>
154
  </div>
155
+
156
+ {onNewSong && (
157
+ <button className="btn btn-new" onClick={onNewSong}>
158
+ + New Song
159
+ </button>
160
+ )}
161
  </div>
162
  </div>
163
 
app/src/index.css CHANGED
@@ -228,18 +228,18 @@ body {
228
  }
229
 
230
  .controls-main {
231
- height: 56px;
232
  display: flex;
233
  align-items: center;
234
  justify-content: space-between;
235
- padding: 0 20px;
236
- gap: 16px;
237
  }
238
 
239
  .controls-left {
240
  display: flex;
241
  align-items: center;
242
- gap: 14px;
243
  min-width: 0;
244
  flex: 1;
245
  }
@@ -252,7 +252,7 @@ body {
252
  }
253
 
254
  .brand-name {
255
- font-size: 15px;
256
  font-weight: 700;
257
  background: linear-gradient(135deg, #a78bfa, #06b6d4);
258
  -webkit-background-clip: text;
@@ -263,13 +263,13 @@ body {
263
  }
264
 
265
  .file-name {
266
- font-size: 13px;
267
  color: var(--text-muted);
268
  white-space: nowrap;
269
  overflow: hidden;
270
  text-overflow: ellipsis;
271
- max-width: 200px;
272
- padding-left: 14px;
273
  border-left: 1.5px solid var(--border);
274
  font-weight: 500;
275
  }
@@ -277,73 +277,84 @@ body {
277
  .controls-center {
278
  display: flex;
279
  align-items: center;
280
- gap: 6px;
281
  flex-shrink: 0;
282
  }
283
 
284
  .controls-right {
285
  display: flex;
286
  align-items: center;
287
- gap: 16px;
288
  flex: 1;
289
  justify-content: flex-end;
290
  }
291
 
292
- /* Transport buttons */
293
  .transport-btn {
294
- width: 36px;
295
- height: 36px;
296
- border-radius: 8px;
297
  border: none;
298
  background: var(--surface-2);
299
  color: var(--text-muted);
300
  cursor: pointer;
301
  display: flex;
 
302
  align-items: center;
303
  justify-content: center;
 
304
  transition: all 0.15s;
 
305
  }
306
 
307
  .transport-btn:hover {
308
  background: var(--surface-3);
309
  color: var(--text);
 
 
 
 
 
 
 
 
310
  }
311
 
312
  /* Play button — bold and prominent */
313
  .play-btn {
314
- width: 48px;
315
- height: 48px;
316
  border-radius: 50%;
317
  border: none;
318
  background: var(--primary);
319
  color: white;
320
- font-size: 18px;
321
  cursor: pointer;
322
  transition: all 0.2s;
323
  display: flex;
324
  align-items: center;
325
  justify-content: center;
326
- box-shadow: 0 0 20px var(--primary-glow);
327
  }
328
 
329
  .play-btn:hover {
330
  background: var(--primary-hover);
331
- box-shadow: 0 0 30px var(--primary-glow);
332
- transform: scale(1.05);
333
  }
334
 
335
  .play-btn:active {
336
- transform: scale(0.97);
337
  }
338
 
339
- /* + New Song button */
340
  .btn {
341
  background: var(--surface-2);
342
  color: var(--text-muted);
343
  border: 1.5px solid var(--border);
344
  border-radius: 8px;
345
- padding: 7px 16px;
346
- font-size: 12px;
347
  font-weight: 600;
348
  font-family: inherit;
349
  cursor: pointer;
@@ -373,33 +384,33 @@ body {
373
  .tempo-control {
374
  display: flex;
375
  align-items: center;
376
- gap: 8px;
377
  background: var(--surface-2);
378
- padding: 6px 14px;
379
- border-radius: 8px;
380
  border: 1px solid var(--border);
381
  }
382
 
383
  .tempo-label {
384
- font-size: 11px;
385
- font-weight: 600;
386
- color: var(--text-subtle);
387
  text-transform: uppercase;
388
  letter-spacing: 0.5px;
389
  white-space: nowrap;
390
  }
391
 
392
  .tempo-value {
393
- font-size: 13px;
394
- font-weight: 600;
395
- color: var(--text-muted);
396
- min-width: 36px;
397
  text-align: right;
398
  font-variant-numeric: tabular-nums;
399
  }
400
 
401
  .tempo-control input[type='range'] {
402
- width: 80px;
403
  }
404
 
405
  /* ========================================
@@ -409,16 +420,16 @@ body {
409
  .timeline {
410
  display: flex;
411
  align-items: center;
412
- gap: 12px;
413
- padding: 0 20px 10px;
414
  }
415
 
416
  .timeline-time {
417
- font-size: 12px;
418
  font-weight: 600;
419
  color: var(--text-muted);
420
  font-variant-numeric: tabular-nums;
421
- min-width: 36px;
422
  }
423
 
424
  .timeline-time:last-child {
@@ -432,8 +443,8 @@ body {
432
 
433
  .timeline-track input[type='range'] {
434
  width: 100%;
435
- height: 6px;
436
- border-radius: 3px;
437
  -webkit-appearance: none;
438
  appearance: none;
439
  outline: none;
@@ -442,19 +453,19 @@ body {
442
  }
443
 
444
  .timeline-track input[type='range']:hover {
445
- height: 8px;
446
  }
447
 
448
  .timeline-track input[type='range']::-webkit-slider-thumb {
449
  -webkit-appearance: none;
450
  appearance: none;
451
- width: 14px;
452
- height: 14px;
453
  border-radius: 50%;
454
  background: var(--primary-hover);
455
  cursor: pointer;
456
  border: 2px solid white;
457
- box-shadow: 0 0 8px var(--primary-glow);
458
  transition: transform 0.1s;
459
  }
460
 
@@ -463,13 +474,13 @@ body {
463
  }
464
 
465
  .timeline-track input[type='range']::-moz-range-thumb {
466
- width: 14px;
467
- height: 14px;
468
  border-radius: 50%;
469
  background: var(--primary-hover);
470
  cursor: pointer;
471
  border: 2px solid white;
472
- box-shadow: 0 0 8px var(--primary-glow);
473
  }
474
 
475
  /* General range sliders (for tempo) */
@@ -477,8 +488,8 @@ input[type='range'] {
477
  -webkit-appearance: none;
478
  appearance: none;
479
  background: var(--border);
480
- height: 4px;
481
- border-radius: 2px;
482
  outline: none;
483
  cursor: pointer;
484
  }
@@ -486,8 +497,8 @@ input[type='range'] {
486
  input[type='range']::-webkit-slider-thumb {
487
  -webkit-appearance: none;
488
  appearance: none;
489
- width: 14px;
490
- height: 14px;
491
  border-radius: 50%;
492
  background: var(--primary);
493
  cursor: pointer;
@@ -500,8 +511,8 @@ input[type='range']::-webkit-slider-thumb:hover {
500
  }
501
 
502
  input[type='range']::-moz-range-thumb {
503
- width: 14px;
504
- height: 14px;
505
  border-radius: 50%;
506
  background: var(--primary);
507
  cursor: pointer;
@@ -512,16 +523,25 @@ input[type='range']::-moz-range-thumb {
512
  .loop-controls {
513
  display: flex;
514
  align-items: center;
515
- gap: 4px;
 
 
 
 
 
 
 
 
 
516
  }
517
 
518
  .btn-loop {
519
- min-width: 32px;
520
  text-align: center;
521
  font-weight: 700;
522
- font-size: 12px;
523
- padding: 6px 10px;
524
- border-radius: 6px;
525
  font-family: inherit;
526
  letter-spacing: 0.3px;
527
  }
@@ -538,8 +558,8 @@ input[type='range']::-moz-range-thumb {
538
  }
539
 
540
  .loop-x {
541
- margin-left: 6px;
542
- font-size: 14px;
543
  opacity: 0.6;
544
  }
545
 
 
228
  }
229
 
230
  .controls-main {
231
+ height: 72px;
232
  display: flex;
233
  align-items: center;
234
  justify-content: space-between;
235
+ padding: 0 24px;
236
+ gap: 20px;
237
  }
238
 
239
  .controls-left {
240
  display: flex;
241
  align-items: center;
242
+ gap: 16px;
243
  min-width: 0;
244
  flex: 1;
245
  }
 
252
  }
253
 
254
  .brand-name {
255
+ font-size: 16px;
256
  font-weight: 700;
257
  background: linear-gradient(135deg, #a78bfa, #06b6d4);
258
  -webkit-background-clip: text;
 
263
  }
264
 
265
  .file-name {
266
+ font-size: 14px;
267
  color: var(--text-muted);
268
  white-space: nowrap;
269
  overflow: hidden;
270
  text-overflow: ellipsis;
271
+ max-width: 240px;
272
+ padding-left: 16px;
273
  border-left: 1.5px solid var(--border);
274
  font-weight: 500;
275
  }
 
277
  .controls-center {
278
  display: flex;
279
  align-items: center;
280
+ gap: 10px;
281
  flex-shrink: 0;
282
  }
283
 
284
  .controls-right {
285
  display: flex;
286
  align-items: center;
287
+ gap: 20px;
288
  flex: 1;
289
  justify-content: flex-end;
290
  }
291
 
292
+ /* Transport buttons (skip back/forward) */
293
  .transport-btn {
294
+ width: 48px;
295
+ height: 48px;
296
+ border-radius: 10px;
297
  border: none;
298
  background: var(--surface-2);
299
  color: var(--text-muted);
300
  cursor: pointer;
301
  display: flex;
302
+ flex-direction: column;
303
  align-items: center;
304
  justify-content: center;
305
+ gap: 2px;
306
  transition: all 0.15s;
307
+ border: 1px solid var(--border);
308
  }
309
 
310
  .transport-btn:hover {
311
  background: var(--surface-3);
312
  color: var(--text);
313
+ border-color: var(--border-hover);
314
+ }
315
+
316
+ .transport-label {
317
+ font-size: 10px;
318
+ font-weight: 600;
319
+ letter-spacing: 0.3px;
320
+ opacity: 0.7;
321
  }
322
 
323
  /* Play button — bold and prominent */
324
  .play-btn {
325
+ width: 56px;
326
+ height: 56px;
327
  border-radius: 50%;
328
  border: none;
329
  background: var(--primary);
330
  color: white;
331
+ font-size: 20px;
332
  cursor: pointer;
333
  transition: all 0.2s;
334
  display: flex;
335
  align-items: center;
336
  justify-content: center;
337
+ box-shadow: 0 0 24px var(--primary-glow);
338
  }
339
 
340
  .play-btn:hover {
341
  background: var(--primary-hover);
342
+ box-shadow: 0 0 36px var(--primary-glow);
343
+ transform: scale(1.06);
344
  }
345
 
346
  .play-btn:active {
347
+ transform: scale(0.96);
348
  }
349
 
350
+ /* General button */
351
  .btn {
352
  background: var(--surface-2);
353
  color: var(--text-muted);
354
  border: 1.5px solid var(--border);
355
  border-radius: 8px;
356
+ padding: 8px 18px;
357
+ font-size: 13px;
358
  font-weight: 600;
359
  font-family: inherit;
360
  cursor: pointer;
 
384
  .tempo-control {
385
  display: flex;
386
  align-items: center;
387
+ gap: 10px;
388
  background: var(--surface-2);
389
+ padding: 8px 16px;
390
+ border-radius: 10px;
391
  border: 1px solid var(--border);
392
  }
393
 
394
  .tempo-label {
395
+ font-size: 12px;
396
+ font-weight: 700;
397
+ color: var(--text-muted);
398
  text-transform: uppercase;
399
  letter-spacing: 0.5px;
400
  white-space: nowrap;
401
  }
402
 
403
  .tempo-value {
404
+ font-size: 14px;
405
+ font-weight: 700;
406
+ color: var(--text);
407
+ min-width: 40px;
408
  text-align: right;
409
  font-variant-numeric: tabular-nums;
410
  }
411
 
412
  .tempo-control input[type='range'] {
413
+ width: 100px;
414
  }
415
 
416
  /* ========================================
 
420
  .timeline {
421
  display: flex;
422
  align-items: center;
423
+ gap: 14px;
424
+ padding: 0 24px 12px;
425
  }
426
 
427
  .timeline-time {
428
+ font-size: 13px;
429
  font-weight: 600;
430
  color: var(--text-muted);
431
  font-variant-numeric: tabular-nums;
432
+ min-width: 40px;
433
  }
434
 
435
  .timeline-time:last-child {
 
443
 
444
  .timeline-track input[type='range'] {
445
  width: 100%;
446
+ height: 8px;
447
+ border-radius: 4px;
448
  -webkit-appearance: none;
449
  appearance: none;
450
  outline: none;
 
453
  }
454
 
455
  .timeline-track input[type='range']:hover {
456
+ height: 10px;
457
  }
458
 
459
  .timeline-track input[type='range']::-webkit-slider-thumb {
460
  -webkit-appearance: none;
461
  appearance: none;
462
+ width: 16px;
463
+ height: 16px;
464
  border-radius: 50%;
465
  background: var(--primary-hover);
466
  cursor: pointer;
467
  border: 2px solid white;
468
+ box-shadow: 0 0 10px var(--primary-glow);
469
  transition: transform 0.1s;
470
  }
471
 
 
474
  }
475
 
476
  .timeline-track input[type='range']::-moz-range-thumb {
477
+ width: 16px;
478
+ height: 16px;
479
  border-radius: 50%;
480
  background: var(--primary-hover);
481
  cursor: pointer;
482
  border: 2px solid white;
483
+ box-shadow: 0 0 10px var(--primary-glow);
484
  }
485
 
486
  /* General range sliders (for tempo) */
 
488
  -webkit-appearance: none;
489
  appearance: none;
490
  background: var(--border);
491
+ height: 5px;
492
+ border-radius: 3px;
493
  outline: none;
494
  cursor: pointer;
495
  }
 
497
  input[type='range']::-webkit-slider-thumb {
498
  -webkit-appearance: none;
499
  appearance: none;
500
+ width: 16px;
501
+ height: 16px;
502
  border-radius: 50%;
503
  background: var(--primary);
504
  cursor: pointer;
 
511
  }
512
 
513
  input[type='range']::-moz-range-thumb {
514
+ width: 16px;
515
+ height: 16px;
516
  border-radius: 50%;
517
  background: var(--primary);
518
  cursor: pointer;
 
523
  .loop-controls {
524
  display: flex;
525
  align-items: center;
526
+ gap: 6px;
527
+ }
528
+
529
+ .loop-label {
530
+ font-size: 12px;
531
+ font-weight: 700;
532
+ color: var(--text-muted);
533
+ text-transform: uppercase;
534
+ letter-spacing: 0.5px;
535
+ margin-right: 2px;
536
  }
537
 
538
  .btn-loop {
539
+ min-width: 36px;
540
  text-align: center;
541
  font-weight: 700;
542
+ font-size: 13px;
543
+ padding: 7px 12px;
544
+ border-radius: 8px;
545
  font-family: inherit;
546
  letter-spacing: 0.3px;
547
  }
 
558
  }
559
 
560
  .loop-x {
561
+ margin-left: 8px;
562
+ font-size: 15px;
563
  opacity: 0.6;
564
  }
565
 
transcriber/audio_analysis.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio Analysis Toolkit for Mr. Octopus Piano Transcription.
3
+
4
+ Three analysis modes:
5
+ 1. Spectral comparison: Renders MIDI→audio via FluidSynth, compares spectrograms
6
+ 2. Visual spectrogram: Generates PNG images for AI/human visual inspection
7
+ 3. Audio playback: Plays original, rendered MIDI, or both side-by-side
8
+
9
+ Usage:
10
+ python audio_analysis.py compare <original_audio> <midi_file> [--output-dir ./analysis]
11
+ python audio_analysis.py visualize <original_audio> <midi_file> [--output-dir ./analysis]
12
+ python audio_analysis.py play <audio_file> [--start 10.0] [--duration 5.0]
13
+ python audio_analysis.py play-both <original_audio> <midi_file> [--start 10.0] [--duration 5.0]
14
+ python audio_analysis.py full <original_audio> <midi_file> [--output-dir ./analysis]
15
+ """
16
+
17
+ import argparse
18
+ import os
19
+ import sys
20
+ import subprocess
21
+ import tempfile
22
+ import numpy as np
23
+
24
+ SOUNDFONT_PATH = os.path.join(os.path.dirname(__file__), "soundfonts", "FluidR3_GM.sf2")
25
+
26
+
27
+ def render_midi_to_audio(midi_path, output_wav, sample_rate=44100):
28
+ """Render a MIDI file to WAV using FluidSynth."""
29
+ cmd = [
30
+ "fluidsynth",
31
+ f"--fast-render={output_wav}",
32
+ f"--sample-rate={sample_rate}",
33
+ "--gain=0.5",
34
+ "-n", "-i",
35
+ SOUNDFONT_PATH,
36
+ midi_path,
37
+ ]
38
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
39
+ if not os.path.exists(output_wav):
40
+ print(f"FluidSynth error: {result.stderr}")
41
+ raise RuntimeError("FluidSynth failed to render MIDI")
42
+ return output_wav
43
+
44
+
45
+ def load_audio(path, sr=22050, duration=None):
46
+ """Load audio file, return mono signal and sample rate."""
47
+ import librosa
48
+ y, sr = librosa.load(path, sr=sr, mono=True, duration=duration)
49
+ return y, sr
50
+
51
+
52
+ def compute_spectrogram(y, sr, hop_length=512, n_fft=2048):
53
+ """Compute a log-magnitude mel spectrogram."""
54
+ import librosa
55
+ S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=128)
56
+ S_db = librosa.power_to_db(S, ref=np.max)
57
+ return S_db
58
+
59
+
60
+ def compute_cqt(y, sr, hop_length=512):
61
+ """Compute constant-Q transform (better for music)."""
62
+ import librosa
63
+ C = np.abs(librosa.cqt(y=y, sr=sr, hop_length=hop_length, n_bins=84, bins_per_octave=12))
64
+ C_db = librosa.amplitude_to_db(C, ref=np.max)
65
+ return C_db
66
+
67
+
68
+ def align_lengths(spec_a, spec_b):
69
+ """Trim both spectrograms to the same number of time frames."""
70
+ min_frames = min(spec_a.shape[1], spec_b.shape[1])
71
+ return spec_a[:, :min_frames], spec_b[:, :min_frames]
72
+
73
+
74
+ def spectral_comparison(original_audio, midi_path, output_dir, sr=22050):
75
+ """
76
+ Full spectral comparison: renders MIDI to audio, computes spectrograms,
77
+ calculates frame-by-frame divergence, and identifies problem regions.
78
+ """
79
+ import librosa
80
+
81
+ os.makedirs(output_dir, exist_ok=True)
82
+
83
+ # Step 1: Render MIDI to audio
84
+ rendered_wav = os.path.join(output_dir, "midi_rendered.wav")
85
+ print("Rendering MIDI to audio via FluidSynth...")
86
+ render_midi_to_audio(midi_path, rendered_wav, sample_rate=44100)
87
+
88
+ # Step 2: Load both audio files
89
+ print("Loading original audio...")
90
+ y_orig, _ = load_audio(original_audio, sr=sr)
91
+ print("Loading rendered MIDI audio...")
92
+ y_midi, _ = load_audio(rendered_wav, sr=sr)
93
+
94
+ # Step 3: Compute spectrograms
95
+ hop = 512
96
+ print("Computing spectrograms...")
97
+ spec_orig = compute_spectrogram(y_orig, sr, hop_length=hop)
98
+ spec_midi = compute_spectrogram(y_midi, sr, hop_length=hop)
99
+
100
+ # Align lengths
101
+ spec_orig, spec_midi = align_lengths(spec_orig, spec_midi)
102
+
103
+ # Step 4: Compute frame-by-frame divergence
104
+ # Normalize to 0-1 range for comparison
105
+ spec_orig_norm = (spec_orig - spec_orig.min()) / (spec_orig.max() - spec_orig.min() + 1e-8)
106
+ spec_midi_norm = (spec_midi - spec_midi.min()) / (spec_midi.max() - spec_midi.min() + 1e-8)
107
+
108
+ # Mean squared error per frame (across frequency bins)
109
+ frame_mse = np.mean((spec_orig_norm - spec_midi_norm) ** 2, axis=0)
110
+
111
+ # Convert frame indices to time
112
+ n_frames = len(frame_mse)
113
+ times = librosa.frames_to_time(np.arange(n_frames), sr=sr, hop_length=hop)
114
+
115
+ # Step 5: Identify problem regions (frames with high divergence)
116
+ threshold = np.percentile(frame_mse, 90) # top 10% divergence
117
+ problem_mask = frame_mse > threshold
118
+
119
+ # Group consecutive problem frames into regions
120
+ regions = []
121
+ in_region = False
122
+ start = 0
123
+ for i, is_problem in enumerate(problem_mask):
124
+ if is_problem and not in_region:
125
+ start = i
126
+ in_region = True
127
+ elif not is_problem and in_region:
128
+ if times[i] - times[start] > 0.3: # min 300ms regions
129
+ regions.append((times[start], times[i - 1], np.mean(frame_mse[start:i])))
130
+ in_region = False
131
+ if in_region:
132
+ regions.append((times[start], times[-1], np.mean(frame_mse[start:])))
133
+
134
+ # Sort by divergence score (worst first)
135
+ regions.sort(key=lambda r: r[2], reverse=True)
136
+
137
+ # Step 6: Report
138
+ report_path = os.path.join(output_dir, "spectral_report.txt")
139
+ with open(report_path, "w") as f:
140
+ f.write("SPECTRAL COMPARISON REPORT\n")
141
+ f.write("=" * 60 + "\n\n")
142
+ f.write(f"Original: {original_audio}\n")
143
+ f.write(f"MIDI: {midi_path}\n")
144
+ f.write(f"Duration: {times[-1]:.1f}s ({n_frames} frames)\n\n")
145
+
146
+ overall_mse = np.mean(frame_mse)
147
+ f.write(f"Overall MSE: {overall_mse:.6f}\n")
148
+ f.write(f"Median MSE: {np.median(frame_mse):.6f}\n")
149
+ f.write(f"90th percentile: {threshold:.6f}\n\n")
150
+
151
+ f.write(f"TOP DIVERGENT REGIONS ({len(regions)} found):\n")
152
+ f.write("-" * 60 + "\n")
153
+ for i, (t_start, t_end, score) in enumerate(regions[:20]):
154
+ f.write(f" {i+1:2d}. {t_start:6.1f}s - {t_end:6.1f}s "
155
+ f"(duration: {t_end - t_start:.1f}s) MSE: {score:.6f}\n")
156
+
157
+ print(f"Report written to {report_path}")
158
+
159
+ # Save raw data for further analysis
160
+ np.savez(os.path.join(output_dir, "spectral_data.npz"),
161
+ frame_mse=frame_mse, times=times, threshold=threshold)
162
+
163
+ return frame_mse, times, regions
164
+
165
+
166
+ def generate_spectrograms(original_audio, midi_path, output_dir, sr=22050):
167
+ """
168
+ Generate side-by-side spectrogram images for visual inspection.
169
+ Creates: overview, difference map, and zoomed segments.
170
+ """
171
+ import librosa
172
+ import librosa.display
173
+ import matplotlib
174
+ matplotlib.use('Agg')
175
+ import matplotlib.pyplot as plt
176
+
177
+ os.makedirs(output_dir, exist_ok=True)
178
+
179
+ # Render MIDI
180
+ rendered_wav = os.path.join(output_dir, "midi_rendered.wav")
181
+ if not os.path.exists(rendered_wav):
182
+ print("Rendering MIDI to audio...")
183
+ render_midi_to_audio(midi_path, rendered_wav, sample_rate=44100)
184
+
185
+ # Load
186
+ print("Loading audio files...")
187
+ y_orig, _ = load_audio(original_audio, sr=sr)
188
+ y_midi, _ = load_audio(rendered_wav, sr=sr)
189
+
190
+ hop = 512
191
+
192
+ # CQT spectrograms (better for music than mel)
193
+ print("Computing CQT spectrograms...")
194
+ cqt_orig = compute_cqt(y_orig, sr, hop_length=hop)
195
+ cqt_midi = compute_cqt(y_midi, sr, hop_length=hop)
196
+ cqt_orig, cqt_midi = align_lengths(cqt_orig, cqt_midi)
197
+
198
+ duration = min(len(y_orig), len(y_midi)) / sr
199
+
200
+ # ===== Figure 1: Full overview side-by-side =====
201
+ fig, axes = plt.subplots(3, 1, figsize=(20, 12), constrained_layout=True)
202
+ fig.suptitle("Spectral Comparison: Original vs MIDI Transcription", fontsize=16, fontweight='bold')
203
+
204
+ # Original
205
+ img0 = axes[0].imshow(cqt_orig, aspect='auto', origin='lower',
206
+ extent=[0, duration, 0, 84], cmap='magma',
207
+ vmin=-60, vmax=0)
208
+ axes[0].set_title("Original Audio", fontsize=13)
209
+ axes[0].set_ylabel("CQT Bin (semitone)")
210
+ plt.colorbar(img0, ax=axes[0], label='dB')
211
+
212
+ # MIDI rendered
213
+ img1 = axes[1].imshow(cqt_midi, aspect='auto', origin='lower',
214
+ extent=[0, duration, 0, 84], cmap='magma',
215
+ vmin=-60, vmax=0)
216
+ axes[1].set_title("MIDI Transcription (rendered)", fontsize=13)
217
+ axes[1].set_ylabel("CQT Bin (semitone)")
218
+ plt.colorbar(img1, ax=axes[1], label='dB')
219
+
220
+ # Difference map
221
+ diff = cqt_orig - cqt_midi
222
+ img2 = axes[2].imshow(diff, aspect='auto', origin='lower',
223
+ extent=[0, duration, 0, 84], cmap='RdBu_r',
224
+ vmin=-30, vmax=30)
225
+ axes[2].set_title("Difference (Original − MIDI): Red=missing, Blue=extra", fontsize=13)
226
+ axes[2].set_ylabel("CQT Bin (semitone)")
227
+ axes[2].set_xlabel("Time (seconds)")
228
+ plt.colorbar(img2, ax=axes[2], label='dB difference')
229
+
230
+ overview_path = os.path.join(output_dir, "spectrogram_overview.png")
231
+ plt.savefig(overview_path, dpi=150)
232
+ plt.close()
233
+ print(f"Saved: {overview_path}")
234
+
235
+ # ===== Figure 2: Zoomed segments (first 30s, middle, last 30s) =====
236
+ segments = [
237
+ ("Opening (0-30s)", 0, 30),
238
+ ("Middle", max(0, duration / 2 - 15), min(duration, duration / 2 + 15)),
239
+ ("Ending", max(0, duration - 30), duration),
240
+ ]
241
+
242
+ for label, t_start, t_end in segments:
243
+ frame_start = int(t_start * sr / hop)
244
+ frame_end = int(t_end * sr / hop)
245
+ frame_end = min(frame_end, cqt_orig.shape[1])
246
+
247
+ if frame_end <= frame_start:
248
+ continue
249
+
250
+ fig, axes = plt.subplots(3, 1, figsize=(18, 10), constrained_layout=True)
251
+ fig.suptitle(f"Zoomed: {label} ({t_start:.0f}s - {t_end:.0f}s)", fontsize=14, fontweight='bold')
252
+
253
+ seg_orig = cqt_orig[:, frame_start:frame_end]
254
+ seg_midi = cqt_midi[:, frame_start:frame_end]
255
+
256
+ img0 = axes[0].imshow(seg_orig, aspect='auto', origin='lower',
257
+ extent=[t_start, t_end, 0, 84], cmap='magma',
258
+ vmin=-60, vmax=0)
259
+ axes[0].set_title("Original")
260
+ axes[0].set_ylabel("CQT Bin")
261
+ plt.colorbar(img0, ax=axes[0])
262
+
263
+ img1 = axes[1].imshow(seg_midi, aspect='auto', origin='lower',
264
+ extent=[t_start, t_end, 0, 84], cmap='magma',
265
+ vmin=-60, vmax=0)
266
+ axes[1].set_title("MIDI Transcription")
267
+ axes[1].set_ylabel("CQT Bin")
268
+ plt.colorbar(img1, ax=axes[1])
269
+
270
+ seg_diff = seg_orig - seg_midi
271
+ img2 = axes[2].imshow(seg_diff, aspect='auto', origin='lower',
272
+ extent=[t_start, t_end, 0, 84], cmap='RdBu_r',
273
+ vmin=-30, vmax=30)
274
+ axes[2].set_title("Difference (Red=missing in MIDI, Blue=extra in MIDI)")
275
+ axes[2].set_ylabel("CQT Bin")
276
+ axes[2].set_xlabel("Time (seconds)")
277
+ plt.colorbar(img2, ax=axes[2])
278
+
279
+ safe_label = label.replace(" ", "_").replace("(", "").replace(")", "").replace("-", "_")
280
+ seg_path = os.path.join(output_dir, f"spectrogram_{safe_label}.png")
281
+ plt.savefig(seg_path, dpi=150)
282
+ plt.close()
283
+ print(f"Saved: {seg_path}")
284
+
285
+ # ===== Figure 3: Energy envelope comparison =====
286
+ fig, ax = plt.subplots(figsize=(18, 4), constrained_layout=True)
287
+ energy_orig = np.mean(cqt_orig, axis=0)
288
+ energy_midi = np.mean(cqt_midi, axis=0)
289
+ n_frames = min(len(energy_orig), len(energy_midi))
290
+ times = librosa.frames_to_time(np.arange(n_frames), sr=sr, hop_length=hop)
291
+ ax.plot(times, energy_orig[:n_frames], label='Original', alpha=0.8, linewidth=0.5)
292
+ ax.plot(times, energy_midi[:n_frames], label='MIDI Transcription', alpha=0.8, linewidth=0.5)
293
+ ax.set_xlabel("Time (seconds)")
294
+ ax.set_ylabel("Mean CQT Energy (dB)")
295
+ ax.set_title("Energy Envelope Comparison")
296
+ ax.legend()
297
+
298
+ energy_path = os.path.join(output_dir, "energy_comparison.png")
299
+ plt.savefig(energy_path, dpi=150)
300
+ plt.close()
301
+ print(f"Saved: {energy_path}")
302
+
303
+ return [overview_path]
304
+
305
+
306
+ def play_audio(audio_path, start=None, duration=None):
307
+ """Play audio through system speakers using afplay (macOS)."""
308
+ cmd = ["afplay", audio_path]
309
+ if start is not None:
310
+ # afplay doesn't support start offset natively, so we trim with python
311
+ import soundfile as sf
312
+ data, sr = sf.read(audio_path)
313
+ start_sample = int(start * sr)
314
+ if duration:
315
+ end_sample = start_sample + int(duration * sr)
316
+ else:
317
+ end_sample = len(data)
318
+ start_sample = max(0, min(start_sample, len(data)))
319
+ end_sample = max(start_sample, min(end_sample, len(data)))
320
+ segment = data[start_sample:end_sample]
321
+
322
+ tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
323
+ sf.write(tmp.name, segment, sr)
324
+ cmd = ["afplay", tmp.name]
325
+
326
+ print(f"Playing: {audio_path}" + (f" [{start:.1f}s - {start + duration:.1f}s]" if start and duration else ""))
327
+ subprocess.run(cmd)
328
+ print("Playback finished.")
329
+
330
+
331
+ def play_comparison(original_audio, midi_path, start=None, duration=None):
332
+ """Play original then MIDI rendering back-to-back for comparison."""
333
+ import soundfile as sf
334
+
335
+ # Render MIDI
336
+ rendered_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
337
+ print("Rendering MIDI to audio...")
338
+ render_midi_to_audio(midi_path, rendered_wav, sample_rate=44100)
339
+
340
+ print("\n--- Playing ORIGINAL ---")
341
+ play_audio(original_audio, start=start, duration=duration)
342
+
343
+ print("\n--- Playing MIDI TRANSCRIPTION ---")
344
+ play_audio(rendered_wav, start=start, duration=duration)
345
+
346
+ os.unlink(rendered_wav)
347
+
348
+
349
+ def full_analysis(original_audio, midi_path, output_dir):
350
+ """Run all analyses: spectral comparison + visual spectrograms."""
351
+ print("=" * 60)
352
+ print("FULL AUDIO ANALYSIS")
353
+ print("=" * 60)
354
+
355
+ # 1. Spectral comparison (metrics + report)
356
+ print("\n[1/2] Running spectral comparison...")
357
+ frame_mse, times, regions = spectral_comparison(original_audio, midi_path, output_dir)
358
+
359
+ # 2. Visual spectrograms
360
+ print("\n[2/2] Generating visual spectrograms...")
361
+ images = generate_spectrograms(original_audio, midi_path, output_dir)
362
+
363
+ print("\n" + "=" * 60)
364
+ print(f"Analysis complete! Results in: {output_dir}")
365
+ print(f" - spectral_report.txt (divergence metrics + problem regions)")
366
+ print(f" - spectrogram_overview.png (full comparison)")
367
+ print(f" - spectrogram_*.png (zoomed segments)")
368
+ print(f" - energy_comparison.png (energy envelopes)")
369
+ print(f" - midi_rendered.wav (MIDI rendered to audio for listening)")
370
+ print("=" * 60)
371
+
372
+ return regions
373
+
374
+
375
+ def main():
376
+ parser = argparse.ArgumentParser(description="Audio analysis toolkit for piano transcription")
377
+ subparsers = parser.add_subparsers(dest="command", required=True)
378
+
379
+ # compare
380
+ p_compare = subparsers.add_parser("compare", help="Spectral comparison")
381
+ p_compare.add_argument("original", help="Original audio file")
382
+ p_compare.add_argument("midi", help="MIDI transcription file")
383
+ p_compare.add_argument("--output-dir", default="./analysis", help="Output directory")
384
+
385
+ # visualize
386
+ p_viz = subparsers.add_parser("visualize", help="Generate spectrogram images")
387
+ p_viz.add_argument("original", help="Original audio file")
388
+ p_viz.add_argument("midi", help="MIDI transcription file")
389
+ p_viz.add_argument("--output-dir", default="./analysis", help="Output directory")
390
+
391
+ # play
392
+ p_play = subparsers.add_parser("play", help="Play an audio file")
393
+ p_play.add_argument("audio", help="Audio file to play")
394
+ p_play.add_argument("--start", type=float, default=None, help="Start time in seconds")
395
+ p_play.add_argument("--duration", type=float, default=None, help="Duration in seconds")
396
+
397
+ # play-both
398
+ p_both = subparsers.add_parser("play-both", help="Play original then MIDI back-to-back")
399
+ p_both.add_argument("original", help="Original audio file")
400
+ p_both.add_argument("midi", help="MIDI transcription file")
401
+ p_both.add_argument("--start", type=float, default=None, help="Start time in seconds")
402
+ p_both.add_argument("--duration", type=float, default=None, help="Duration in seconds")
403
+
404
+ # full
405
+ p_full = subparsers.add_parser("full", help="Run all analyses")
406
+ p_full.add_argument("original", help="Original audio file")
407
+ p_full.add_argument("midi", help="MIDI transcription file")
408
+ p_full.add_argument("--output-dir", default="./analysis", help="Output directory")
409
+
410
+ args = parser.parse_args()
411
+
412
+ if args.command == "compare":
413
+ spectral_comparison(args.original, args.midi, args.output_dir)
414
+ elif args.command == "visualize":
415
+ generate_spectrograms(args.original, args.midi, args.output_dir)
416
+ elif args.command == "play":
417
+ play_audio(args.audio, start=args.start, duration=args.duration)
418
+ elif args.command == "play-both":
419
+ play_comparison(args.original, args.midi, start=args.start, duration=args.duration)
420
+ elif args.command == "full":
421
+ full_analysis(args.original, args.midi, args.output_dir)
422
+
423
+
424
+ if __name__ == "__main__":
425
+ main()
transcriber/optimize.py CHANGED
@@ -15,6 +15,7 @@ def remove_leading_silence_notes(midi_data, y, sr):
15
 
16
  Finds the first moment of real musical energy and removes any MIDI notes
17
  before that point (typically microphone rumble / low-freq noise artifacts).
 
18
  """
19
  midi_out = copy.deepcopy(midi_data)
20
 
@@ -28,17 +29,32 @@ def remove_leading_silence_notes(midi_data, y, sr):
28
  if len(rms) == 0:
29
  return midi_out, 0, 0.0
30
 
31
- # Music starts when RMS first exceeds 10% of the peak energy
 
32
  max_rms = np.max(rms)
33
  music_start = 0.0
34
  for i, r in enumerate(rms):
35
- if r > max_rms * 0.1:
36
  music_start = i * 0.05
37
  break
38
 
39
  if music_start < 0.1:
40
  return midi_out, 0, music_start
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  removed = 0
43
  for instrument in midi_out.instruments:
44
  filtered = []
@@ -53,7 +69,11 @@ def remove_leading_silence_notes(midi_data, y, sr):
53
 
54
 
55
  def remove_trailing_silence_notes(midi_data, y, sr):
56
- """Remove notes that appear during the audio fade-out/silence at the end."""
 
 
 
 
57
  midi_out = copy.deepcopy(midi_data)
58
 
59
  hop = int(0.05 * sr)
@@ -66,13 +86,18 @@ def remove_trailing_silence_notes(midi_data, y, sr):
66
 
67
  max_rms = np.max(rms)
68
 
69
- # Find the last moment where RMS exceeds 5% of peak (searching backwards)
 
70
  music_end = len(y) / sr
71
  for i in range(len(rms) - 1, -1, -1):
72
- if rms[i] > max_rms * 0.05:
73
- music_end = (i + 1) * 0.05
 
74
  break
75
 
 
 
 
76
  removed = 0
77
  for instrument in midi_out.instruments:
78
  filtered = []
@@ -150,13 +175,17 @@ def remove_low_energy_notes(midi_data, y, sr, hop_length=512):
150
  def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
151
  """Remove notes that are harmonic doublings of louder lower notes.
152
 
153
- Pairwise detector: for notes at harmonic intervals (7, 12, 19, 24
154
- semitones), remove the upper note if it's clearly a harmonic ghost.
 
 
 
 
 
155
 
156
  Uses CQT energy to protect strong notes: if the CQT shows the note
157
- has strong energy (> -10dB), it's a real played note regardless of
158
- velocity ratio. This prevents removing notes like C6 that happen to
159
- co-occur with C5 but are genuinely played.
160
  """
161
  midi_out = copy.deepcopy(midi_data)
162
  removed = 0
@@ -165,6 +194,7 @@ def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
165
 
166
  # Compute CQT for energy verification if audio provided
167
  C_db = None
 
168
  if y is not None:
169
  N_BINS = 88 * 3
170
  FMIN = librosa.note_to_hz('A0')
@@ -222,6 +252,47 @@ def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
222
  to_remove.add(i)
223
  break
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  instrument.notes = [n for k, n in enumerate(notes) if k not in to_remove]
226
  removed += len(to_remove)
227
 
@@ -288,7 +359,7 @@ def remove_phantom_notes(midi_data, max_pitch=None):
288
  return midi_out, removed
289
 
290
 
291
- def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
292
  """Remove MIDI notes that form false-positive onsets not backed by audio.
293
 
294
  Analysis shows 37 extra MIDI onsets cause the biggest F1 drag (precision=0.918).
@@ -302,12 +373,27 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
302
  3. Short+quiet artifacts: onsets where every note is both short (<200ms)
303
  and quiet (velocity < 50).
304
 
 
 
 
305
  The filter first identifies which MIDI onsets already match audio onsets,
306
  then only removes unmatched onsets meeting the above criteria.
307
  """
308
  midi_out = copy.deepcopy(midi_data)
309
  tolerance = 0.05
310
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
312
  onset_times = librosa.frames_to_time(
313
  np.arange(len(onset_env)), sr=sr, hop_length=hop_length
@@ -359,12 +445,12 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
359
  # Category 1: Chord fragment -- near a matched onset, but only if
360
  # the onset has weak audio energy. Strong onsets near chords may be
361
  # real grace notes or arpeggios.
362
- if near_matched and strength < 2.0:
363
  onsets_to_remove.add(j)
364
  continue
365
 
366
  # Category 2: Isolated ghost -- single note, low strength or far from audio
367
- if len(onset_notes) == 1 and (strength < 1.5 or nearest_audio_ms > 100):
368
  onsets_to_remove.add(j)
369
  continue
370
 
@@ -377,14 +463,14 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
377
  # low velocity (< 35), far from audio onset. These are rumble artifacts
378
  # that survive the energy filter.
379
  if (len(onset_notes) == 1 and onset_notes[0].pitch < 40
380
- and onset_notes[0].velocity < 35 and nearest_audio_ms > 60):
381
  onsets_to_remove.add(j)
382
  continue
383
 
384
  # Category 5: Multi-note onset far from any audio onset (> 120ms)
385
  # with weak-to-moderate onset strength. These are chord-split artifacts
386
  # or hallucinated events with no audio support.
387
- if nearest_audio_ms > 120 and strength < 3.0:
388
  onsets_to_remove.add(j)
389
  continue
390
 
@@ -397,7 +483,7 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
397
  # Category 7: Moderate distance from audio (> 70ms) with weak
398
  # onset strength — catches near-miss hallucinations that are
399
  # just outside the 50ms matching window.
400
- if nearest_audio_ms > 70 and strength < 2.5:
401
  onsets_to_remove.add(j)
402
  continue
403
 
@@ -633,6 +719,66 @@ def limit_total_concurrent(midi_data, max_per_hand=4, hand_split=60):
633
  return midi_out, trimmed
634
 
635
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
  def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand_split=60):
637
  """Extend MIDI note durations to match audio CQT energy decay.
638
 
@@ -658,6 +804,14 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
658
  C_norm = (C_norm + 80.0) / 80.0
659
  n_frames = C.shape[1]
660
 
 
 
 
 
 
 
 
 
661
  # Pre-compute per-frame concurrent counts per hand (fast O(1) lookup)
662
  right_count = np.zeros(n_frames, dtype=int)
663
  left_count = np.zeros(n_frames, dtype=int)
@@ -671,6 +825,7 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
671
  left_count[sf:ef] += 1
672
 
673
  extended = 0
 
674
  for inst in midi_out.instruments:
675
  # Sort notes by start time for overlap checking
676
  notes_sorted = sorted(inst.notes, key=lambda n: (n.pitch, n.start))
@@ -681,8 +836,13 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
681
  continue
682
 
683
  end_frame = min(n_frames, int(note.end * sr / hop_length))
684
- # Max extension: 2 seconds beyond current end
685
- max_extend = min(n_frames, end_frame + int(2.0 * sr / hop_length))
 
 
 
 
 
686
 
687
  # Don't extend into the next note at the same pitch
688
  next_start_frame = max_extend
@@ -698,7 +858,7 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
698
  for f in range(end_frame, min(max_extend, next_start_frame)):
699
  lo = max(0, fund_bin - 1)
700
  hi = min(N_BINS, fund_bin + 2)
701
- if np.mean(C_norm[lo:hi, f]) > 0.20:
702
  # Check concurrent: this note isn't counted in hand_count
703
  # beyond end_frame, so hand_count[f] >= max_per_hand means
704
  # extending here would create max_per_hand + 1 concurrent
@@ -717,6 +877,8 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
717
  hand_count[old_end_frame:new_end_frame] += 1
718
  note.end = new_end
719
  extended += 1
 
 
720
 
721
  return midi_out, extended
722
 
@@ -1084,6 +1246,48 @@ def recover_missing_notes(midi_data, y, sr, hop_length=512, snap_onsets=None):
1084
  return midi_out, recovered
1085
 
1086
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1087
  def optimize(original_audio_path, midi_path, output_path=None):
1088
  """Full optimization pipeline."""
1089
  if output_path is None:
@@ -1114,6 +1318,12 @@ def optimize(original_audio_path, midi_path, output_path=None):
1114
  total_notes = sum(len(inst.notes) for inst in midi_data.instruments)
1115
  print(f" {total_notes} MIDI notes")
1116
 
 
 
 
 
 
 
1117
  # Step 0: Remove notes in leading silence (mic rumble artifacts)
1118
  print("\nStep 0: Removing notes in leading silence...")
1119
  midi_data, silence_removed, music_start = remove_leading_silence_notes(midi_data, y, sr)
@@ -1190,7 +1400,7 @@ def optimize(original_audio_path, midi_path, output_path=None):
1190
  # Step 6b: Remove spurious false-positive onsets
1191
  print("\nStep 6b: Removing spurious onsets (false positive cleanup)...")
1192
  midi_data, spurious_notes, spurious_onsets = remove_spurious_onsets(
1193
- midi_data, y, sr, ref_onsets, hop_length
1194
  )
1195
  print(f" Removed {spurious_notes} notes across {spurious_onsets} spurious onsets")
1196
 
@@ -1246,14 +1456,16 @@ def optimize(original_audio_path, midi_path, output_path=None):
1246
  )
1247
  print(f" Recovered {notes_recovered} notes from CQT energy")
1248
 
1249
- # Step 8f: Playability filter — limit per-onset chord size (4 per hand)
1250
- print("\nStep 8f: Playability filter (max 4 notes per hand per chord)...")
1251
- midi_data, playability_removed = limit_concurrent_notes(midi_data, max_per_hand=4)
 
 
1252
  print(f" Removed {playability_removed} excess chord notes")
1253
 
1254
- # Step 8g: Limit total concurrent sounding notes (4 per hand)
1255
- print("\nStep 8g: Concurrent sounding limit (max 4 per hand)...")
1256
- midi_data, sustain_trimmed = limit_total_concurrent(midi_data, max_per_hand=4)
1257
  print(f" Trimmed {sustain_trimmed} sustained notes to reduce pileup")
1258
 
1259
  # Final metrics
 
15
 
16
  Finds the first moment of real musical energy and removes any MIDI notes
17
  before that point (typically microphone rumble / low-freq noise artifacts).
18
+ Always preserves the first detected MIDI note to prevent eating the opening.
19
  """
20
  midi_out = copy.deepcopy(midi_data)
21
 
 
29
  if len(rms) == 0:
30
  return midi_out, 0, 0.0
31
 
32
+ # Music starts when RMS first exceeds 5% of the peak energy
33
+ # (reduced from 10% to avoid eating quiet openings)
34
  max_rms = np.max(rms)
35
  music_start = 0.0
36
  for i, r in enumerate(rms):
37
+ if r > max_rms * 0.05:
38
  music_start = i * 0.05
39
  break
40
 
41
  if music_start < 0.1:
42
  return midi_out, 0, music_start
43
 
44
+ # Find the earliest MIDI note onset — always protect it
45
+ all_notes = sorted(
46
+ [n for inst in midi_out.instruments for n in inst.notes],
47
+ key=lambda n: n.start
48
+ )
49
+ earliest_onset = all_notes[0].start if all_notes else 0.0
50
+
51
+ # If the "silence" region would eat the first note, clamp music_start
52
+ if music_start > earliest_onset:
53
+ music_start = earliest_onset
54
+
55
+ if music_start < 0.1:
56
+ return midi_out, 0, music_start
57
+
58
  removed = 0
59
  for instrument in midi_out.instruments:
60
  filtered = []
 
69
 
70
 
71
  def remove_trailing_silence_notes(midi_data, y, sr):
72
+ """Remove notes that appear during the audio fade-out/silence at the end.
73
+
74
+ Uses a 2% RMS threshold (reduced from 5%) and adds a 3-second protection
75
+ zone after the detected music end to preserve natural piano decay/sustain.
76
+ """
77
  midi_out = copy.deepcopy(midi_data)
78
 
79
  hop = int(0.05 * sr)
 
86
 
87
  max_rms = np.max(rms)
88
 
89
+ # Find the last moment where RMS exceeds 2% of peak (searching backwards)
90
+ # Reduced from 5% to preserve quiet endings, fade-outs, and final sustain
91
  music_end = len(y) / sr
92
  for i in range(len(rms) - 1, -1, -1):
93
+ if rms[i] > max_rms * 0.02:
94
+ # Add 3-second protection zone for natural piano decay
95
+ music_end = (i + 1) * 0.05 + 3.0
96
  break
97
 
98
+ # Clamp to actual audio duration
99
+ music_end = min(music_end, len(y) / sr)
100
+
101
  removed = 0
102
  for instrument in midi_out.instruments:
103
  filtered = []
 
175
  def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
176
  """Remove notes that are harmonic doublings of louder lower notes.
177
 
178
+ Two-stage detector:
179
+ 1. Pairwise: for notes at harmonic intervals (7, 12, 19, 24 semitones),
180
+ remove the upper note if it's clearly a harmonic ghost.
181
+ 2. Spectral masking: when bass + melody overlap (two-hand texture),
182
+ check if upper notes can be explained by the harmonic series of
183
+ strong lower notes. This catches ghost notes that the pairwise
184
+ detector misses because they're at non-standard intervals.
185
 
186
  Uses CQT energy to protect strong notes: if the CQT shows the note
187
+ has strong independent energy distinct from what the lower note's
188
+ harmonics would produce, it's a real played note.
 
189
  """
190
  midi_out = copy.deepcopy(midi_data)
191
  removed = 0
 
194
 
195
  # Compute CQT for energy verification if audio provided
196
  C_db = None
197
+ N_BINS = 0
198
  if y is not None:
199
  N_BINS = 88 * 3
200
  FMIN = librosa.note_to_hz('A0')
 
252
  to_remove.add(i)
253
  break
254
 
255
+ # Stage 2: Spectral masking for two-hand texture
256
+ # When bass (< MIDI 55) and melody (>= MIDI 60) overlap, bass harmonics
257
+ # can produce ghost notes in the melody range. Check if a mid-range note
258
+ # is explainable as a harmonic partial of a concurrent bass note.
259
+ if C_db is not None:
260
+ remaining = [(k, n) for k, n in enumerate(notes) if k not in to_remove]
261
+ bass_notes = [(k, n) for k, n in remaining if n.pitch < 55]
262
+ mid_notes = [(k, n) for k, n in remaining if 55 <= n.pitch < 72]
263
+
264
+ for mid_k, mid_n in mid_notes:
265
+ if mid_k in to_remove:
266
+ continue
267
+ for bass_k, bass_n in bass_notes:
268
+ if abs(bass_n.start - mid_n.start) > 0.05:
269
+ continue
270
+ # Check if mid_n.pitch matches any harmonic partial of bass_n
271
+ # Harmonics: 2nd (+12), 3rd (+19), 4th (+24), 5th (+28), 6th (+31)
272
+ bass_pitch = bass_n.pitch
273
+ harmonic_pitches = {
274
+ bass_pitch + 12, # 2nd harmonic (octave)
275
+ bass_pitch + 19, # 3rd (octave + fifth)
276
+ bass_pitch + 24, # 4th (2 octaves)
277
+ bass_pitch + 28, # 5th (2 oct + major 3rd)
278
+ bass_pitch + 31, # 6th (2 oct + fifth)
279
+ }
280
+ if mid_n.pitch in harmonic_pitches:
281
+ # This mid note matches a bass harmonic — check if
282
+ # it has independent CQT energy above the harmonic level
283
+ mid_bin = (mid_n.pitch - 21) * 3 + 1
284
+ bass_bin = (bass_pitch - 21) * 3 + 1
285
+ if 0 <= mid_bin < N_BINS and 0 <= bass_bin < N_BINS:
286
+ sf = max(0, int(mid_n.start * sr / hop_length))
287
+ ef = min(C_db.shape[1], sf + max(1, int(0.15 * sr / hop_length)))
288
+ mid_energy = float(np.max(C_db[max(0, mid_bin-1):min(N_BINS, mid_bin+2), sf:ef]))
289
+ bass_energy = float(np.max(C_db[max(0, bass_bin-1):min(N_BINS, bass_bin+2), sf:ef]))
290
+ # If bass is much louder (>8dB) and mid note is quiet,
291
+ # it's likely a harmonic ghost
292
+ if bass_energy - mid_energy > 8.0 and mid_n.velocity < bass_n.velocity * 0.7:
293
+ to_remove.add(mid_k)
294
+ break
295
+
296
  instrument.notes = [n for k, n in enumerate(notes) if k not in to_remove]
297
  removed += len(to_remove)
298
 
 
359
  return midi_out, removed
360
 
361
 
362
+ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512, complexity='simple'):
363
  """Remove MIDI notes that form false-positive onsets not backed by audio.
364
 
365
  Analysis shows 37 extra MIDI onsets cause the biggest F1 drag (precision=0.918).
 
373
  3. Short+quiet artifacts: onsets where every note is both short (<200ms)
374
  and quiet (velocity < 50).
375
 
376
+ For complex pieces, thresholds are relaxed to preserve legitimate dense
377
+ textures that might otherwise be classified as spurious.
378
+
379
  The filter first identifies which MIDI onsets already match audio onsets,
380
  then only removes unmatched onsets meeting the above criteria.
381
  """
382
  midi_out = copy.deepcopy(midi_data)
383
  tolerance = 0.05
384
 
385
+ # Complexity-adjusted thresholds: complex pieces are more permissive
386
+ # to preserve legitimate dense textures
387
+ if complexity == 'complex':
388
+ strength_scale = 1.5 # require stronger evidence to remove
389
+ dist_scale = 1.4 # require further from audio onset to remove
390
+ elif complexity == 'moderate':
391
+ strength_scale = 1.2
392
+ dist_scale = 1.2
393
+ else:
394
+ strength_scale = 1.0
395
+ dist_scale = 1.0
396
+
397
  onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
398
  onset_times = librosa.frames_to_time(
399
  np.arange(len(onset_env)), sr=sr, hop_length=hop_length
 
445
  # Category 1: Chord fragment -- near a matched onset, but only if
446
  # the onset has weak audio energy. Strong onsets near chords may be
447
  # real grace notes or arpeggios.
448
+ if near_matched and strength < 2.0 * strength_scale:
449
  onsets_to_remove.add(j)
450
  continue
451
 
452
  # Category 2: Isolated ghost -- single note, low strength or far from audio
453
+ if len(onset_notes) == 1 and (strength < 1.5 * strength_scale or nearest_audio_ms > 100 * dist_scale):
454
  onsets_to_remove.add(j)
455
  continue
456
 
 
463
  # low velocity (< 35), far from audio onset. These are rumble artifacts
464
  # that survive the energy filter.
465
  if (len(onset_notes) == 1 and onset_notes[0].pitch < 40
466
+ and onset_notes[0].velocity < 35 and nearest_audio_ms > 60 * dist_scale):
467
  onsets_to_remove.add(j)
468
  continue
469
 
470
  # Category 5: Multi-note onset far from any audio onset (> 120ms)
471
  # with weak-to-moderate onset strength. These are chord-split artifacts
472
  # or hallucinated events with no audio support.
473
+ if nearest_audio_ms > 120 * dist_scale and strength < 3.0 * strength_scale:
474
  onsets_to_remove.add(j)
475
  continue
476
 
 
483
  # Category 7: Moderate distance from audio (> 70ms) with weak
484
  # onset strength — catches near-miss hallucinations that are
485
  # just outside the 50ms matching window.
486
+ if nearest_audio_ms > 70 * dist_scale and strength < 2.5 * strength_scale:
487
  onsets_to_remove.add(j)
488
  continue
489
 
 
719
  return midi_out, trimmed
720
 
721
 
722
+ def detect_sustain_regions(y, sr, hop_length=512):
723
+ """Detect regions where the sustain pedal is likely engaged.
724
+
725
+ Analyzes spectral flux and broadband energy decay. When the sustain pedal
726
+ is held, notes ring longer and the spectral energy decays slowly instead
727
+ of dropping abruptly at note release. Detects this by looking for:
728
+ 1. Low spectral flux (sustained timbre, no new attacks)
729
+ 2. Slow energy decay (notes ringing through pedal)
730
+
731
+ Returns a boolean array (per frame) indicating sustained regions.
732
+ """
733
+ # Compute spectral flux (rate of spectral change)
734
+ S = np.abs(librosa.stft(y, hop_length=hop_length))
735
+ flux = np.sqrt(np.mean(np.diff(S, axis=1) ** 2, axis=0))
736
+ flux = np.concatenate([[0], flux]) # pad to match frame count
737
+
738
+ # Compute RMS energy
739
+ rms = librosa.feature.rms(y=y, hop_length=hop_length)[0]
740
+
741
+ # Normalize both
742
+ flux_norm = flux / (np.percentile(flux, 95) + 1e-8)
743
+ rms_norm = rms / (np.max(rms) + 1e-8)
744
+
745
+ n_frames = min(len(flux_norm), len(rms_norm))
746
+ flux_norm = flux_norm[:n_frames]
747
+ rms_norm = rms_norm[:n_frames]
748
+
749
+ # Sustain pedal indicators:
750
+ # - Low spectral flux (< 30th percentile) = sustained sound, not new attacks
751
+ # - Moderate+ energy (> 10% of peak) = notes are still ringing
752
+ flux_thresh = np.percentile(flux_norm, 30)
753
+ sustain_mask = (flux_norm < flux_thresh) & (rms_norm > 0.10)
754
+
755
+ # Smooth: close 200ms gaps, remove blips shorter than 300ms
756
+ close_frames = max(1, int(0.2 * sr / hop_length))
757
+ min_region = max(1, int(0.3 * sr / hop_length))
758
+
759
+ # Morphological closing
760
+ for i in range(1, n_frames - 1):
761
+ if not sustain_mask[i]:
762
+ before = any(sustain_mask[max(0, i - close_frames):i])
763
+ after = any(sustain_mask[i + 1:min(n_frames, i + close_frames + 1)])
764
+ if before and after:
765
+ sustain_mask[i] = True
766
+
767
+ # Remove short blips
768
+ in_region = False
769
+ start = 0
770
+ for i in range(n_frames):
771
+ if sustain_mask[i] and not in_region:
772
+ start = i
773
+ in_region = True
774
+ elif not sustain_mask[i] and in_region:
775
+ if i - start < min_region:
776
+ sustain_mask[start:i] = False
777
+ in_region = False
778
+
779
+ return sustain_mask
780
+
781
+
782
  def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand_split=60):
783
  """Extend MIDI note durations to match audio CQT energy decay.
784
 
 
804
  C_norm = (C_norm + 80.0) / 80.0
805
  n_frames = C.shape[1]
806
 
807
+ # Detect sustain pedal regions for longer extension allowance
808
+ sustain_mask = detect_sustain_regions(y, sr, hop_length)
809
+ # Pad/trim to match CQT frame count
810
+ if len(sustain_mask) < n_frames:
811
+ sustain_mask = np.concatenate([sustain_mask, np.zeros(n_frames - len(sustain_mask), dtype=bool)])
812
+ else:
813
+ sustain_mask = sustain_mask[:n_frames]
814
+
815
  # Pre-compute per-frame concurrent counts per hand (fast O(1) lookup)
816
  right_count = np.zeros(n_frames, dtype=int)
817
  left_count = np.zeros(n_frames, dtype=int)
 
825
  left_count[sf:ef] += 1
826
 
827
  extended = 0
828
+ sustain_extended = 0
829
  for inst in midi_out.instruments:
830
  # Sort notes by start time for overlap checking
831
  notes_sorted = sorted(inst.notes, key=lambda n: (n.pitch, n.start))
 
836
  continue
837
 
838
  end_frame = min(n_frames, int(note.end * sr / hop_length))
839
+
840
+ # In sustain regions, allow longer extension (4s) and lower threshold
841
+ in_sustain = end_frame < n_frames and sustain_mask[min(end_frame, n_frames - 1)]
842
+ max_ext_seconds = 4.0 if in_sustain else 2.0
843
+ energy_thresh = 0.15 if in_sustain else 0.20
844
+
845
+ max_extend = min(n_frames, end_frame + int(max_ext_seconds * sr / hop_length))
846
 
847
  # Don't extend into the next note at the same pitch
848
  next_start_frame = max_extend
 
858
  for f in range(end_frame, min(max_extend, next_start_frame)):
859
  lo = max(0, fund_bin - 1)
860
  hi = min(N_BINS, fund_bin + 2)
861
+ if np.mean(C_norm[lo:hi, f]) > energy_thresh:
862
  # Check concurrent: this note isn't counted in hand_count
863
  # beyond end_frame, so hand_count[f] >= max_per_hand means
864
  # extending here would create max_per_hand + 1 concurrent
 
877
  hand_count[old_end_frame:new_end_frame] += 1
878
  note.end = new_end
879
  extended += 1
880
+ if in_sustain:
881
+ sustain_extended += 1
882
 
883
  return midi_out, extended
884
 
 
1246
  return midi_out, recovered
1247
 
1248
 
1249
+ def estimate_complexity(midi_data, audio_duration):
1250
+ """Estimate piece complexity to adjust filter aggressiveness.
1251
+
1252
+ Returns a dict with:
1253
+ - note_density: notes per second
1254
+ - avg_polyphony: average concurrent notes at any onset
1255
+ - complexity: 'simple' (<4 n/s), 'moderate' (4-8), 'complex' (>8)
1256
+
1257
+ Complex pieces need less aggressive ghost removal and wider tolerance
1258
+ for concurrent notes, since dense textures are intentional.
1259
+ """
1260
+ all_notes = sorted(
1261
+ [n for inst in midi_data.instruments for n in inst.notes],
1262
+ key=lambda n: n.start
1263
+ )
1264
+ if not all_notes or audio_duration < 1:
1265
+ return {'note_density': 0, 'avg_polyphony': 1, 'complexity': 'simple'}
1266
+
1267
+ note_density = len(all_notes) / audio_duration
1268
+
1269
+ # Compute average polyphony at each onset
1270
+ onsets = sorted(set(round(n.start, 3) for n in all_notes))
1271
+ polyphonies = []
1272
+ for onset in onsets:
1273
+ count = sum(1 for n in all_notes if abs(n.start - onset) < 0.03)
1274
+ polyphonies.append(count)
1275
+ avg_polyphony = np.mean(polyphonies) if polyphonies else 1
1276
+
1277
+ if note_density > 8 or avg_polyphony > 3.5:
1278
+ complexity = 'complex'
1279
+ elif note_density > 4 or avg_polyphony > 2.5:
1280
+ complexity = 'moderate'
1281
+ else:
1282
+ complexity = 'simple'
1283
+
1284
+ return {
1285
+ 'note_density': note_density,
1286
+ 'avg_polyphony': avg_polyphony,
1287
+ 'complexity': complexity,
1288
+ }
1289
+
1290
+
1291
  def optimize(original_audio_path, midi_path, output_path=None):
1292
  """Full optimization pipeline."""
1293
  if output_path is None:
 
1318
  total_notes = sum(len(inst.notes) for inst in midi_data.instruments)
1319
  print(f" {total_notes} MIDI notes")
1320
 
1321
+ # Estimate complexity to adjust filter thresholds
1322
+ complexity_info = estimate_complexity(midi_data, audio_duration)
1323
+ complexity = complexity_info['complexity']
1324
+ print(f" Complexity: {complexity} (density={complexity_info['note_density']:.1f} n/s, "
1325
+ f"polyphony={complexity_info['avg_polyphony']:.1f})")
1326
+
1327
  # Step 0: Remove notes in leading silence (mic rumble artifacts)
1328
  print("\nStep 0: Removing notes in leading silence...")
1329
  midi_data, silence_removed, music_start = remove_leading_silence_notes(midi_data, y, sr)
 
1400
  # Step 6b: Remove spurious false-positive onsets
1401
  print("\nStep 6b: Removing spurious onsets (false positive cleanup)...")
1402
  midi_data, spurious_notes, spurious_onsets = remove_spurious_onsets(
1403
+ midi_data, y, sr, ref_onsets, hop_length, complexity=complexity
1404
  )
1405
  print(f" Removed {spurious_notes} notes across {spurious_onsets} spurious onsets")
1406
 
 
1456
  )
1457
  print(f" Recovered {notes_recovered} notes from CQT energy")
1458
 
1459
+ # Step 8f: Playability filter — limit per-onset chord size
1460
+ # Complex pieces get 5 notes/hand to preserve dense voicings
1461
+ max_hand = 5 if complexity == 'complex' else 4
1462
+ print(f"\nStep 8f: Playability filter (max {max_hand} notes per hand per chord)...")
1463
+ midi_data, playability_removed = limit_concurrent_notes(midi_data, max_per_hand=max_hand)
1464
  print(f" Removed {playability_removed} excess chord notes")
1465
 
1466
+ # Step 8g: Limit total concurrent sounding notes
1467
+ print(f"\nStep 8g: Concurrent sounding limit (max {max_hand} per hand)...")
1468
+ midi_data, sustain_trimmed = limit_total_concurrent(midi_data, max_per_hand=max_hand)
1469
  print(f" Trimmed {sustain_trimmed} sustained notes to reduce pileup")
1470
 
1471
  # Final metrics