Spaces:
Sleeping
Sleeping
Ewan Claude Opus 4.6 committed on
Commit ·
1646c97
1
Parent(s): 7dd6b8a
Improve transcription fidelity: trailing notes, sustain pedal, complexity tuning
Browse files- Fix trailing silence threshold (5% → 2% RMS + 3s protection zone) — recovers 7s of cut-off endings
- Fix leading silence threshold (10% → 5% + always protect first note)
- Add spectral masking in harmonic ghost removal for two-hand texture
- Add sustain pedal detection from audio spectral flux analysis
- Add complexity-aware tuning (note density + polyphony estimation)
- Add audio analysis toolkit (spectral comparison, CQT visualization)
- UI: larger transport icons, loop label, 5s skip labels
Jewish Bride spectral MSE: -40.3% overall, -73.8% at 95th percentile
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- .gitignore +2 -0
- app/src/components/Controls.jsx +14 -11
- app/src/index.css +81 -61
- transcriber/audio_analysis.py +425 -0
- transcriber/optimize.py +239 -27
.gitignore
CHANGED
|
@@ -8,3 +8,5 @@ transcriber/diagnose_*.py
|
|
| 8 |
transcriber/simulate_*.py
|
| 9 |
__pycache__
|
| 10 |
*.pyc
|
|
|
|
|
|
|
|
|
| 8 |
transcriber/simulate_*.py
|
| 9 |
__pycache__
|
| 10 |
*.pyc
|
| 11 |
+
transcriber/soundfonts/
|
| 12 |
+
transcriber/benchmarks/
|
app/src/components/Controls.jsx
CHANGED
|
@@ -62,7 +62,7 @@ export default function Controls({
|
|
| 62 |
<div className="controls-main">
|
| 63 |
<div className="controls-left">
|
| 64 |
<div className="brand-mark">
|
| 65 |
-
<OctopusLogo size={
|
| 66 |
<span className="brand-name">Mr. Octopus</span>
|
| 67 |
</div>
|
| 68 |
{fileName && (
|
|
@@ -76,19 +76,20 @@ export default function Controls({
|
|
| 76 |
onClick={() => seekTo(Math.max(0, displayTime - 5))}
|
| 77 |
title="Back 5s"
|
| 78 |
>
|
| 79 |
-
<svg width="
|
| 80 |
<path d="M11 18V6l-8.5 6 8.5 6zm.5-6l8.5 6V6l-8.5 6z" />
|
| 81 |
</svg>
|
|
|
|
| 82 |
</button>
|
| 83 |
|
| 84 |
<button className="play-btn" onClick={togglePlayPause}>
|
| 85 |
{isPlaying ? (
|
| 86 |
-
<svg width="
|
| 87 |
<rect x="6" y="4" width="4" height="16" rx="1" />
|
| 88 |
<rect x="14" y="4" width="4" height="16" rx="1" />
|
| 89 |
</svg>
|
| 90 |
) : (
|
| 91 |
-
<svg width="
|
| 92 |
<path d="M8 5v14l11-7z" />
|
| 93 |
</svg>
|
| 94 |
)}
|
|
@@ -99,15 +100,17 @@ export default function Controls({
|
|
| 99 |
onClick={() => seekTo(Math.min(totalDuration, displayTime + 5))}
|
| 100 |
title="Forward 5s"
|
| 101 |
>
|
| 102 |
-
<svg width="
|
| 103 |
<path d="M4 18l8.5-6L4 6v12zm9-12v12l8.5-6L13 6z" />
|
| 104 |
</svg>
|
|
|
|
| 105 |
</button>
|
| 106 |
</div>
|
| 107 |
|
| 108 |
<div className="controls-right">
|
| 109 |
{/* Loop controls */}
|
| 110 |
<div className="loop-controls">
|
|
|
|
| 111 |
{!isLooping ? (
|
| 112 |
<>
|
| 113 |
<button
|
|
@@ -138,12 +141,6 @@ export default function Controls({
|
|
| 138 |
)}
|
| 139 |
</div>
|
| 140 |
|
| 141 |
-
{onNewSong && (
|
| 142 |
-
<button className="btn btn-new" onClick={onNewSong}>
|
| 143 |
-
+ New Song
|
| 144 |
-
</button>
|
| 145 |
-
)}
|
| 146 |
-
|
| 147 |
<div className="tempo-control">
|
| 148 |
<span className="tempo-label">Speed</span>
|
| 149 |
<input
|
|
@@ -155,6 +152,12 @@ export default function Controls({
|
|
| 155 |
/>
|
| 156 |
<span className="tempo-value">{tempo}%</span>
|
| 157 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
</div>
|
| 159 |
</div>
|
| 160 |
|
|
|
|
| 62 |
<div className="controls-main">
|
| 63 |
<div className="controls-left">
|
| 64 |
<div className="brand-mark">
|
| 65 |
+
<OctopusLogo size={32} />
|
| 66 |
<span className="brand-name">Mr. Octopus</span>
|
| 67 |
</div>
|
| 68 |
{fileName && (
|
|
|
|
| 76 |
onClick={() => seekTo(Math.max(0, displayTime - 5))}
|
| 77 |
title="Back 5s"
|
| 78 |
>
|
| 79 |
+
<svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">
|
| 80 |
<path d="M11 18V6l-8.5 6 8.5 6zm.5-6l8.5 6V6l-8.5 6z" />
|
| 81 |
</svg>
|
| 82 |
+
<span className="transport-label">5s</span>
|
| 83 |
</button>
|
| 84 |
|
| 85 |
<button className="play-btn" onClick={togglePlayPause}>
|
| 86 |
{isPlaying ? (
|
| 87 |
+
<svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor">
|
| 88 |
<rect x="6" y="4" width="4" height="16" rx="1" />
|
| 89 |
<rect x="14" y="4" width="4" height="16" rx="1" />
|
| 90 |
</svg>
|
| 91 |
) : (
|
| 92 |
+
<svg width="24" height="24" viewBox="0 0 24 24" fill="currentColor">
|
| 93 |
<path d="M8 5v14l11-7z" />
|
| 94 |
</svg>
|
| 95 |
)}
|
|
|
|
| 100 |
onClick={() => seekTo(Math.min(totalDuration, displayTime + 5))}
|
| 101 |
title="Forward 5s"
|
| 102 |
>
|
| 103 |
+
<svg width="18" height="18" viewBox="0 0 24 24" fill="currentColor">
|
| 104 |
<path d="M4 18l8.5-6L4 6v12zm9-12v12l8.5-6L13 6z" />
|
| 105 |
</svg>
|
| 106 |
+
<span className="transport-label">5s</span>
|
| 107 |
</button>
|
| 108 |
</div>
|
| 109 |
|
| 110 |
<div className="controls-right">
|
| 111 |
{/* Loop controls */}
|
| 112 |
<div className="loop-controls">
|
| 113 |
+
<span className="loop-label">Loop</span>
|
| 114 |
{!isLooping ? (
|
| 115 |
<>
|
| 116 |
<button
|
|
|
|
| 141 |
)}
|
| 142 |
</div>
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
<div className="tempo-control">
|
| 145 |
<span className="tempo-label">Speed</span>
|
| 146 |
<input
|
|
|
|
| 152 |
/>
|
| 153 |
<span className="tempo-value">{tempo}%</span>
|
| 154 |
</div>
|
| 155 |
+
|
| 156 |
+
{onNewSong && (
|
| 157 |
+
<button className="btn btn-new" onClick={onNewSong}>
|
| 158 |
+
+ New Song
|
| 159 |
+
</button>
|
| 160 |
+
)}
|
| 161 |
</div>
|
| 162 |
</div>
|
| 163 |
|
app/src/index.css
CHANGED
|
@@ -228,18 +228,18 @@ body {
|
|
| 228 |
}
|
| 229 |
|
| 230 |
.controls-main {
|
| 231 |
-
height:
|
| 232 |
display: flex;
|
| 233 |
align-items: center;
|
| 234 |
justify-content: space-between;
|
| 235 |
-
padding: 0
|
| 236 |
-
gap:
|
| 237 |
}
|
| 238 |
|
| 239 |
.controls-left {
|
| 240 |
display: flex;
|
| 241 |
align-items: center;
|
| 242 |
-
gap:
|
| 243 |
min-width: 0;
|
| 244 |
flex: 1;
|
| 245 |
}
|
|
@@ -252,7 +252,7 @@ body {
|
|
| 252 |
}
|
| 253 |
|
| 254 |
.brand-name {
|
| 255 |
-
font-size:
|
| 256 |
font-weight: 700;
|
| 257 |
background: linear-gradient(135deg, #a78bfa, #06b6d4);
|
| 258 |
-webkit-background-clip: text;
|
|
@@ -263,13 +263,13 @@ body {
|
|
| 263 |
}
|
| 264 |
|
| 265 |
.file-name {
|
| 266 |
-
font-size:
|
| 267 |
color: var(--text-muted);
|
| 268 |
white-space: nowrap;
|
| 269 |
overflow: hidden;
|
| 270 |
text-overflow: ellipsis;
|
| 271 |
-
max-width:
|
| 272 |
-
padding-left:
|
| 273 |
border-left: 1.5px solid var(--border);
|
| 274 |
font-weight: 500;
|
| 275 |
}
|
|
@@ -277,73 +277,84 @@ body {
|
|
| 277 |
.controls-center {
|
| 278 |
display: flex;
|
| 279 |
align-items: center;
|
| 280 |
-
gap:
|
| 281 |
flex-shrink: 0;
|
| 282 |
}
|
| 283 |
|
| 284 |
.controls-right {
|
| 285 |
display: flex;
|
| 286 |
align-items: center;
|
| 287 |
-
gap:
|
| 288 |
flex: 1;
|
| 289 |
justify-content: flex-end;
|
| 290 |
}
|
| 291 |
|
| 292 |
-
/* Transport buttons */
|
| 293 |
.transport-btn {
|
| 294 |
-
width:
|
| 295 |
-
height:
|
| 296 |
-
border-radius:
|
| 297 |
border: none;
|
| 298 |
background: var(--surface-2);
|
| 299 |
color: var(--text-muted);
|
| 300 |
cursor: pointer;
|
| 301 |
display: flex;
|
|
|
|
| 302 |
align-items: center;
|
| 303 |
justify-content: center;
|
|
|
|
| 304 |
transition: all 0.15s;
|
|
|
|
| 305 |
}
|
| 306 |
|
| 307 |
.transport-btn:hover {
|
| 308 |
background: var(--surface-3);
|
| 309 |
color: var(--text);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
}
|
| 311 |
|
| 312 |
/* Play button — bold and prominent */
|
| 313 |
.play-btn {
|
| 314 |
-
width:
|
| 315 |
-
height:
|
| 316 |
border-radius: 50%;
|
| 317 |
border: none;
|
| 318 |
background: var(--primary);
|
| 319 |
color: white;
|
| 320 |
-
font-size:
|
| 321 |
cursor: pointer;
|
| 322 |
transition: all 0.2s;
|
| 323 |
display: flex;
|
| 324 |
align-items: center;
|
| 325 |
justify-content: center;
|
| 326 |
-
box-shadow: 0 0
|
| 327 |
}
|
| 328 |
|
| 329 |
.play-btn:hover {
|
| 330 |
background: var(--primary-hover);
|
| 331 |
-
box-shadow: 0 0
|
| 332 |
-
transform: scale(1.
|
| 333 |
}
|
| 334 |
|
| 335 |
.play-btn:active {
|
| 336 |
-
transform: scale(0.
|
| 337 |
}
|
| 338 |
|
| 339 |
-
/*
|
| 340 |
.btn {
|
| 341 |
background: var(--surface-2);
|
| 342 |
color: var(--text-muted);
|
| 343 |
border: 1.5px solid var(--border);
|
| 344 |
border-radius: 8px;
|
| 345 |
-
padding:
|
| 346 |
-
font-size:
|
| 347 |
font-weight: 600;
|
| 348 |
font-family: inherit;
|
| 349 |
cursor: pointer;
|
|
@@ -373,33 +384,33 @@ body {
|
|
| 373 |
.tempo-control {
|
| 374 |
display: flex;
|
| 375 |
align-items: center;
|
| 376 |
-
gap:
|
| 377 |
background: var(--surface-2);
|
| 378 |
-
padding:
|
| 379 |
-
border-radius:
|
| 380 |
border: 1px solid var(--border);
|
| 381 |
}
|
| 382 |
|
| 383 |
.tempo-label {
|
| 384 |
-
font-size:
|
| 385 |
-
font-weight:
|
| 386 |
-
color: var(--text-
|
| 387 |
text-transform: uppercase;
|
| 388 |
letter-spacing: 0.5px;
|
| 389 |
white-space: nowrap;
|
| 390 |
}
|
| 391 |
|
| 392 |
.tempo-value {
|
| 393 |
-
font-size:
|
| 394 |
-
font-weight:
|
| 395 |
-
color: var(--text
|
| 396 |
-
min-width:
|
| 397 |
text-align: right;
|
| 398 |
font-variant-numeric: tabular-nums;
|
| 399 |
}
|
| 400 |
|
| 401 |
.tempo-control input[type='range'] {
|
| 402 |
-
width:
|
| 403 |
}
|
| 404 |
|
| 405 |
/* ========================================
|
|
@@ -409,16 +420,16 @@ body {
|
|
| 409 |
.timeline {
|
| 410 |
display: flex;
|
| 411 |
align-items: center;
|
| 412 |
-
gap:
|
| 413 |
-
padding: 0
|
| 414 |
}
|
| 415 |
|
| 416 |
.timeline-time {
|
| 417 |
-
font-size:
|
| 418 |
font-weight: 600;
|
| 419 |
color: var(--text-muted);
|
| 420 |
font-variant-numeric: tabular-nums;
|
| 421 |
-
min-width:
|
| 422 |
}
|
| 423 |
|
| 424 |
.timeline-time:last-child {
|
|
@@ -432,8 +443,8 @@ body {
|
|
| 432 |
|
| 433 |
.timeline-track input[type='range'] {
|
| 434 |
width: 100%;
|
| 435 |
-
height:
|
| 436 |
-
border-radius:
|
| 437 |
-webkit-appearance: none;
|
| 438 |
appearance: none;
|
| 439 |
outline: none;
|
|
@@ -442,19 +453,19 @@ body {
|
|
| 442 |
}
|
| 443 |
|
| 444 |
.timeline-track input[type='range']:hover {
|
| 445 |
-
height:
|
| 446 |
}
|
| 447 |
|
| 448 |
.timeline-track input[type='range']::-webkit-slider-thumb {
|
| 449 |
-webkit-appearance: none;
|
| 450 |
appearance: none;
|
| 451 |
-
width:
|
| 452 |
-
height:
|
| 453 |
border-radius: 50%;
|
| 454 |
background: var(--primary-hover);
|
| 455 |
cursor: pointer;
|
| 456 |
border: 2px solid white;
|
| 457 |
-
box-shadow: 0 0
|
| 458 |
transition: transform 0.1s;
|
| 459 |
}
|
| 460 |
|
|
@@ -463,13 +474,13 @@ body {
|
|
| 463 |
}
|
| 464 |
|
| 465 |
.timeline-track input[type='range']::-moz-range-thumb {
|
| 466 |
-
width:
|
| 467 |
-
height:
|
| 468 |
border-radius: 50%;
|
| 469 |
background: var(--primary-hover);
|
| 470 |
cursor: pointer;
|
| 471 |
border: 2px solid white;
|
| 472 |
-
box-shadow: 0 0
|
| 473 |
}
|
| 474 |
|
| 475 |
/* General range sliders (for tempo) */
|
|
@@ -477,8 +488,8 @@ input[type='range'] {
|
|
| 477 |
-webkit-appearance: none;
|
| 478 |
appearance: none;
|
| 479 |
background: var(--border);
|
| 480 |
-
height:
|
| 481 |
-
border-radius:
|
| 482 |
outline: none;
|
| 483 |
cursor: pointer;
|
| 484 |
}
|
|
@@ -486,8 +497,8 @@ input[type='range'] {
|
|
| 486 |
input[type='range']::-webkit-slider-thumb {
|
| 487 |
-webkit-appearance: none;
|
| 488 |
appearance: none;
|
| 489 |
-
width:
|
| 490 |
-
height:
|
| 491 |
border-radius: 50%;
|
| 492 |
background: var(--primary);
|
| 493 |
cursor: pointer;
|
|
@@ -500,8 +511,8 @@ input[type='range']::-webkit-slider-thumb:hover {
|
|
| 500 |
}
|
| 501 |
|
| 502 |
input[type='range']::-moz-range-thumb {
|
| 503 |
-
width:
|
| 504 |
-
height:
|
| 505 |
border-radius: 50%;
|
| 506 |
background: var(--primary);
|
| 507 |
cursor: pointer;
|
|
@@ -512,16 +523,25 @@ input[type='range']::-moz-range-thumb {
|
|
| 512 |
.loop-controls {
|
| 513 |
display: flex;
|
| 514 |
align-items: center;
|
| 515 |
-
gap:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
}
|
| 517 |
|
| 518 |
.btn-loop {
|
| 519 |
-
min-width:
|
| 520 |
text-align: center;
|
| 521 |
font-weight: 700;
|
| 522 |
-
font-size:
|
| 523 |
-
padding:
|
| 524 |
-
border-radius:
|
| 525 |
font-family: inherit;
|
| 526 |
letter-spacing: 0.3px;
|
| 527 |
}
|
|
@@ -538,8 +558,8 @@ input[type='range']::-moz-range-thumb {
|
|
| 538 |
}
|
| 539 |
|
| 540 |
.loop-x {
|
| 541 |
-
margin-left:
|
| 542 |
-
font-size:
|
| 543 |
opacity: 0.6;
|
| 544 |
}
|
| 545 |
|
|
|
|
| 228 |
}
|
| 229 |
|
| 230 |
.controls-main {
|
| 231 |
+
height: 72px;
|
| 232 |
display: flex;
|
| 233 |
align-items: center;
|
| 234 |
justify-content: space-between;
|
| 235 |
+
padding: 0 24px;
|
| 236 |
+
gap: 20px;
|
| 237 |
}
|
| 238 |
|
| 239 |
.controls-left {
|
| 240 |
display: flex;
|
| 241 |
align-items: center;
|
| 242 |
+
gap: 16px;
|
| 243 |
min-width: 0;
|
| 244 |
flex: 1;
|
| 245 |
}
|
|
|
|
| 252 |
}
|
| 253 |
|
| 254 |
.brand-name {
|
| 255 |
+
font-size: 16px;
|
| 256 |
font-weight: 700;
|
| 257 |
background: linear-gradient(135deg, #a78bfa, #06b6d4);
|
| 258 |
-webkit-background-clip: text;
|
|
|
|
| 263 |
}
|
| 264 |
|
| 265 |
.file-name {
|
| 266 |
+
font-size: 14px;
|
| 267 |
color: var(--text-muted);
|
| 268 |
white-space: nowrap;
|
| 269 |
overflow: hidden;
|
| 270 |
text-overflow: ellipsis;
|
| 271 |
+
max-width: 240px;
|
| 272 |
+
padding-left: 16px;
|
| 273 |
border-left: 1.5px solid var(--border);
|
| 274 |
font-weight: 500;
|
| 275 |
}
|
|
|
|
| 277 |
.controls-center {
|
| 278 |
display: flex;
|
| 279 |
align-items: center;
|
| 280 |
+
gap: 10px;
|
| 281 |
flex-shrink: 0;
|
| 282 |
}
|
| 283 |
|
| 284 |
.controls-right {
|
| 285 |
display: flex;
|
| 286 |
align-items: center;
|
| 287 |
+
gap: 20px;
|
| 288 |
flex: 1;
|
| 289 |
justify-content: flex-end;
|
| 290 |
}
|
| 291 |
|
| 292 |
+
/* Transport buttons (skip back/forward) */
|
| 293 |
.transport-btn {
|
| 294 |
+
width: 48px;
|
| 295 |
+
height: 48px;
|
| 296 |
+
border-radius: 10px;
|
| 297 |
border: none;
|
| 298 |
background: var(--surface-2);
|
| 299 |
color: var(--text-muted);
|
| 300 |
cursor: pointer;
|
| 301 |
display: flex;
|
| 302 |
+
flex-direction: column;
|
| 303 |
align-items: center;
|
| 304 |
justify-content: center;
|
| 305 |
+
gap: 2px;
|
| 306 |
transition: all 0.15s;
|
| 307 |
+
border: 1px solid var(--border);
|
| 308 |
}
|
| 309 |
|
| 310 |
.transport-btn:hover {
|
| 311 |
background: var(--surface-3);
|
| 312 |
color: var(--text);
|
| 313 |
+
border-color: var(--border-hover);
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
.transport-label {
|
| 317 |
+
font-size: 10px;
|
| 318 |
+
font-weight: 600;
|
| 319 |
+
letter-spacing: 0.3px;
|
| 320 |
+
opacity: 0.7;
|
| 321 |
}
|
| 322 |
|
| 323 |
/* Play button — bold and prominent */
|
| 324 |
.play-btn {
|
| 325 |
+
width: 56px;
|
| 326 |
+
height: 56px;
|
| 327 |
border-radius: 50%;
|
| 328 |
border: none;
|
| 329 |
background: var(--primary);
|
| 330 |
color: white;
|
| 331 |
+
font-size: 20px;
|
| 332 |
cursor: pointer;
|
| 333 |
transition: all 0.2s;
|
| 334 |
display: flex;
|
| 335 |
align-items: center;
|
| 336 |
justify-content: center;
|
| 337 |
+
box-shadow: 0 0 24px var(--primary-glow);
|
| 338 |
}
|
| 339 |
|
| 340 |
.play-btn:hover {
|
| 341 |
background: var(--primary-hover);
|
| 342 |
+
box-shadow: 0 0 36px var(--primary-glow);
|
| 343 |
+
transform: scale(1.06);
|
| 344 |
}
|
| 345 |
|
| 346 |
.play-btn:active {
|
| 347 |
+
transform: scale(0.96);
|
| 348 |
}
|
| 349 |
|
| 350 |
+
/* General button */
|
| 351 |
.btn {
|
| 352 |
background: var(--surface-2);
|
| 353 |
color: var(--text-muted);
|
| 354 |
border: 1.5px solid var(--border);
|
| 355 |
border-radius: 8px;
|
| 356 |
+
padding: 8px 18px;
|
| 357 |
+
font-size: 13px;
|
| 358 |
font-weight: 600;
|
| 359 |
font-family: inherit;
|
| 360 |
cursor: pointer;
|
|
|
|
| 384 |
.tempo-control {
|
| 385 |
display: flex;
|
| 386 |
align-items: center;
|
| 387 |
+
gap: 10px;
|
| 388 |
background: var(--surface-2);
|
| 389 |
+
padding: 8px 16px;
|
| 390 |
+
border-radius: 10px;
|
| 391 |
border: 1px solid var(--border);
|
| 392 |
}
|
| 393 |
|
| 394 |
.tempo-label {
|
| 395 |
+
font-size: 12px;
|
| 396 |
+
font-weight: 700;
|
| 397 |
+
color: var(--text-muted);
|
| 398 |
text-transform: uppercase;
|
| 399 |
letter-spacing: 0.5px;
|
| 400 |
white-space: nowrap;
|
| 401 |
}
|
| 402 |
|
| 403 |
.tempo-value {
|
| 404 |
+
font-size: 14px;
|
| 405 |
+
font-weight: 700;
|
| 406 |
+
color: var(--text);
|
| 407 |
+
min-width: 40px;
|
| 408 |
text-align: right;
|
| 409 |
font-variant-numeric: tabular-nums;
|
| 410 |
}
|
| 411 |
|
| 412 |
.tempo-control input[type='range'] {
|
| 413 |
+
width: 100px;
|
| 414 |
}
|
| 415 |
|
| 416 |
/* ========================================
|
|
|
|
| 420 |
.timeline {
|
| 421 |
display: flex;
|
| 422 |
align-items: center;
|
| 423 |
+
gap: 14px;
|
| 424 |
+
padding: 0 24px 12px;
|
| 425 |
}
|
| 426 |
|
| 427 |
.timeline-time {
|
| 428 |
+
font-size: 13px;
|
| 429 |
font-weight: 600;
|
| 430 |
color: var(--text-muted);
|
| 431 |
font-variant-numeric: tabular-nums;
|
| 432 |
+
min-width: 40px;
|
| 433 |
}
|
| 434 |
|
| 435 |
.timeline-time:last-child {
|
|
|
|
| 443 |
|
| 444 |
.timeline-track input[type='range'] {
|
| 445 |
width: 100%;
|
| 446 |
+
height: 8px;
|
| 447 |
+
border-radius: 4px;
|
| 448 |
-webkit-appearance: none;
|
| 449 |
appearance: none;
|
| 450 |
outline: none;
|
|
|
|
| 453 |
}
|
| 454 |
|
| 455 |
.timeline-track input[type='range']:hover {
|
| 456 |
+
height: 10px;
|
| 457 |
}
|
| 458 |
|
| 459 |
.timeline-track input[type='range']::-webkit-slider-thumb {
|
| 460 |
-webkit-appearance: none;
|
| 461 |
appearance: none;
|
| 462 |
+
width: 16px;
|
| 463 |
+
height: 16px;
|
| 464 |
border-radius: 50%;
|
| 465 |
background: var(--primary-hover);
|
| 466 |
cursor: pointer;
|
| 467 |
border: 2px solid white;
|
| 468 |
+
box-shadow: 0 0 10px var(--primary-glow);
|
| 469 |
transition: transform 0.1s;
|
| 470 |
}
|
| 471 |
|
|
|
|
| 474 |
}
|
| 475 |
|
| 476 |
.timeline-track input[type='range']::-moz-range-thumb {
|
| 477 |
+
width: 16px;
|
| 478 |
+
height: 16px;
|
| 479 |
border-radius: 50%;
|
| 480 |
background: var(--primary-hover);
|
| 481 |
cursor: pointer;
|
| 482 |
border: 2px solid white;
|
| 483 |
+
box-shadow: 0 0 10px var(--primary-glow);
|
| 484 |
}
|
| 485 |
|
| 486 |
/* General range sliders (for tempo) */
|
|
|
|
| 488 |
-webkit-appearance: none;
|
| 489 |
appearance: none;
|
| 490 |
background: var(--border);
|
| 491 |
+
height: 5px;
|
| 492 |
+
border-radius: 3px;
|
| 493 |
outline: none;
|
| 494 |
cursor: pointer;
|
| 495 |
}
|
|
|
|
| 497 |
input[type='range']::-webkit-slider-thumb {
|
| 498 |
-webkit-appearance: none;
|
| 499 |
appearance: none;
|
| 500 |
+
width: 16px;
|
| 501 |
+
height: 16px;
|
| 502 |
border-radius: 50%;
|
| 503 |
background: var(--primary);
|
| 504 |
cursor: pointer;
|
|
|
|
| 511 |
}
|
| 512 |
|
| 513 |
input[type='range']::-moz-range-thumb {
|
| 514 |
+
width: 16px;
|
| 515 |
+
height: 16px;
|
| 516 |
border-radius: 50%;
|
| 517 |
background: var(--primary);
|
| 518 |
cursor: pointer;
|
|
|
|
| 523 |
.loop-controls {
|
| 524 |
display: flex;
|
| 525 |
align-items: center;
|
| 526 |
+
gap: 6px;
|
| 527 |
+
}
|
| 528 |
+
|
| 529 |
+
.loop-label {
|
| 530 |
+
font-size: 12px;
|
| 531 |
+
font-weight: 700;
|
| 532 |
+
color: var(--text-muted);
|
| 533 |
+
text-transform: uppercase;
|
| 534 |
+
letter-spacing: 0.5px;
|
| 535 |
+
margin-right: 2px;
|
| 536 |
}
|
| 537 |
|
| 538 |
.btn-loop {
|
| 539 |
+
min-width: 36px;
|
| 540 |
text-align: center;
|
| 541 |
font-weight: 700;
|
| 542 |
+
font-size: 13px;
|
| 543 |
+
padding: 7px 12px;
|
| 544 |
+
border-radius: 8px;
|
| 545 |
font-family: inherit;
|
| 546 |
letter-spacing: 0.3px;
|
| 547 |
}
|
|
|
|
| 558 |
}
|
| 559 |
|
| 560 |
.loop-x {
|
| 561 |
+
margin-left: 8px;
|
| 562 |
+
font-size: 15px;
|
| 563 |
opacity: 0.6;
|
| 564 |
}
|
| 565 |
|
transcriber/audio_analysis.py
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio Analysis Toolkit for Mr. Octopus Piano Transcription.
|
| 3 |
+
|
| 4 |
+
Three analysis modes:
|
| 5 |
+
1. Spectral comparison: Renders MIDI→audio via FluidSynth, compares spectrograms
|
| 6 |
+
2. Visual spectrogram: Generates PNG images for AI/human visual inspection
|
| 7 |
+
3. Audio playback: Plays original, rendered MIDI, or both side-by-side
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python audio_analysis.py compare <original_audio> <midi_file> [--output-dir ./analysis]
|
| 11 |
+
python audio_analysis.py visualize <original_audio> <midi_file> [--output-dir ./analysis]
|
| 12 |
+
python audio_analysis.py play <audio_file> [--start 10.0] [--duration 5.0]
|
| 13 |
+
python audio_analysis.py play-both <original_audio> <midi_file> [--start 10.0] [--duration 5.0]
|
| 14 |
+
python audio_analysis.py full <original_audio> <midi_file> [--output-dir ./analysis]
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import argparse
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
import subprocess
|
| 21 |
+
import tempfile
|
| 22 |
+
import numpy as np
|
| 23 |
+
|
| 24 |
+
SOUNDFONT_PATH = os.path.join(os.path.dirname(__file__), "soundfonts", "FluidR3_GM.sf2")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def render_midi_to_audio(midi_path, output_wav, sample_rate=44100):
    """Render a MIDI file to a WAV file using the FluidSynth CLI.

    Args:
        midi_path: Path to the input MIDI file.
        output_wav: Path where the rendered WAV will be written.
        sample_rate: Output sample rate in Hz.

    Returns:
        ``output_wav``, so the call can be chained.

    Raises:
        RuntimeError: If FluidSynth exits non-zero or produces no output file.
        subprocess.TimeoutExpired: If rendering exceeds 120 seconds.
        FileNotFoundError: If the ``fluidsynth`` binary is not installed.
    """
    cmd = [
        "fluidsynth",
        f"--fast-render={output_wav}",
        f"--sample-rate={sample_rate}",
        "--gain=0.5",
        "-n", "-i",  # -n: no MIDI input driver; -i: non-interactive (no shell)
        SOUNDFONT_PATH,
        midi_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    # Fix: the original only checked that the output file exists, which would
    # silently accept a partially written WAV from a renderer that crashed
    # mid-run. Treat a non-zero exit code as a failure as well.
    if result.returncode != 0 or not os.path.exists(output_wav):
        print(f"FluidSynth error: {result.stderr}")
        raise RuntimeError("FluidSynth failed to render MIDI")
    return output_wav
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def load_audio(path, sr=22050, duration=None):
    """Read an audio file and return it as a mono signal.

    Args:
        path: Audio file to read (any format librosa can decode).
        sr: Target sample rate the signal is resampled to.
        duration: Optional cap, in seconds, on how much audio to load.

    Returns:
        Tuple ``(samples, sample_rate)``.
    """
    import librosa

    signal, rate = librosa.load(path, sr=sr, mono=True, duration=duration)
    return signal, rate
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def compute_spectrogram(y, sr, hop_length=512, n_fft=2048):
    """Compute a log-magnitude (dB) mel spectrogram of *y*.

    Uses 128 mel bands; the dB scale is referenced to the spectrogram's
    own maximum, so values are <= 0.
    """
    import librosa

    mel_power = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=128
    )
    return librosa.power_to_db(mel_power, ref=np.max)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def compute_cqt(y, sr, hop_length=512):
    """Compute a constant-Q transform in dB (better pitch resolution for music).

    84 bins at 12 bins per octave give one bin per semitone over 7 octaves.
    The dB scale is referenced to the transform's own maximum.
    """
    import librosa

    magnitudes = np.abs(
        librosa.cqt(y=y, sr=sr, hop_length=hop_length, n_bins=84, bins_per_octave=12)
    )
    return librosa.amplitude_to_db(magnitudes, ref=np.max)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def align_lengths(spec_a, spec_b):
    """Trim both spectrograms to a common frame count (the shorter of the two).

    Time frames are axis 1; frequency bins (axis 0) are left untouched.
    Returns the pair of trimmed views in the same order they were given.
    """
    frames_a = spec_a.shape[1]
    frames_b = spec_b.shape[1]
    cut = frames_a if frames_a < frames_b else frames_b
    return spec_a[:, :cut], spec_b[:, :cut]
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def spectral_comparison(original_audio, midi_path, output_dir, sr=22050):
    """
    Full spectral comparison: renders MIDI to audio, computes spectrograms,
    calculates frame-by-frame divergence, and identifies problem regions.

    Side effects: creates `output_dir` if needed and writes three artifacts
    into it — `midi_rendered.wav`, `spectral_report.txt`, and
    `spectral_data.npz` (raw per-frame MSE curve for later analysis).

    Returns:
        (frame_mse, times, regions) where `frame_mse` is the per-frame
        normalized-spectrogram MSE, `times` maps frame index -> seconds,
        and `regions` is a list of (start_s, end_s, mean_mse) tuples
        sorted worst-first.
    """
    import librosa

    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Render MIDI to audio
    rendered_wav = os.path.join(output_dir, "midi_rendered.wav")
    print("Rendering MIDI to audio via FluidSynth...")
    # Render at 44.1 kHz regardless of the analysis rate `sr`; both signals
    # are resampled to `sr` on load below, so the comparison stays apples-to-apples.
    render_midi_to_audio(midi_path, rendered_wav, sample_rate=44100)

    # Step 2: Load both audio files
    print("Loading original audio...")
    y_orig, _ = load_audio(original_audio, sr=sr)
    print("Loading rendered MIDI audio...")
    y_midi, _ = load_audio(rendered_wav, sr=sr)

    # Step 3: Compute spectrograms
    hop = 512
    print("Computing spectrograms...")
    spec_orig = compute_spectrogram(y_orig, sr, hop_length=hop)
    spec_midi = compute_spectrogram(y_midi, sr, hop_length=hop)

    # Align lengths (the rendered MIDI rarely matches the original exactly)
    spec_orig, spec_midi = align_lengths(spec_orig, spec_midi)

    # Step 4: Compute frame-by-frame divergence
    # Normalize to 0-1 range for comparison; +1e-8 guards against a flat
    # (constant) spectrogram producing a divide-by-zero.
    spec_orig_norm = (spec_orig - spec_orig.min()) / (spec_orig.max() - spec_orig.min() + 1e-8)
    spec_midi_norm = (spec_midi - spec_midi.min()) / (spec_midi.max() - spec_midi.min() + 1e-8)

    # Mean squared error per frame (across frequency bins)
    frame_mse = np.mean((spec_orig_norm - spec_midi_norm) ** 2, axis=0)

    # Convert frame indices to time
    n_frames = len(frame_mse)
    times = librosa.frames_to_time(np.arange(n_frames), sr=sr, hop_length=hop)

    # Step 5: Identify problem regions (frames with high divergence)
    threshold = np.percentile(frame_mse, 90)  # top 10% divergence
    problem_mask = frame_mse > threshold

    # Group consecutive problem frames into regions.
    # NOTE(review): a region still open when the loop ends is appended
    # without the 300 ms minimum-length check — presumably intentional so
    # trailing divergence is never dropped, but confirm.
    regions = []
    in_region = False
    start = 0
    for i, is_problem in enumerate(problem_mask):
        if is_problem and not in_region:
            start = i
            in_region = True
        elif not is_problem and in_region:
            if times[i] - times[start] > 0.3:  # min 300ms regions
                regions.append((times[start], times[i - 1], np.mean(frame_mse[start:i])))
            in_region = False
    if in_region:
        regions.append((times[start], times[-1], np.mean(frame_mse[start:])))

    # Sort by divergence score (worst first)
    regions.sort(key=lambda r: r[2], reverse=True)

    # Step 6: Report
    report_path = os.path.join(output_dir, "spectral_report.txt")
    with open(report_path, "w") as f:
        f.write("SPECTRAL COMPARISON REPORT\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"Original: {original_audio}\n")
        f.write(f"MIDI: {midi_path}\n")
        f.write(f"Duration: {times[-1]:.1f}s ({n_frames} frames)\n\n")

        overall_mse = np.mean(frame_mse)
        f.write(f"Overall MSE: {overall_mse:.6f}\n")
        f.write(f"Median MSE: {np.median(frame_mse):.6f}\n")
        f.write(f"90th percentile: {threshold:.6f}\n\n")

        # Only the 20 worst regions go into the report; the full list is returned.
        f.write(f"TOP DIVERGENT REGIONS ({len(regions)} found):\n")
        f.write("-" * 60 + "\n")
        for i, (t_start, t_end, score) in enumerate(regions[:20]):
            f.write(f" {i+1:2d}. {t_start:6.1f}s - {t_end:6.1f}s "
                    f"(duration: {t_end - t_start:.1f}s) MSE: {score:.6f}\n")

    print(f"Report written to {report_path}")

    # Save raw data for further analysis
    np.savez(os.path.join(output_dir, "spectral_data.npz"),
             frame_mse=frame_mse, times=times, threshold=threshold)

    return frame_mse, times, regions
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def generate_spectrograms(original_audio, midi_path, output_dir, sr=22050):
    """
    Generate side-by-side spectrogram images for visual inspection.
    Creates: overview, difference map, and zoomed segments.

    Args:
        original_audio: Path to the reference recording.
        midi_path: Path to the MIDI transcription to compare against.
        output_dir: Directory where PNG images (and the rendered wav) are written.
        sr: Analysis sample rate for loading both signals.

    Returns:
        A list containing the path of the overview image.
    """
    # Lazy imports so the heavy plotting stack is only loaded when needed.
    import librosa
    import librosa.display
    import matplotlib
    matplotlib.use('Agg')  # headless backend: no display required
    import matplotlib.pyplot as plt

    os.makedirs(output_dir, exist_ok=True)

    # Render MIDI (reuse a previously rendered wav if present — rendering is slow)
    rendered_wav = os.path.join(output_dir, "midi_rendered.wav")
    if not os.path.exists(rendered_wav):
        print("Rendering MIDI to audio...")
        render_midi_to_audio(midi_path, rendered_wav, sample_rate=44100)

    # Load both signals at the same analysis rate
    print("Loading audio files...")
    y_orig, _ = load_audio(original_audio, sr=sr)
    y_midi, _ = load_audio(rendered_wav, sr=sr)

    hop = 512

    # CQT spectrograms (better for music than mel: log-frequency bins align
    # with semitones, so piano notes show as horizontal lines)
    print("Computing CQT spectrograms...")
    cqt_orig = compute_cqt(y_orig, sr, hop_length=hop)
    cqt_midi = compute_cqt(y_midi, sr, hop_length=hop)
    cqt_orig, cqt_midi = align_lengths(cqt_orig, cqt_midi)

    duration = min(len(y_orig), len(y_midi)) / sr

    # ===== Figure 1: Full overview side-by-side =====
    fig, axes = plt.subplots(3, 1, figsize=(20, 12), constrained_layout=True)
    fig.suptitle("Spectral Comparison: Original vs MIDI Transcription", fontsize=16, fontweight='bold')

    # Original
    img0 = axes[0].imshow(cqt_orig, aspect='auto', origin='lower',
                          extent=[0, duration, 0, 84], cmap='magma',
                          vmin=-60, vmax=0)
    axes[0].set_title("Original Audio", fontsize=13)
    axes[0].set_ylabel("CQT Bin (semitone)")
    plt.colorbar(img0, ax=axes[0], label='dB')

    # MIDI rendered
    img1 = axes[1].imshow(cqt_midi, aspect='auto', origin='lower',
                          extent=[0, duration, 0, 84], cmap='magma',
                          vmin=-60, vmax=0)
    axes[1].set_title("MIDI Transcription (rendered)", fontsize=13)
    axes[1].set_ylabel("CQT Bin (semitone)")
    plt.colorbar(img1, ax=axes[1], label='dB')

    # Difference map: signed dB difference highlights where the transcription
    # is missing energy (red) or hallucinating it (blue)
    diff = cqt_orig - cqt_midi
    img2 = axes[2].imshow(diff, aspect='auto', origin='lower',
                          extent=[0, duration, 0, 84], cmap='RdBu_r',
                          vmin=-30, vmax=30)
    axes[2].set_title("Difference (Original − MIDI): Red=missing, Blue=extra", fontsize=13)
    axes[2].set_ylabel("CQT Bin (semitone)")
    axes[2].set_xlabel("Time (seconds)")
    plt.colorbar(img2, ax=axes[2], label='dB difference')

    overview_path = os.path.join(output_dir, "spectrogram_overview.png")
    plt.savefig(overview_path, dpi=150)
    plt.close()
    print(f"Saved: {overview_path}")

    # ===== Figure 2: Zoomed segments (first 30s, middle, last 30s) =====
    segments = [
        ("Opening (0-30s)", 0, 30),
        ("Middle", max(0, duration / 2 - 15), min(duration, duration / 2 + 15)),
        ("Ending", max(0, duration - 30), duration),
    ]

    for label, t_start, t_end in segments:
        # Convert the time window to CQT frame indices, clamped to the data
        frame_start = int(t_start * sr / hop)
        frame_end = int(t_end * sr / hop)
        frame_end = min(frame_end, cqt_orig.shape[1])

        if frame_end <= frame_start:
            # Piece shorter than the segment window — nothing to plot
            continue

        fig, axes = plt.subplots(3, 1, figsize=(18, 10), constrained_layout=True)
        fig.suptitle(f"Zoomed: {label} ({t_start:.0f}s - {t_end:.0f}s)", fontsize=14, fontweight='bold')

        seg_orig = cqt_orig[:, frame_start:frame_end]
        seg_midi = cqt_midi[:, frame_start:frame_end]

        img0 = axes[0].imshow(seg_orig, aspect='auto', origin='lower',
                              extent=[t_start, t_end, 0, 84], cmap='magma',
                              vmin=-60, vmax=0)
        axes[0].set_title("Original")
        axes[0].set_ylabel("CQT Bin")
        plt.colorbar(img0, ax=axes[0])

        img1 = axes[1].imshow(seg_midi, aspect='auto', origin='lower',
                              extent=[t_start, t_end, 0, 84], cmap='magma',
                              vmin=-60, vmax=0)
        axes[1].set_title("MIDI Transcription")
        axes[1].set_ylabel("CQT Bin")
        plt.colorbar(img1, ax=axes[1])

        seg_diff = seg_orig - seg_midi
        img2 = axes[2].imshow(seg_diff, aspect='auto', origin='lower',
                              extent=[t_start, t_end, 0, 84], cmap='RdBu_r',
                              vmin=-30, vmax=30)
        axes[2].set_title("Difference (Red=missing in MIDI, Blue=extra in MIDI)")
        axes[2].set_ylabel("CQT Bin")
        axes[2].set_xlabel("Time (seconds)")
        plt.colorbar(img2, ax=axes[2])

        # Make the label filesystem-safe for use in the output filename
        safe_label = label.replace(" ", "_").replace("(", "").replace(")", "").replace("-", "_")
        seg_path = os.path.join(output_dir, f"spectrogram_{safe_label}.png")
        plt.savefig(seg_path, dpi=150)
        plt.close()
        print(f"Saved: {seg_path}")

    # ===== Figure 3: Energy envelope comparison =====
    # Mean CQT energy per frame traces overall loudness over time for both signals
    fig, ax = plt.subplots(figsize=(18, 4), constrained_layout=True)
    energy_orig = np.mean(cqt_orig, axis=0)
    energy_midi = np.mean(cqt_midi, axis=0)
    n_frames = min(len(energy_orig), len(energy_midi))
    times = librosa.frames_to_time(np.arange(n_frames), sr=sr, hop_length=hop)
    ax.plot(times, energy_orig[:n_frames], label='Original', alpha=0.8, linewidth=0.5)
    ax.plot(times, energy_midi[:n_frames], label='MIDI Transcription', alpha=0.8, linewidth=0.5)
    ax.set_xlabel("Time (seconds)")
    ax.set_ylabel("Mean CQT Energy (dB)")
    ax.set_title("Energy Envelope Comparison")
    ax.legend()

    energy_path = os.path.join(output_dir, "energy_comparison.png")
    plt.savefig(energy_path, dpi=150)
    plt.close()
    print(f"Saved: {energy_path}")

    return [overview_path]
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def play_audio(audio_path, start=None, duration=None):
    """Play audio through system speakers using afplay (macOS).

    Args:
        audio_path: Path to the audio file to play.
        start: Optional start offset in seconds. afplay has no native seek,
            so the file is trimmed to a temporary wav first.
        duration: Optional playback length in seconds (used with ``start``).

    Fixes over previous version: the temporary trimmed wav is now always
    deleted (it was leaked before), the NamedTemporaryFile handle is closed
    before writing by path, and the printed time-range label now appears for
    ``start=0.0`` (the old ``if start and duration`` truthiness check hid it).
    """
    cmd = ["afplay", audio_path]
    tmp_path = None
    try:
        if start is not None:
            # afplay doesn't support start offset natively, so we trim with python
            import soundfile as sf
            data, sr = sf.read(audio_path)
            start_sample = int(start * sr)
            if duration:
                end_sample = start_sample + int(duration * sr)
            else:
                end_sample = len(data)
            # Clamp to the valid sample range so out-of-bounds requests are safe
            start_sample = max(0, min(start_sample, len(data)))
            end_sample = max(start_sample, min(end_sample, len(data)))
            segment = data[start_sample:end_sample]

            tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            tmp.close()  # close the open handle; sf.write reopens by path
            tmp_path = tmp.name
            sf.write(tmp_path, segment, sr)
            cmd = ["afplay", tmp_path]

        # `is not None` (not truthiness) so start=0.0 still prints the range
        if start is not None and duration is not None:
            range_label = f" [{start:.1f}s - {start + duration:.1f}s]"
        else:
            range_label = ""
        print(f"Playing: {audio_path}" + range_label)
        subprocess.run(cmd)
        print("Playback finished.")
    finally:
        # Always remove the temporary trimmed file, even if playback fails
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def play_comparison(original_audio, midi_path, start=None, duration=None):
    """Play original then MIDI rendering back-to-back for comparison.

    Args:
        original_audio: Path to the reference recording.
        midi_path: Path to the MIDI transcription.
        start: Optional start offset in seconds, forwarded to play_audio.
        duration: Optional playback length in seconds, forwarded to play_audio.

    Fixes over previous version: the rendered temp wav is cleaned up in a
    ``finally`` block (it leaked if playback raised), the NamedTemporaryFile
    handle is closed before rendering writes by path, and the unused
    ``soundfile`` import is gone.
    """
    # Render MIDI to a temporary wav; ensure cleanup even if playback fails
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()  # close the handle so the renderer can write by path
    rendered_wav = tmp.name
    try:
        print("Rendering MIDI to audio...")
        render_midi_to_audio(midi_path, rendered_wav, sample_rate=44100)

        print("\n--- Playing ORIGINAL ---")
        play_audio(original_audio, start=start, duration=duration)

        print("\n--- Playing MIDI TRANSCRIPTION ---")
        play_audio(rendered_wav, start=start, duration=duration)
    finally:
        if os.path.exists(rendered_wav):
            os.unlink(rendered_wav)
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def full_analysis(original_audio, midi_path, output_dir):
    """Run all analyses: spectral comparison + visual spectrograms.

    Returns the list of divergent regions found by the spectral comparison.
    """
    banner = "=" * 60
    print(banner)
    print("FULL AUDIO ANALYSIS")
    print(banner)

    # 1. Spectral comparison (metrics + report)
    print("\n[1/2] Running spectral comparison...")
    frame_mse, times, regions = spectral_comparison(original_audio, midi_path, output_dir)

    # 2. Visual spectrograms
    print("\n[2/2] Generating visual spectrograms...")
    images = generate_spectrograms(original_audio, midi_path, output_dir)

    # Summary of everything written to output_dir
    summary_lines = [
        "\n" + banner,
        f"Analysis complete! Results in: {output_dir}",
        " - spectral_report.txt (divergence metrics + problem regions)",
        " - spectrogram_overview.png (full comparison)",
        " - spectrogram_*.png (zoomed segments)",
        " - energy_comparison.png (energy envelopes)",
        " - midi_rendered.wav (MIDI rendered to audio for listening)",
        banner,
    ]
    for line in summary_lines:
        print(line)

    return regions
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def main():
    """CLI entry point: dispatch to the analysis sub-commands."""
    parser = argparse.ArgumentParser(description="Audio analysis toolkit for piano transcription")
    subparsers = parser.add_subparsers(dest="command", required=True)

    def _add_pair_command(name, help_text):
        # Shared shape: original + midi + --output-dir
        sub = subparsers.add_parser(name, help=help_text)
        sub.add_argument("original", help="Original audio file")
        sub.add_argument("midi", help="MIDI transcription file")
        sub.add_argument("--output-dir", default="./analysis", help="Output directory")
        return sub

    def _add_time_window(sub):
        # Shared optional playback window
        sub.add_argument("--start", type=float, default=None, help="Start time in seconds")
        sub.add_argument("--duration", type=float, default=None, help="Duration in seconds")

    # compare
    _add_pair_command("compare", "Spectral comparison")

    # visualize
    _add_pair_command("visualize", "Generate spectrogram images")

    # play
    p_play = subparsers.add_parser("play", help="Play an audio file")
    p_play.add_argument("audio", help="Audio file to play")
    _add_time_window(p_play)

    # play-both
    p_both = subparsers.add_parser("play-both", help="Play original then MIDI back-to-back")
    p_both.add_argument("original", help="Original audio file")
    p_both.add_argument("midi", help="MIDI transcription file")
    _add_time_window(p_both)

    # full
    _add_pair_command("full", "Run all analyses")

    args = parser.parse_args()

    command = args.command
    if command == "compare":
        spectral_comparison(args.original, args.midi, args.output_dir)
    elif command == "visualize":
        generate_spectrograms(args.original, args.midi, args.output_dir)
    elif command == "play":
        play_audio(args.audio, start=args.start, duration=args.duration)
    elif command == "play-both":
        play_comparison(args.original, args.midi, start=args.start, duration=args.duration)
    elif command == "full":
        full_analysis(args.original, args.midi, args.output_dir)


if __name__ == "__main__":
    main()
|
transcriber/optimize.py
CHANGED
|
@@ -15,6 +15,7 @@ def remove_leading_silence_notes(midi_data, y, sr):
|
|
| 15 |
|
| 16 |
Finds the first moment of real musical energy and removes any MIDI notes
|
| 17 |
before that point (typically microphone rumble / low-freq noise artifacts).
|
|
|
|
| 18 |
"""
|
| 19 |
midi_out = copy.deepcopy(midi_data)
|
| 20 |
|
|
@@ -28,17 +29,32 @@ def remove_leading_silence_notes(midi_data, y, sr):
|
|
| 28 |
if len(rms) == 0:
|
| 29 |
return midi_out, 0, 0.0
|
| 30 |
|
| 31 |
-
# Music starts when RMS first exceeds
|
|
|
|
| 32 |
max_rms = np.max(rms)
|
| 33 |
music_start = 0.0
|
| 34 |
for i, r in enumerate(rms):
|
| 35 |
-
if r > max_rms * 0.
|
| 36 |
music_start = i * 0.05
|
| 37 |
break
|
| 38 |
|
| 39 |
if music_start < 0.1:
|
| 40 |
return midi_out, 0, music_start
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
removed = 0
|
| 43 |
for instrument in midi_out.instruments:
|
| 44 |
filtered = []
|
|
@@ -53,7 +69,11 @@ def remove_leading_silence_notes(midi_data, y, sr):
|
|
| 53 |
|
| 54 |
|
| 55 |
def remove_trailing_silence_notes(midi_data, y, sr):
|
| 56 |
-
"""Remove notes that appear during the audio fade-out/silence at the end.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
midi_out = copy.deepcopy(midi_data)
|
| 58 |
|
| 59 |
hop = int(0.05 * sr)
|
|
@@ -66,13 +86,18 @@ def remove_trailing_silence_notes(midi_data, y, sr):
|
|
| 66 |
|
| 67 |
max_rms = np.max(rms)
|
| 68 |
|
| 69 |
-
# Find the last moment where RMS exceeds
|
|
|
|
| 70 |
music_end = len(y) / sr
|
| 71 |
for i in range(len(rms) - 1, -1, -1):
|
| 72 |
-
if rms[i] > max_rms * 0.
|
| 73 |
-
|
|
|
|
| 74 |
break
|
| 75 |
|
|
|
|
|
|
|
|
|
|
| 76 |
removed = 0
|
| 77 |
for instrument in midi_out.instruments:
|
| 78 |
filtered = []
|
|
@@ -150,13 +175,17 @@ def remove_low_energy_notes(midi_data, y, sr, hop_length=512):
|
|
| 150 |
def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
|
| 151 |
"""Remove notes that are harmonic doublings of louder lower notes.
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
Uses CQT energy to protect strong notes: if the CQT shows the note
|
| 157 |
-
has strong energy
|
| 158 |
-
|
| 159 |
-
co-occur with C5 but are genuinely played.
|
| 160 |
"""
|
| 161 |
midi_out = copy.deepcopy(midi_data)
|
| 162 |
removed = 0
|
|
@@ -165,6 +194,7 @@ def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
|
|
| 165 |
|
| 166 |
# Compute CQT for energy verification if audio provided
|
| 167 |
C_db = None
|
|
|
|
| 168 |
if y is not None:
|
| 169 |
N_BINS = 88 * 3
|
| 170 |
FMIN = librosa.note_to_hz('A0')
|
|
@@ -222,6 +252,47 @@ def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
|
|
| 222 |
to_remove.add(i)
|
| 223 |
break
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
instrument.notes = [n for k, n in enumerate(notes) if k not in to_remove]
|
| 226 |
removed += len(to_remove)
|
| 227 |
|
|
@@ -288,7 +359,7 @@ def remove_phantom_notes(midi_data, max_pitch=None):
|
|
| 288 |
return midi_out, removed
|
| 289 |
|
| 290 |
|
| 291 |
-
def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
|
| 292 |
"""Remove MIDI notes that form false-positive onsets not backed by audio.
|
| 293 |
|
| 294 |
Analysis shows 37 extra MIDI onsets cause the biggest F1 drag (precision=0.918).
|
|
@@ -302,12 +373,27 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
|
|
| 302 |
3. Short+quiet artifacts: onsets where every note is both short (<200ms)
|
| 303 |
and quiet (velocity < 50).
|
| 304 |
|
|
|
|
|
|
|
|
|
|
| 305 |
The filter first identifies which MIDI onsets already match audio onsets,
|
| 306 |
then only removes unmatched onsets meeting the above criteria.
|
| 307 |
"""
|
| 308 |
midi_out = copy.deepcopy(midi_data)
|
| 309 |
tolerance = 0.05
|
| 310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
|
| 312 |
onset_times = librosa.frames_to_time(
|
| 313 |
np.arange(len(onset_env)), sr=sr, hop_length=hop_length
|
|
@@ -359,12 +445,12 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
|
|
| 359 |
# Category 1: Chord fragment -- near a matched onset, but only if
|
| 360 |
# the onset has weak audio energy. Strong onsets near chords may be
|
| 361 |
# real grace notes or arpeggios.
|
| 362 |
-
if near_matched and strength < 2.0:
|
| 363 |
onsets_to_remove.add(j)
|
| 364 |
continue
|
| 365 |
|
| 366 |
# Category 2: Isolated ghost -- single note, low strength or far from audio
|
| 367 |
-
if len(onset_notes) == 1 and (strength < 1.5 or nearest_audio_ms > 100):
|
| 368 |
onsets_to_remove.add(j)
|
| 369 |
continue
|
| 370 |
|
|
@@ -377,14 +463,14 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
|
|
| 377 |
# low velocity (< 35), far from audio onset. These are rumble artifacts
|
| 378 |
# that survive the energy filter.
|
| 379 |
if (len(onset_notes) == 1 and onset_notes[0].pitch < 40
|
| 380 |
-
and onset_notes[0].velocity < 35 and nearest_audio_ms > 60):
|
| 381 |
onsets_to_remove.add(j)
|
| 382 |
continue
|
| 383 |
|
| 384 |
# Category 5: Multi-note onset far from any audio onset (> 120ms)
|
| 385 |
# with weak-to-moderate onset strength. These are chord-split artifacts
|
| 386 |
# or hallucinated events with no audio support.
|
| 387 |
-
if nearest_audio_ms > 120 and strength < 3.0:
|
| 388 |
onsets_to_remove.add(j)
|
| 389 |
continue
|
| 390 |
|
|
@@ -397,7 +483,7 @@ def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512):
|
|
| 397 |
# Category 7: Moderate distance from audio (> 70ms) with weak
|
| 398 |
# onset strength — catches near-miss hallucinations that are
|
| 399 |
# just outside the 50ms matching window.
|
| 400 |
-
if nearest_audio_ms > 70 and strength < 2.5:
|
| 401 |
onsets_to_remove.add(j)
|
| 402 |
continue
|
| 403 |
|
|
@@ -633,6 +719,66 @@ def limit_total_concurrent(midi_data, max_per_hand=4, hand_split=60):
|
|
| 633 |
return midi_out, trimmed
|
| 634 |
|
| 635 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand_split=60):
|
| 637 |
"""Extend MIDI note durations to match audio CQT energy decay.
|
| 638 |
|
|
@@ -658,6 +804,14 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
|
|
| 658 |
C_norm = (C_norm + 80.0) / 80.0
|
| 659 |
n_frames = C.shape[1]
|
| 660 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
# Pre-compute per-frame concurrent counts per hand (fast O(1) lookup)
|
| 662 |
right_count = np.zeros(n_frames, dtype=int)
|
| 663 |
left_count = np.zeros(n_frames, dtype=int)
|
|
@@ -671,6 +825,7 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
|
|
| 671 |
left_count[sf:ef] += 1
|
| 672 |
|
| 673 |
extended = 0
|
|
|
|
| 674 |
for inst in midi_out.instruments:
|
| 675 |
# Sort notes by start time for overlap checking
|
| 676 |
notes_sorted = sorted(inst.notes, key=lambda n: (n.pitch, n.start))
|
|
@@ -681,8 +836,13 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
|
|
| 681 |
continue
|
| 682 |
|
| 683 |
end_frame = min(n_frames, int(note.end * sr / hop_length))
|
| 684 |
-
|
| 685 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
|
| 687 |
# Don't extend into the next note at the same pitch
|
| 688 |
next_start_frame = max_extend
|
|
@@ -698,7 +858,7 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
|
|
| 698 |
for f in range(end_frame, min(max_extend, next_start_frame)):
|
| 699 |
lo = max(0, fund_bin - 1)
|
| 700 |
hi = min(N_BINS, fund_bin + 2)
|
| 701 |
-
if np.mean(C_norm[lo:hi, f]) >
|
| 702 |
# Check concurrent: this note isn't counted in hand_count
|
| 703 |
# beyond end_frame, so hand_count[f] >= max_per_hand means
|
| 704 |
# extending here would create max_per_hand + 1 concurrent
|
|
@@ -717,6 +877,8 @@ def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand
|
|
| 717 |
hand_count[old_end_frame:new_end_frame] += 1
|
| 718 |
note.end = new_end
|
| 719 |
extended += 1
|
|
|
|
|
|
|
| 720 |
|
| 721 |
return midi_out, extended
|
| 722 |
|
|
@@ -1084,6 +1246,48 @@ def recover_missing_notes(midi_data, y, sr, hop_length=512, snap_onsets=None):
|
|
| 1084 |
return midi_out, recovered
|
| 1085 |
|
| 1086 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1087 |
def optimize(original_audio_path, midi_path, output_path=None):
|
| 1088 |
"""Full optimization pipeline."""
|
| 1089 |
if output_path is None:
|
|
@@ -1114,6 +1318,12 @@ def optimize(original_audio_path, midi_path, output_path=None):
|
|
| 1114 |
total_notes = sum(len(inst.notes) for inst in midi_data.instruments)
|
| 1115 |
print(f" {total_notes} MIDI notes")
|
| 1116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1117 |
# Step 0: Remove notes in leading silence (mic rumble artifacts)
|
| 1118 |
print("\nStep 0: Removing notes in leading silence...")
|
| 1119 |
midi_data, silence_removed, music_start = remove_leading_silence_notes(midi_data, y, sr)
|
|
@@ -1190,7 +1400,7 @@ def optimize(original_audio_path, midi_path, output_path=None):
|
|
| 1190 |
# Step 6b: Remove spurious false-positive onsets
|
| 1191 |
print("\nStep 6b: Removing spurious onsets (false positive cleanup)...")
|
| 1192 |
midi_data, spurious_notes, spurious_onsets = remove_spurious_onsets(
|
| 1193 |
-
midi_data, y, sr, ref_onsets, hop_length
|
| 1194 |
)
|
| 1195 |
print(f" Removed {spurious_notes} notes across {spurious_onsets} spurious onsets")
|
| 1196 |
|
|
@@ -1246,14 +1456,16 @@ def optimize(original_audio_path, midi_path, output_path=None):
|
|
| 1246 |
)
|
| 1247 |
print(f" Recovered {notes_recovered} notes from CQT energy")
|
| 1248 |
|
| 1249 |
-
# Step 8f: Playability filter — limit per-onset chord size
|
| 1250 |
-
|
| 1251 |
-
|
|
|
|
|
|
|
| 1252 |
print(f" Removed {playability_removed} excess chord notes")
|
| 1253 |
|
| 1254 |
-
# Step 8g: Limit total concurrent sounding notes
|
| 1255 |
-
print("\nStep 8g: Concurrent sounding limit (max
|
| 1256 |
-
midi_data, sustain_trimmed = limit_total_concurrent(midi_data, max_per_hand=
|
| 1257 |
print(f" Trimmed {sustain_trimmed} sustained notes to reduce pileup")
|
| 1258 |
|
| 1259 |
# Final metrics
|
|
|
|
| 15 |
|
| 16 |
Finds the first moment of real musical energy and removes any MIDI notes
|
| 17 |
before that point (typically microphone rumble / low-freq noise artifacts).
|
| 18 |
+
Always preserves the first detected MIDI note to prevent eating the opening.
|
| 19 |
"""
|
| 20 |
midi_out = copy.deepcopy(midi_data)
|
| 21 |
|
|
|
|
| 29 |
if len(rms) == 0:
|
| 30 |
return midi_out, 0, 0.0
|
| 31 |
|
| 32 |
+
# Music starts when RMS first exceeds 5% of the peak energy
|
| 33 |
+
# (reduced from 10% to avoid eating quiet openings)
|
| 34 |
max_rms = np.max(rms)
|
| 35 |
music_start = 0.0
|
| 36 |
for i, r in enumerate(rms):
|
| 37 |
+
if r > max_rms * 0.05:
|
| 38 |
music_start = i * 0.05
|
| 39 |
break
|
| 40 |
|
| 41 |
if music_start < 0.1:
|
| 42 |
return midi_out, 0, music_start
|
| 43 |
|
| 44 |
+
# Find the earliest MIDI note onset — always protect it
|
| 45 |
+
all_notes = sorted(
|
| 46 |
+
[n for inst in midi_out.instruments for n in inst.notes],
|
| 47 |
+
key=lambda n: n.start
|
| 48 |
+
)
|
| 49 |
+
earliest_onset = all_notes[0].start if all_notes else 0.0
|
| 50 |
+
|
| 51 |
+
# If the "silence" region would eat the first note, clamp music_start
|
| 52 |
+
if music_start > earliest_onset:
|
| 53 |
+
music_start = earliest_onset
|
| 54 |
+
|
| 55 |
+
if music_start < 0.1:
|
| 56 |
+
return midi_out, 0, music_start
|
| 57 |
+
|
| 58 |
removed = 0
|
| 59 |
for instrument in midi_out.instruments:
|
| 60 |
filtered = []
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
def remove_trailing_silence_notes(midi_data, y, sr):
|
| 72 |
+
"""Remove notes that appear during the audio fade-out/silence at the end.
|
| 73 |
+
|
| 74 |
+
Uses a 2% RMS threshold (reduced from 5%) and adds a 3-second protection
|
| 75 |
+
zone after the detected music end to preserve natural piano decay/sustain.
|
| 76 |
+
"""
|
| 77 |
midi_out = copy.deepcopy(midi_data)
|
| 78 |
|
| 79 |
hop = int(0.05 * sr)
|
|
|
|
| 86 |
|
| 87 |
max_rms = np.max(rms)
|
| 88 |
|
| 89 |
+
# Find the last moment where RMS exceeds 2% of peak (searching backwards)
|
| 90 |
+
# Reduced from 5% to preserve quiet endings, fade-outs, and final sustain
|
| 91 |
music_end = len(y) / sr
|
| 92 |
for i in range(len(rms) - 1, -1, -1):
|
| 93 |
+
if rms[i] > max_rms * 0.02:
|
| 94 |
+
# Add 3-second protection zone for natural piano decay
|
| 95 |
+
music_end = (i + 1) * 0.05 + 3.0
|
| 96 |
break
|
| 97 |
|
| 98 |
+
# Clamp to actual audio duration
|
| 99 |
+
music_end = min(music_end, len(y) / sr)
|
| 100 |
+
|
| 101 |
removed = 0
|
| 102 |
for instrument in midi_out.instruments:
|
| 103 |
filtered = []
|
|
|
|
| 175 |
def remove_harmonic_ghosts(midi_data, y=None, sr=22050, hop_length=512):
|
| 176 |
"""Remove notes that are harmonic doublings of louder lower notes.
|
| 177 |
|
| 178 |
+
Two-stage detector:
|
| 179 |
+
1. Pairwise: for notes at harmonic intervals (7, 12, 19, 24 semitones),
|
| 180 |
+
remove the upper note if it's clearly a harmonic ghost.
|
| 181 |
+
2. Spectral masking: when bass + melody overlap (two-hand texture),
|
| 182 |
+
check if upper notes can be explained by the harmonic series of
|
| 183 |
+
strong lower notes. This catches ghost notes that the pairwise
|
| 184 |
+
detector misses because they're at non-standard intervals.
|
| 185 |
|
| 186 |
Uses CQT energy to protect strong notes: if the CQT shows the note
|
| 187 |
+
has strong independent energy distinct from what the lower note's
|
| 188 |
+
harmonics would produce, it's a real played note.
|
|
|
|
| 189 |
"""
|
| 190 |
midi_out = copy.deepcopy(midi_data)
|
| 191 |
removed = 0
|
|
|
|
| 194 |
|
| 195 |
# Compute CQT for energy verification if audio provided
|
| 196 |
C_db = None
|
| 197 |
+
N_BINS = 0
|
| 198 |
if y is not None:
|
| 199 |
N_BINS = 88 * 3
|
| 200 |
FMIN = librosa.note_to_hz('A0')
|
|
|
|
| 252 |
to_remove.add(i)
|
| 253 |
break
|
| 254 |
|
| 255 |
+
# Stage 2: Spectral masking for two-hand texture
|
| 256 |
+
# When bass (< MIDI 55) and melody (>= MIDI 60) overlap, bass harmonics
|
| 257 |
+
# can produce ghost notes in the melody range. Check if a mid-range note
|
| 258 |
+
# is explainable as a harmonic partial of a concurrent bass note.
|
| 259 |
+
if C_db is not None:
|
| 260 |
+
remaining = [(k, n) for k, n in enumerate(notes) if k not in to_remove]
|
| 261 |
+
bass_notes = [(k, n) for k, n in remaining if n.pitch < 55]
|
| 262 |
+
mid_notes = [(k, n) for k, n in remaining if 55 <= n.pitch < 72]
|
| 263 |
+
|
| 264 |
+
for mid_k, mid_n in mid_notes:
|
| 265 |
+
if mid_k in to_remove:
|
| 266 |
+
continue
|
| 267 |
+
for bass_k, bass_n in bass_notes:
|
| 268 |
+
if abs(bass_n.start - mid_n.start) > 0.05:
|
| 269 |
+
continue
|
| 270 |
+
# Check if mid_n.pitch matches any harmonic partial of bass_n
|
| 271 |
+
# Harmonics: 2nd (+12), 3rd (+19), 4th (+24), 5th (+28), 6th (+31)
|
| 272 |
+
bass_pitch = bass_n.pitch
|
| 273 |
+
harmonic_pitches = {
|
| 274 |
+
bass_pitch + 12, # 2nd harmonic (octave)
|
| 275 |
+
bass_pitch + 19, # 3rd (octave + fifth)
|
| 276 |
+
bass_pitch + 24, # 4th (2 octaves)
|
| 277 |
+
bass_pitch + 28, # 5th (2 oct + major 3rd)
|
| 278 |
+
bass_pitch + 31, # 6th (2 oct + fifth)
|
| 279 |
+
}
|
| 280 |
+
if mid_n.pitch in harmonic_pitches:
|
| 281 |
+
# This mid note matches a bass harmonic — check if
|
| 282 |
+
# it has independent CQT energy above the harmonic level
|
| 283 |
+
mid_bin = (mid_n.pitch - 21) * 3 + 1
|
| 284 |
+
bass_bin = (bass_pitch - 21) * 3 + 1
|
| 285 |
+
if 0 <= mid_bin < N_BINS and 0 <= bass_bin < N_BINS:
|
| 286 |
+
sf = max(0, int(mid_n.start * sr / hop_length))
|
| 287 |
+
ef = min(C_db.shape[1], sf + max(1, int(0.15 * sr / hop_length)))
|
| 288 |
+
mid_energy = float(np.max(C_db[max(0, mid_bin-1):min(N_BINS, mid_bin+2), sf:ef]))
|
| 289 |
+
bass_energy = float(np.max(C_db[max(0, bass_bin-1):min(N_BINS, bass_bin+2), sf:ef]))
|
| 290 |
+
# If bass is much louder (>8dB) and mid note is quiet,
|
| 291 |
+
# it's likely a harmonic ghost
|
| 292 |
+
if bass_energy - mid_energy > 8.0 and mid_n.velocity < bass_n.velocity * 0.7:
|
| 293 |
+
to_remove.add(mid_k)
|
| 294 |
+
break
|
| 295 |
+
|
| 296 |
instrument.notes = [n for k, n in enumerate(notes) if k not in to_remove]
|
| 297 |
removed += len(to_remove)
|
| 298 |
|
|
|
|
| 359 |
return midi_out, removed
|
| 360 |
|
| 361 |
|
| 362 |
+
def remove_spurious_onsets(midi_data, y, sr, ref_onsets, hop_length=512, complexity='simple'):
|
| 363 |
"""Remove MIDI notes that form false-positive onsets not backed by audio.
|
| 364 |
|
| 365 |
Analysis shows 37 extra MIDI onsets cause the biggest F1 drag (precision=0.918).
|
|
|
|
| 373 |
3. Short+quiet artifacts: onsets where every note is both short (<200ms)
|
| 374 |
and quiet (velocity < 50).
|
| 375 |
|
| 376 |
+
For complex pieces, thresholds are relaxed to preserve legitimate dense
|
| 377 |
+
textures that might otherwise be classified as spurious.
|
| 378 |
+
|
| 379 |
The filter first identifies which MIDI onsets already match audio onsets,
|
| 380 |
then only removes unmatched onsets meeting the above criteria.
|
| 381 |
"""
|
| 382 |
midi_out = copy.deepcopy(midi_data)
|
| 383 |
tolerance = 0.05
|
| 384 |
|
| 385 |
+
# Complexity-adjusted thresholds: complex pieces are more permissive
|
| 386 |
+
# to preserve legitimate dense textures
|
| 387 |
+
if complexity == 'complex':
|
| 388 |
+
strength_scale = 1.5 # require stronger evidence to remove
|
| 389 |
+
dist_scale = 1.4 # require further from audio onset to remove
|
| 390 |
+
elif complexity == 'moderate':
|
| 391 |
+
strength_scale = 1.2
|
| 392 |
+
dist_scale = 1.2
|
| 393 |
+
else:
|
| 394 |
+
strength_scale = 1.0
|
| 395 |
+
dist_scale = 1.0
|
| 396 |
+
|
| 397 |
onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
|
| 398 |
onset_times = librosa.frames_to_time(
|
| 399 |
np.arange(len(onset_env)), sr=sr, hop_length=hop_length
|
|
|
|
| 445 |
# Category 1: Chord fragment -- near a matched onset, but only if
|
| 446 |
# the onset has weak audio energy. Strong onsets near chords may be
|
| 447 |
# real grace notes or arpeggios.
|
| 448 |
+
if near_matched and strength < 2.0 * strength_scale:
|
| 449 |
onsets_to_remove.add(j)
|
| 450 |
continue
|
| 451 |
|
| 452 |
# Category 2: Isolated ghost -- single note, low strength or far from audio
|
| 453 |
+
if len(onset_notes) == 1 and (strength < 1.5 * strength_scale or nearest_audio_ms > 100 * dist_scale):
|
| 454 |
onsets_to_remove.add(j)
|
| 455 |
continue
|
| 456 |
|
|
|
|
| 463 |
# low velocity (< 35), far from audio onset. These are rumble artifacts
|
| 464 |
# that survive the energy filter.
|
| 465 |
if (len(onset_notes) == 1 and onset_notes[0].pitch < 40
|
| 466 |
+
and onset_notes[0].velocity < 35 and nearest_audio_ms > 60 * dist_scale):
|
| 467 |
onsets_to_remove.add(j)
|
| 468 |
continue
|
| 469 |
|
| 470 |
# Category 5: Multi-note onset far from any audio onset (> 120ms)
|
| 471 |
# with weak-to-moderate onset strength. These are chord-split artifacts
|
| 472 |
# or hallucinated events with no audio support.
|
| 473 |
+
if nearest_audio_ms > 120 * dist_scale and strength < 3.0 * strength_scale:
|
| 474 |
onsets_to_remove.add(j)
|
| 475 |
continue
|
| 476 |
|
|
|
|
| 483 |
# Category 7: Moderate distance from audio (> 70ms) with weak
|
| 484 |
# onset strength — catches near-miss hallucinations that are
|
| 485 |
# just outside the 50ms matching window.
|
| 486 |
+
if nearest_audio_ms > 70 * dist_scale and strength < 2.5 * strength_scale:
|
| 487 |
onsets_to_remove.add(j)
|
| 488 |
continue
|
| 489 |
|
|
|
|
| 719 |
return midi_out, trimmed
|
| 720 |
|
| 721 |
|
| 722 |
+
def detect_sustain_regions(y, sr, hop_length=512):
    """Detect regions where the sustain pedal is likely engaged.

    Analyzes spectral flux and broadband energy decay. When the sustain pedal
    is held, notes ring longer and the spectral energy decays slowly instead
    of dropping abruptly at note release. Detects this by looking for:
    1. Low spectral flux (sustained timbre, no new attacks)
    2. Slow energy decay (notes ringing through pedal)

    Parameters
    ----------
    y : np.ndarray
        Mono audio signal.
    sr : int
        Sample rate of ``y``.
    hop_length : int
        Analysis hop length in samples (frame rate = sr / hop_length).

    Returns a boolean array (per frame) indicating sustained regions.
    """
    # Spectral flux: frame-to-frame rate of spectral change. Low flux means
    # the timbre is sustained with no new note attacks.
    S = np.abs(librosa.stft(y, hop_length=hop_length))
    flux = np.sqrt(np.mean(np.diff(S, axis=1) ** 2, axis=0))
    flux = np.concatenate([[0], flux])  # pad to match frame count

    # Broadband RMS energy per frame.
    rms = librosa.feature.rms(y=y, hop_length=hop_length)[0]

    # Normalize both toward [0, 1]. The 95th percentile (rather than the max)
    # keeps flux normalization robust to one outlier attack.
    flux_norm = flux / (np.percentile(flux, 95) + 1e-8)
    rms_norm = rms / (np.max(rms) + 1e-8)

    # STFT and RMS framing can differ by a frame or two due to padding; align.
    n_frames = min(len(flux_norm), len(rms_norm))
    flux_norm = flux_norm[:n_frames]
    rms_norm = rms_norm[:n_frames]

    # Sustain pedal indicators:
    # - Low spectral flux (< 30th percentile) = sustained sound, not new attacks
    # - Moderate+ energy (> 10% of peak) = notes are still ringing
    flux_thresh = np.percentile(flux_norm, 30)
    sustain_mask = (flux_norm < flux_thresh) & (rms_norm > 0.10)

    # Smooth: close 200ms gaps, remove blips shorter than 300ms
    close_frames = max(1, int(0.2 * sr / hop_length))
    min_region = max(1, int(0.3 * sr / hop_length))

    # Morphological closing: fill a gap frame when sustained frames exist on
    # both sides within close_frames.
    for i in range(1, n_frames - 1):
        if not sustain_mask[i]:
            before = any(sustain_mask[max(0, i - close_frames):i])
            after = any(sustain_mask[i + 1:min(n_frames, i + close_frames + 1)])
            if before and after:
                sustain_mask[i] = True

    # Remove short blips (regions shorter than min_region frames).
    in_region = False
    start = 0
    for i in range(n_frames):
        if sustain_mask[i] and not in_region:
            start = i
            in_region = True
        elif not sustain_mask[i] and in_region:
            if i - start < min_region:
                sustain_mask[start:i] = False
            in_region = False
    # Bug fix: a region still open at the final frame was previously never
    # length-checked, so a sub-300ms blip at the end of the audio survived.
    if in_region and n_frames - start < min_region:
        sustain_mask[start:n_frames] = False

    return sustain_mask
|
| 780 |
+
|
| 781 |
+
|
| 782 |
def extend_note_durations(midi_data, y, sr, hop_length=512, max_per_hand=4, hand_split=60):
|
| 783 |
"""Extend MIDI note durations to match audio CQT energy decay.
|
| 784 |
|
|
|
|
| 804 |
C_norm = (C_norm + 80.0) / 80.0
|
| 805 |
n_frames = C.shape[1]
|
| 806 |
|
| 807 |
+
# Detect sustain pedal regions for longer extension allowance
|
| 808 |
+
sustain_mask = detect_sustain_regions(y, sr, hop_length)
|
| 809 |
+
# Pad/trim to match CQT frame count
|
| 810 |
+
if len(sustain_mask) < n_frames:
|
| 811 |
+
sustain_mask = np.concatenate([sustain_mask, np.zeros(n_frames - len(sustain_mask), dtype=bool)])
|
| 812 |
+
else:
|
| 813 |
+
sustain_mask = sustain_mask[:n_frames]
|
| 814 |
+
|
| 815 |
# Pre-compute per-frame concurrent counts per hand (fast O(1) lookup)
|
| 816 |
right_count = np.zeros(n_frames, dtype=int)
|
| 817 |
left_count = np.zeros(n_frames, dtype=int)
|
|
|
|
| 825 |
left_count[sf:ef] += 1
|
| 826 |
|
| 827 |
extended = 0
|
| 828 |
+
sustain_extended = 0
|
| 829 |
for inst in midi_out.instruments:
|
| 830 |
# Sort notes by start time for overlap checking
|
| 831 |
notes_sorted = sorted(inst.notes, key=lambda n: (n.pitch, n.start))
|
|
|
|
| 836 |
continue
|
| 837 |
|
| 838 |
end_frame = min(n_frames, int(note.end * sr / hop_length))
|
| 839 |
+
|
| 840 |
+
# In sustain regions, allow longer extension (4s) and lower threshold
|
| 841 |
+
in_sustain = end_frame < n_frames and sustain_mask[min(end_frame, n_frames - 1)]
|
| 842 |
+
max_ext_seconds = 4.0 if in_sustain else 2.0
|
| 843 |
+
energy_thresh = 0.15 if in_sustain else 0.20
|
| 844 |
+
|
| 845 |
+
max_extend = min(n_frames, end_frame + int(max_ext_seconds * sr / hop_length))
|
| 846 |
|
| 847 |
# Don't extend into the next note at the same pitch
|
| 848 |
next_start_frame = max_extend
|
|
|
|
| 858 |
for f in range(end_frame, min(max_extend, next_start_frame)):
|
| 859 |
lo = max(0, fund_bin - 1)
|
| 860 |
hi = min(N_BINS, fund_bin + 2)
|
| 861 |
+
if np.mean(C_norm[lo:hi, f]) > energy_thresh:
|
| 862 |
# Check concurrent: this note isn't counted in hand_count
|
| 863 |
# beyond end_frame, so hand_count[f] >= max_per_hand means
|
| 864 |
# extending here would create max_per_hand + 1 concurrent
|
|
|
|
| 877 |
hand_count[old_end_frame:new_end_frame] += 1
|
| 878 |
note.end = new_end
|
| 879 |
extended += 1
|
| 880 |
+
if in_sustain:
|
| 881 |
+
sustain_extended += 1
|
| 882 |
|
| 883 |
return midi_out, extended
|
| 884 |
|
|
|
|
| 1246 |
return midi_out, recovered
|
| 1247 |
|
| 1248 |
|
| 1249 |
+
def estimate_complexity(midi_data, audio_duration):
    """Estimate piece complexity to adjust filter aggressiveness.

    Returns a dict with:
    - note_density: notes per second
    - avg_polyphony: average concurrent notes at any onset
    - complexity: 'simple' (<4 n/s), 'moderate' (4-8), 'complex' (>8)

    Complex pieces need less aggressive ghost removal and wider tolerance
    for concurrent notes, since dense textures are intentional.
    """
    from bisect import bisect_left, bisect_right

    all_notes = sorted(
        (n for inst in midi_data.instruments for n in inst.notes),
        key=lambda n: n.start,
    )
    if not all_notes or audio_duration < 1:
        return {'note_density': 0, 'avg_polyphony': 1, 'complexity': 'simple'}

    note_density = len(all_notes) / audio_duration

    # Average polyphony: notes starting within a strict +/-30ms window of each
    # distinct onset. Binary search over the sorted start times replaces the
    # previous O(onsets * notes) scan with O(n log n) for dense pieces.
    starts = [n.start for n in all_notes]
    onsets = sorted(set(round(s, 3) for s in starts))
    polyphonies = []
    for onset in onsets:
        # Strict bounds match the original |start - onset| < 0.03 test:
        # bisect_right excludes starts equal to onset - 0.03, bisect_left
        # excludes starts equal to onset + 0.03.
        lo = bisect_right(starts, onset - 0.03)
        hi = bisect_left(starts, onset + 0.03)
        polyphonies.append(hi - lo)
    avg_polyphony = np.mean(polyphonies) if polyphonies else 1

    if note_density > 8 or avg_polyphony > 3.5:
        complexity = 'complex'
    elif note_density > 4 or avg_polyphony > 2.5:
        complexity = 'moderate'
    else:
        complexity = 'simple'

    return {
        'note_density': note_density,
        'avg_polyphony': avg_polyphony,
        'complexity': complexity,
    }
|
| 1289 |
+
|
| 1290 |
+
|
| 1291 |
def optimize(original_audio_path, midi_path, output_path=None):
|
| 1292 |
"""Full optimization pipeline."""
|
| 1293 |
if output_path is None:
|
|
|
|
| 1318 |
total_notes = sum(len(inst.notes) for inst in midi_data.instruments)
|
| 1319 |
print(f" {total_notes} MIDI notes")
|
| 1320 |
|
| 1321 |
+
# Estimate complexity to adjust filter thresholds
|
| 1322 |
+
complexity_info = estimate_complexity(midi_data, audio_duration)
|
| 1323 |
+
complexity = complexity_info['complexity']
|
| 1324 |
+
print(f" Complexity: {complexity} (density={complexity_info['note_density']:.1f} n/s, "
|
| 1325 |
+
f"polyphony={complexity_info['avg_polyphony']:.1f})")
|
| 1326 |
+
|
| 1327 |
# Step 0: Remove notes in leading silence (mic rumble artifacts)
|
| 1328 |
print("\nStep 0: Removing notes in leading silence...")
|
| 1329 |
midi_data, silence_removed, music_start = remove_leading_silence_notes(midi_data, y, sr)
|
|
|
|
| 1400 |
# Step 6b: Remove spurious false-positive onsets
|
| 1401 |
print("\nStep 6b: Removing spurious onsets (false positive cleanup)...")
|
| 1402 |
midi_data, spurious_notes, spurious_onsets = remove_spurious_onsets(
|
| 1403 |
+
midi_data, y, sr, ref_onsets, hop_length, complexity=complexity
|
| 1404 |
)
|
| 1405 |
print(f" Removed {spurious_notes} notes across {spurious_onsets} spurious onsets")
|
| 1406 |
|
|
|
|
| 1456 |
)
|
| 1457 |
print(f" Recovered {notes_recovered} notes from CQT energy")
|
| 1458 |
|
| 1459 |
+
# Step 8f: Playability filter — limit per-onset chord size
|
| 1460 |
+
# Complex pieces get 5 notes/hand to preserve dense voicings
|
| 1461 |
+
max_hand = 5 if complexity == 'complex' else 4
|
| 1462 |
+
print(f"\nStep 8f: Playability filter (max {max_hand} notes per hand per chord)...")
|
| 1463 |
+
midi_data, playability_removed = limit_concurrent_notes(midi_data, max_per_hand=max_hand)
|
| 1464 |
print(f" Removed {playability_removed} excess chord notes")
|
| 1465 |
|
| 1466 |
+
# Step 8g: Limit total concurrent sounding notes
|
| 1467 |
+
print(f"\nStep 8g: Concurrent sounding limit (max {max_hand} per hand)...")
|
| 1468 |
+
midi_data, sustain_trimmed = limit_total_concurrent(midi_data, max_per_hand=max_hand)
|
| 1469 |
print(f" Trimmed {sustain_trimmed} sustained notes to reduce pileup")
|
| 1470 |
|
| 1471 |
# Final metrics
|