coda / verify.py
blackboxanalytics's picture
Rebuild CODA on Stable Audio 3 Small Music
e8b2f06
Raw
History Blame Contribute Delete
11.2 kB
"""verify.py — mathematical audio QA for CODA continuations (dev/test tool).
Not imported by the app at runtime. Given the user's original clip, the finished
track, and the splice boundary, it measures whether the generated region is real,
continuous music — the things an ear-check can miss between sessions:
duration · silence-collapse · clipping · loudness continuity · seam click ·
tempo continuity · key continuity · spectral rolloff · stereo width
Each metric returns a value, a pass/fail against a threshold, and a short reason.
`verify(...)` returns a structured report; `plot_report(...)` writes a diagnostic
PNG (waveform + mel-spectrogram, each with the seam marked, + a pass/fail table).
CLI: python verify.py <original> <finished> <source_seconds> [out.png]
"""
import sys
import librosa
import numpy as np
from analyze import _key_from_audio, _scalar_tempo
# Sample rate (Hz) passed to librosa when the finished track is loaded.
SR = 44_100

# Pass/fail thresholds (plan §4).

# Window length, in seconds, used by the silence-collapse scan.
SILENCE_WINDOW_S = 2.0
# Every scanned window must exceed this fraction of the source RMS.
SILENCE_MIN_RATIO = 0.3
# Fewer than 0.01% of samples may sit at full scale.
CLIP_MAX_FRACTION = 0.0001
# The RMS step across the seam must stay within +/- this many dB.
LOUDNESS_TOL_DB = 6.0
# The largest seam jump must stay below this multiple of the 99.9th-pct
# local delta.
SEAM_JUMP_FACTOR = 6.0
# Generated tempo must be within +/-8% of the source tempo.
TEMPO_TOL = 0.08
# Rolloff of the new region must be at least this fraction of the source's.
ROLLOFF_MIN_RATIO = 0.7
# Accepted L/R correlation band: not mono-collapsed, not decorrelated noise.
STEREO_CORR_RANGE = (0.1, 0.98)
def _mono(y):
return y.mean(axis=0) if y.ndim == 2 else y
def _rms(x):
x = np.asarray(x, dtype=np.float64)
return float(np.sqrt(np.mean(x ** 2)) + 1e-12)
def _db(ratio):
return 20.0 * np.log10(max(ratio, 1e-12))
def _relative_keys(key):
"""key string + its relative major/minor, for continuity matching."""
names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
try:
root, mode = key.split()
except ValueError:
return {key}
i = names.index(root)
out = {key}
if mode == 'major':
out.add(f'{names[(i + 9) % 12]} minor') # relative minor
else:
out.add(f'{names[(i + 3) % 12]} major') # relative major
return out
def verify(original_path, finished_path, source_seconds, fade_seconds=4.0):
    """Run the audio QA metrics on a finished track and return a report.

    Args:
        original_path: Path of the user's original clip. Accepted for
            interface/CLI compatibility but not read here: the "source"
            region is the first `source_seconds` of the finished track.
        finished_path: Path of the finished track. Loaded with librosa at
            `SR` with channels preserved; a mono file is duplicated into
            two identical channels.
        source_seconds: Splice boundary in seconds. It is converted to a
            sample index and clamped into the finished track.
        fade_seconds: Length of the stitch's closing fade. The final
            `fade_seconds + 1.0` seconds of the new region are left out of
            the silence-collapse scan so the intentional ending isn't
            flagged.

    Returns:
        A dict with:
          * ``metrics``: ``{name: {'value', 'pass', 'reason'}}`` for
            duration, no_silence_collapse, no_clipping,
            loudness_continuity, no_seam_click, tempo_continuity,
            key_continuity, spectral_rolloff and stereo_width. Every
            ``pass`` is a built-in ``bool``.
          * ``passed``: True only if every metric passed.
          * ``finished``: the loaded (channels, samples) array, for plotting.
          * ``boundary``: the seam position as a sample index.
          * ``source_seconds`` and ``total_seconds``.
    """
    fin, _ = librosa.load(finished_path, sr=SR, mono=False)
    if fin.ndim == 1:
        fin = np.stack([fin, fin])
    fin_m = _mono(fin)
    total_s = fin.shape[-1] / SR
    boundary = int(round(source_seconds * SR))
    boundary = max(0, min(boundary, fin.shape[-1] - 1))
    src_region = fin_m[:boundary]
    new_region = fin_m[boundary:]
    # With no source region (boundary 0) fall back to the whole track's RMS.
    src_rms = _rms(src_region) if len(src_region) else _rms(fin_m)
    metrics = {}

    # 1. duration sane
    metrics['duration'] = {
        'value': round(total_s, 2),
        'pass': bool(total_s > source_seconds + 1.0),
        'reason': f'{total_s:.1f}s total, source {source_seconds:.1f}s',
    }

    # 2. silence collapse — every window in the new region carries energy.
    # exclude the intentional closing fade (plus one second of margin) so a
    # real ending isn't read as a collapse.
    win = int(SILENCE_WINDOW_S * SR)
    body = new_region[:max(0, len(new_region) - int((fade_seconds + 1.0) * SR))]
    worst = 1.0
    if len(body) >= win:
        # non-overlapping windows; a trailing partial window is not scanned
        ratios = [_rms(body[i:i + win]) / src_rms
                  for i in range(0, len(body) - win + 1, win)]
        worst = min(ratios) if ratios else 1.0
    metrics['no_silence_collapse'] = {
        'value': round(worst, 3),
        'pass': bool(worst > SILENCE_MIN_RATIO),
        'reason': f'quietest 2s window = {worst:.2f}x source RMS '
                  f'(need > {SILENCE_MIN_RATIO})',
    }

    # 3. clipping
    clip_frac = float(np.mean(np.abs(fin) >= 0.999))
    metrics['no_clipping'] = {
        'value': clip_frac,
        'pass': bool(clip_frac < CLIP_MAX_FRACTION),
        'reason': f'{clip_frac*100:.4f}% samples at full scale',
    }

    # 4. loudness continuity across the seam (3 s either side)
    w = int(3.0 * SR)
    pre = fin_m[max(0, boundary - w):boundary]
    post = fin_m[boundary:boundary + w]
    step_db = _db(_rms(post) / _rms(pre)) if len(pre) and len(post) else 0.0
    metrics['loudness_continuity'] = {
        'value': round(step_db, 2),
        'pass': bool(abs(step_db) < LOUDNESS_TOL_DB),
        'reason': f'seam RMS step {step_db:+.1f} dB '
                  f'(tol +/-{LOUDNESS_TOL_DB:.0f})',
    }

    # 5. seam discontinuity — no audible click at the splice point. reference
    # the seam's largest sample step against a HIGH QUANTILE of the surrounding
    # |delta| distribution, not its std: on percussive/bright music the biggest
    # single delta is heavy-tailed and dwarfs the std with no click present, so
    # an std-based gate false-positives on drums/electronic continuations.
    g = int(0.05 * SR)
    seg = fin_m[max(0, boundary - g):boundary + g]
    if len(seg) > 4:
        diffs = np.abs(np.diff(seg))
        local = np.abs(np.diff(fin_m[max(0, boundary - SR):boundary + SR]))
        local_ref = float(np.quantile(local, 0.999) + 1e-9) if len(local) else 1.0
        jump = float(diffs.max()) / local_ref
    else:
        jump = 0.0
    metrics['no_seam_click'] = {
        'value': round(jump, 2),
        'pass': bool(jump < SEAM_JUMP_FACTOR),
        'reason': f'max seam jump {jump:.1f}x the 99.9th-pct local delta '
                  f'(need < {SEAM_JUMP_FACTOR})',
    }

    # 6/7. tempo + key continuity (source region vs new region). regions of
    # one second or less are not analysed on their own: the source falls back
    # to the whole track and the new region inherits the source's result.
    src_for_analysis = src_region if len(src_region) > SR else fin_m
    src_tempo = _scalar_tempo(librosa.beat.beat_track(y=src_for_analysis, sr=SR)[0])
    if len(new_region) > SR:
        new_tempo = _scalar_tempo(librosa.beat.beat_track(y=new_region, sr=SR)[0])
    else:
        new_tempo = src_tempo
    tempo_dev = abs(new_tempo - src_tempo) / max(src_tempo, 1e-6)
    # allow half/double-time relationship (common + musically valid)
    half_double = min(tempo_dev,
                      abs(new_tempo - 2 * src_tempo) / max(2 * src_tempo, 1e-6),
                      abs(new_tempo - 0.5 * src_tempo) / max(0.5 * src_tempo, 1e-6))
    metrics['tempo_continuity'] = {
        'value': f'{src_tempo:.0f}->{new_tempo:.0f} bpm',
        'pass': bool(half_double < TEMPO_TOL),
        'reason': f'{half_double*100:.1f}% deviation (tol {TEMPO_TOL*100:.0f}%, '
                  f'half/double allowed)',
    }
    src_key = _key_from_audio(src_for_analysis, SR)
    new_key = _key_from_audio(new_region, SR) if len(new_region) > SR else src_key
    metrics['key_continuity'] = {
        'value': f'{src_key} -> {new_key}',
        'pass': bool(new_key in _relative_keys(src_key)),
        'reason': f'generated key {new_key} vs source {src_key} '
                  f'(relative maj/min ok)',
    }

    # 8. spectral rolloff — confirm the tail isn't band-limited
    src_roll = float(np.mean(librosa.feature.spectral_rolloff(
        y=src_for_analysis, sr=SR))) + 1e-6
    new_roll = float(np.mean(librosa.feature.spectral_rolloff(
        y=new_region, sr=SR))) if len(new_region) > SR else src_roll
    metrics['spectral_rolloff'] = {
        'value': f'{src_roll/1000:.1f}->{new_roll/1000:.1f} kHz',
        'pass': bool(new_roll >= ROLLOFF_MIN_RATIO * src_roll),
        'reason': f'new {new_roll/1000:.1f}kHz vs source {src_roll/1000:.1f}kHz '
                  f'(need >= {ROLLOFF_MIN_RATIO:.0%})',
    }

    # 9. stereo width in the new region. corr stays at exactly 1.0 (treated
    # as intentional mono) when the region is too short, a channel is
    # near-silent, or the two channels are sample-identical.
    corr = 1.0
    if fin.shape[0] == 2 and fin.shape[-1] - boundary > SR:
        left, right = fin[0, boundary:], fin[1, boundary:]
        if np.array_equal(left, right):
            # Identical channels (e.g. a mono file duplicated above). Do not
            # trust corrcoef to return exactly 1.0 here: rounding can give
            # 0.9999999999999999, which would fail the `corr == 1.0` escape.
            corr = 1.0
        elif np.std(left) > 1e-6 and np.std(right) > 1e-6:
            corr = float(np.corrcoef(left, right)[0, 1])
    lo, hi = STEREO_CORR_RANGE
    metrics['stereo_width'] = {
        'value': round(corr, 3),
        'pass': bool(lo <= corr <= hi or corr == 1.0),
        'reason': f'L/R correlation {corr:.2f} (target {lo}-{hi}; '
                  f'1.0 = intentional mono)',
    }

    passed = all(m['pass'] for m in metrics.values())
    return {
        'metrics': metrics,
        'passed': passed,
        'finished': fin,
        'boundary': boundary,
        'source_seconds': source_seconds,
        'total_seconds': total_s,
    }
def print_report(report):
    """Print a report to stdout: one tagged line per metric, then the verdict.

    Reads ``report['metrics']`` (each entry's ``'pass'`` and ``'reason'``)
    and ``report['passed']``. Metric names are left-aligned to the longest
    name so the reasons line up in a column.
    """
    print("\n==== CODA VERIFY ====")
    metrics = report['metrics']
    width = max(map(len, metrics))
    for name, entry in metrics.items():
        if entry['pass']:
            tag = 'PASS'
        else:
            tag = 'FAIL'
        print(f" [{tag}] {name:<{width}} {entry['reason']}")
    verdict = 'PASS' if report['passed'] else 'FAIL'
    print(f" ----\n OVERALL: {verdict}\n")
def plot_report(report, out_png):
    """Render a report dict as a three-panel diagnostic PNG at `out_png`.

    Panels, top to bottom: the mono waveform with the seam marked, a
    128-band mel-spectrogram with the seam marked, and a table listing
    each metric's PASS/FAIL result and value. Reads ``report['finished']``,
    ``report['boundary']``, ``report['metrics']`` and ``report['passed']``,
    and prints the output path once the file is saved.
    """
    import matplotlib
    matplotlib.use("Agg")  # select the Agg backend before pyplot is imported
    import matplotlib.pyplot as plt
    import librosa.display  # submodule isn't pulled in by `import librosa`

    # Palette shared by the panels.
    page_color = "#0b0d10"
    panel_color = "#14171c"
    edge_color = "#2a2f37"
    text_color = "#e6e9ef"
    seam_color = "#ffb347"

    mono = _mono(report['finished'])
    seam_s = report['boundary'] / SR
    times = np.arange(mono.shape[-1]) / SR

    fig, axes = plt.subplots(3, 1, figsize=(12, 9), facecolor=page_color)
    wave_ax, spec_ax, table_ax = axes
    for ax in axes:
        ax.set_facecolor(panel_color)
        ax.tick_params(colors="#9aa4b2")
        for spine in ax.spines.values():
            spine.set_color(edge_color)

    # Panel 1: waveform with the seam position.
    wave_ax.plot(times, mono, color="#39d0d8", lw=0.4)
    wave_ax.axvline(seam_s, color=seam_color, lw=1.5, label="seam")
    wave_ax.set_title("Finished waveform (seam marked)", color=text_color)
    wave_ax.legend(facecolor=panel_color, labelcolor=text_color)

    # Panel 2: mel-spectrogram in dB relative to its own peak.
    mel = librosa.feature.melspectrogram(y=mono, sr=SR, n_mels=128)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    librosa.display.specshow(mel_db, sr=SR, x_axis="time", y_axis="mel",
                             ax=spec_ax)
    spec_ax.axvline(seam_s, color=seam_color, lw=1.5)
    spec_ax.set_title("Mel-spectrogram", color=text_color)

    # Panel 3: pass/fail table, one row per metric in report order.
    metrics = report['metrics']
    rows = []
    row_ok = []
    for name, entry in metrics.items():
        rows.append([name, "PASS" if entry['pass'] else "FAIL",
                     str(entry['value'])])
        row_ok.append(entry['pass'])
    table_ax.axis("off")
    table = table_ax.table(cellText=rows,
                           colLabels=["metric", "result", "value"],
                           loc="center", cellLoc="left")
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    for (row, _col), cell in table.get_celld().items():
        cell.set_edgecolor(edge_color)
        if row == 0:
            # Row 0 is the column-label header.
            cell.set_facecolor("#22272e")
            cell.set_text_props(color=text_color)
        else:
            # Table row r corresponds to rows[r - 1]; colour it by result.
            cell.set_facecolor(panel_color)
            cell.set_text_props(
                color="#5fd38d" if row_ok[row - 1] else "#ff6b6b")

    fig.suptitle(f"CODA verify — {'PASS' if report['passed'] else 'FAIL'}",
                 color=text_color, fontsize=14)
    fig.tight_layout()
    fig.savefig(out_png, dpi=110, facecolor=page_color)
    plt.close(fig)
    print(f" wrote {out_png}")
if __name__ == "__main__":
    # CLI entry: verify.py <original> <finished> <source_seconds> [out.png]
    # Exit status: 1 on bad usage, 0 when every metric passes, 2 otherwise.
    cli_args = sys.argv[1:]
    if len(cli_args) < 3:
        print("usage: python verify.py <original> <finished> "
              "<source_seconds> [out.png]")
        sys.exit(1)
    original_arg, finished_arg, seconds_arg = cli_args[:3]
    rep = verify(original_arg, finished_arg, float(seconds_arg))
    print_report(rep)
    if len(cli_args) > 3:
        # The optional fourth argument is the PNG path for the diagnostic plot.
        plot_report(rep, cli_args[3])
    if rep['passed']:
        sys.exit(0)
    sys.exit(2)