Spaces:

build-small-hackathon
/

coda

Running on Zero

App Files Files Community

coda / verify.py

blackboxanalytics

Rebuild CODA on Stable Audio 3 Small Music

e8b2f06 22 days ago

Raw

History Blame Contribute Delete

11.2 kB

	"""verify.py — mathematical audio QA for CODA continuations (dev/test tool).

	Not imported by the app at runtime. Given the user's original clip, the finished
	track, and the splice boundary, it measures whether the generated region is real,
	continuous music — the things an ear-check can miss between sessions:

	duration · silence-collapse · clipping · loudness continuity · seam click ·
	tempo continuity · key continuity · spectral rolloff · stereo width

	Each metric returns a value, a pass/fail against a threshold, and a short reason.
	`verify(...)` returns a structured report; `plot_report(...)` writes a diagnostic
	PNG (waveform + mel-spectrogram, both with the seam marked, + a pass/fail table).

	CLI: python verify.py <original> <finished> <source_seconds> [out.png]
	"""
	import sys

	import librosa
	import numpy as np

	from analyze import _key_from_audio, _scalar_tempo

	# Sample rate (Hz) passed to librosa.load; every seconds-to-samples
	# conversion in this module uses it.
	SR = 44100

	# thresholds (plan §4)
	# length (s) of each window in the silence-collapse scan
	SILENCE_WINDOW_S = 2.0
	SILENCE_MIN_RATIO = 0.30 # every 2s window > 0.3x source RMS
	CLIP_MAX_FRACTION = 1e-4 # < 0.01% of samples at full scale
	LOUDNESS_TOL_DB = 6.0 # seam RMS step within +/- 6 dB
	SEAM_JUMP_FACTOR = 6.0 # seam jump < 6x the 99.9th-pct local delta
	TEMPO_TOL = 0.08 # generated tempo within +/-8% of source
	ROLLOFF_MIN_RATIO = 0.70 # new rolloff >= 0.7x source
	# inclusive (low, high) bounds on the L/R correlation of the new region
	STEREO_CORR_RANGE = (0.10, 0.98) # not mono-collapsed, not decorrelated noise


	def _mono(y):
	    """Collapse a 2-D (channels x samples) array to mono by averaging axis 0.

	    Arrays that are not 2-D are returned unchanged (same object).
	    """
	    if y.ndim == 2:
	        return y.mean(axis=0)
	    return y


	def _rms(x):
	    """Return the root-mean-square of `x` as a Python float.

	    The input is converted to float64 first, and 1e-12 is added to the result
	    so callers can divide by it without hitting zero.
	    """
	    samples = np.asarray(x, dtype=np.float64)
	    mean_square = np.mean(samples ** 2)
	    return float(np.sqrt(mean_square) + 1e-12)


	def _db(ratio):
	    """Convert a linear amplitude ratio to decibels (20 * log10).

	    The ratio is floored at 1e-12 so zero or negative input yields -240 dB
	    instead of -inf / nan.
	    """
	    floored = max(ratio, 1e-12)
	    return np.log10(floored) * 20.0


	def _relative_keys(key):
	    """Return `key` together with its relative major/minor, for continuity matching.

	    `key` is expected to look like ``"<root> <mode>"`` with the root spelled
	    with sharps (``C``, ``C#``, ... ``B``). A mode of ``major`` adds the
	    relative minor; any other mode is treated as minor and adds the relative
	    major.

	    If `key` does not split into exactly two words, or its root is not one of
	    the twelve sharp-spelled names (e.g. a flat spelling such as ``Db``), only
	    an exact match is possible, so ``{key}`` is returned instead of raising.
	    """
	    names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
	    try:
	        root, mode = key.split()
	    except ValueError:
	        return {key}
	    if root not in names:
	        # unknown spelling: names.index() would raise ValueError, so fall
	        # back to exact-match only rather than crash the whole report
	        return {key}
	    i = names.index(root)
	    out = {key}
	    if mode == 'major':
	        out.add(f'{names[(i + 9) % 12]} minor')  # relative minor
	    else:
	        out.add(f'{names[(i + 3) % 12]} major')  # relative major
	    return out


	def verify(original_path, finished_path, source_seconds, fade_seconds=4.0):
	    """Measure a finished CODA track and return a structured QA report.

	    Args:
	        original_path: path of the user's original clip. Kept for interface
	            compatibility with the CLI; this function does not read it — the
	            source region is taken from the first `source_seconds` of the
	            finished track.
	        finished_path: path of the finished (source + continuation) track.
	        source_seconds: splice boundary, in seconds from the start of the
	            finished track.
	        fade_seconds: length of the stitch closing fade. The last
	            `fade_seconds + 1.0` seconds are excluded from the
	            silence-collapse scan so the intentional ending isn't flagged.

	    Returns:
	        dict with `metrics` ({name: {value, pass, reason}}), `passed` (True
	        only if every metric passed), plus `finished` (channels x samples
	        array), `boundary` (seam sample index), `source_seconds` and
	        `total_seconds` for plotting.
	    """
	    fin, _ = librosa.load(finished_path, sr=SR, mono=False)
	    if fin.ndim == 1:
	        # mono file: duplicate the channel so the rest can assume 2-D audio
	        fin = np.stack([fin, fin])
	    fin_m = _mono(fin)
	    total_s = fin.shape[-1] / SR

	    # seam position in samples, clamped to lie inside the finished track
	    boundary = int(round(source_seconds * SR))
	    boundary = max(0, min(boundary, fin.shape[-1] - 1))
	    src_region = fin_m[:boundary]
	    new_region = fin_m[boundary:]

	    src_rms = _rms(src_region) if len(src_region) else _rms(fin_m)
	    metrics = {}

	    # 1. duration sane
	    metrics['duration'] = {
	        'value': round(total_s, 2),
	        'pass': total_s > source_seconds + 1.0,
	        'reason': f'{total_s:.1f}s total, source {source_seconds:.1f}s',
	    }

	    # 2. silence collapse — every window in the new region carries energy.
	    # exclude the intentional closing fade (last fade_seconds, plus a 1 s
	    # margin) so a real ending isn't read as a collapse.
	    win = int(SILENCE_WINDOW_S * SR)
	    body = new_region[:max(0, len(new_region) - int((fade_seconds + 1.0) * SR))]
	    worst = 1.0
	    if len(body) >= win:
	        # non-overlapping windows; a trailing partial window is not scanned
	        ratios = [_rms(body[i:i + win]) / src_rms
	                  for i in range(0, len(body) - win + 1, win)]
	        worst = min(ratios) if ratios else 1.0
	    metrics['no_silence_collapse'] = {
	        'value': round(worst, 3),
	        'pass': worst > SILENCE_MIN_RATIO,
	        'reason': f'quietest 2s window = {worst:.2f}x source RMS '
	                  f'(need > {SILENCE_MIN_RATIO})',
	    }

	    # 3. clipping
	    clip_frac = float(np.mean(np.abs(fin) >= 0.999))
	    metrics['no_clipping'] = {
	        'value': clip_frac,
	        'pass': clip_frac < CLIP_MAX_FRACTION,
	        'reason': f'{clip_frac*100:.4f}% samples at full scale',
	    }

	    # 4. loudness continuity across the seam (3 s either side)
	    w = int(3.0 * SR)
	    pre = fin_m[max(0, boundary - w):boundary]
	    post = fin_m[boundary:boundary + w]
	    step_db = _db(_rms(post) / _rms(pre)) if len(pre) and len(post) else 0.0
	    metrics['loudness_continuity'] = {
	        'value': round(step_db, 2),
	        'pass': abs(step_db) < LOUDNESS_TOL_DB,
	        'reason': f'seam RMS step {step_db:+.1f} dB '
	                  f'(tol +/-{LOUDNESS_TOL_DB:.0f})',
	    }

	    # 5. seam discontinuity — no audible click at the splice point. reference
	    # the seam's largest sample step against a HIGH QUANTILE of the surrounding
	    # |delta| distribution, not its std: on percussive/bright music the biggest
	    # single delta is heavy-tailed and dwarfs the std with no click present, so
	    # an std-based gate false-positives on drums/electronic continuations.
	    g = int(0.05 * SR)
	    seg = fin_m[max(0, boundary - g):boundary + g]
	    if len(seg) > 4:
	        diffs = np.abs(np.diff(seg))
	        local = np.abs(np.diff(fin_m[max(0, boundary - SR):boundary + SR]))
	        local_ref = float(np.quantile(local, 0.999) + 1e-9) if len(local) else 1.0
	        jump = float(diffs.max()) / local_ref
	    else:
	        jump = 0.0
	    metrics['no_seam_click'] = {
	        'value': round(jump, 2),
	        'pass': jump < SEAM_JUMP_FACTOR,
	        'reason': f'max seam jump {jump:.1f}x the 99.9th-pct local delta '
	                  f'(need < {SEAM_JUMP_FACTOR})',
	    }

	    # 6/7. tempo + key continuity (source region vs new region).
	    # a source region of <= 1 s is too short to analyse, so the whole track
	    # stands in for it; a new region of <= 1 s inherits the source values.
	    src_for_analysis = src_region if len(src_region) > SR else fin_m
	    src_tempo = _scalar_tempo(librosa.beat.beat_track(y=src_for_analysis, sr=SR)[0])
	    if len(new_region) > SR:
	        new_tempo = _scalar_tempo(librosa.beat.beat_track(y=new_region, sr=SR)[0])
	    else:
	        new_tempo = src_tempo
	    tempo_dev = abs(new_tempo - src_tempo) / max(src_tempo, 1e-6)
	    # allow half/double-time relationship (common + musically valid)
	    half_double = min(tempo_dev,
	                      abs(new_tempo - 2 * src_tempo) / max(2 * src_tempo, 1e-6),
	                      abs(new_tempo - 0.5 * src_tempo) / max(0.5 * src_tempo, 1e-6))
	    metrics['tempo_continuity'] = {
	        'value': f'{src_tempo:.0f}->{new_tempo:.0f} bpm',
	        'pass': half_double < TEMPO_TOL,
	        # fractions are scaled to percent for display only
	        'reason': f'{half_double * 100:.1f}% deviation (tol {TEMPO_TOL * 100:.0f}%, '
	                  f'half/double allowed)',
	    }

	    src_key = _key_from_audio(src_for_analysis, SR)
	    new_key = _key_from_audio(new_region, SR) if len(new_region) > SR else src_key
	    metrics['key_continuity'] = {
	        'value': f'{src_key} -> {new_key}',
	        'pass': new_key in _relative_keys(src_key),
	        'reason': f'generated key {new_key} vs source {src_key} '
	                  f'(relative maj/min ok)',
	    }

	    # 8. spectral rolloff — confirm the tail isn't band-limited
	    src_roll = float(np.mean(librosa.feature.spectral_rolloff(
	        y=src_for_analysis, sr=SR))) + 1e-6
	    if len(new_region) > SR:
	        new_roll = float(np.mean(librosa.feature.spectral_rolloff(
	            y=new_region, sr=SR)))
	    else:
	        new_roll = src_roll
	    metrics['spectral_rolloff'] = {
	        'value': f'{src_roll/1000:.1f}->{new_roll/1000:.1f} kHz',
	        'pass': new_roll >= ROLLOFF_MIN_RATIO * src_roll,
	        'reason': f'new {new_roll/1000:.1f}kHz vs source {src_roll/1000:.1f}kHz '
	                  f'(need >= {ROLLOFF_MIN_RATIO:.0%})',
	    }

	    # 9. stereo width in the new region
	    if fin.shape[0] == 2 and fin.shape[-1] - boundary > SR:
	        left, right = fin[0, boundary:], fin[1, boundary:]
	        if np.array_equal(left, right):
	            # identical channels are intentional mono. Set exactly 1.0 here:
	            # np.corrcoef can return 0.9999... for identical inputs, which
	            # would miss the `corr == 1.0` exemption below and fail the gate.
	            corr = 1.0
	        elif np.std(left) > 1e-6 and np.std(right) > 1e-6:
	            corr = float(np.corrcoef(left, right)[0, 1])
	        else:
	            # a (near-)silent channel has no meaningful correlation
	            corr = 1.0
	    else:
	        corr = 1.0
	    lo, hi = STEREO_CORR_RANGE
	    metrics['stereo_width'] = {
	        'value': round(corr, 3),
	        'pass': lo <= corr <= hi or corr == 1.0,
	        'reason': f'L/R correlation {corr:.2f} (target {lo}-{hi}; '
	                  f'1.0 = intentional mono)',
	    }

	    passed = all(m['pass'] for m in metrics.values())
	    return {
	        'metrics': metrics,
	        'passed': passed,
	        'finished': fin,
	        'boundary': boundary,
	        'source_seconds': source_seconds,
	        'total_seconds': total_s,
	    }


	def print_report(report):
	    """Print a human-readable PASS/FAIL summary of a report to stdout.

	    `report` is a dict with a `metrics` mapping ({name: {pass, reason, ...}})
	    and a `passed` flag. One line is printed per metric, with names
	    left-aligned to the longest name, followed by the overall verdict.
	    """
	    metrics = report['metrics']
	    print("\n==== CODA VERIFY ====")
	    # default=0 keeps an empty metrics mapping from raising ValueError in max()
	    width = max((len(name) for name in metrics), default=0)
	    for name, m in metrics.items():
	        tag = 'PASS' if m['pass'] else 'FAIL'
	        print(f" [{tag}] {name:<{width}} {m['reason']}")
	    print(f" ----\n OVERALL: {'PASS' if report['passed'] else 'FAIL'}\n")


	def plot_report(report, out_png):
	    """Write a three-panel diagnostic PNG for a report to `out_png`.

	    Panels, top to bottom: the mono waveform with the seam marked, a
	    mel-spectrogram with the seam marked, and a table of each metric's
	    PASS/FAIL result and value. Reads `finished`, `boundary`, `metrics` and
	    `passed` from `report`.

	    matplotlib is imported here, with the non-interactive "Agg" backend
	    selected before pyplot is imported, so merely importing this module does
	    not require matplotlib.
	    """
	    import matplotlib
	    matplotlib.use("Agg")
	    import matplotlib.pyplot as plt
	    import librosa.display  # submodule isn't pulled in by `import librosa`

	    fin = report['finished']
	    fin_m = _mono(fin)
	    b = report['boundary']
	    sr = SR
	    t = np.arange(fin_m.shape[-1]) / sr

	    fig, axes = plt.subplots(3, 1, figsize=(12, 9), facecolor="#0b0d10")
	    try:
	        for ax in axes:
	            ax.set_facecolor("#14171c")
	            ax.tick_params(colors="#9aa4b2")
	            for s in ax.spines.values():
	                s.set_color("#2a2f37")

	        axes[0].plot(t, fin_m, color="#39d0d8", lw=0.4)
	        axes[0].axvline(b / sr, color="#ffb347", lw=1.5, label="seam")
	        axes[0].set_title("Finished waveform (seam marked)", color="#e6e9ef")
	        axes[0].legend(facecolor="#14171c", labelcolor="#e6e9ef")

	        S = librosa.feature.melspectrogram(y=fin_m, sr=sr, n_mels=128)
	        Sdb = librosa.power_to_db(S, ref=np.max)
	        librosa.display.specshow(Sdb, sr=sr, x_axis="time", y_axis="mel", ax=axes[1])
	        axes[1].axvline(b / sr, color="#ffb347", lw=1.5)
	        axes[1].set_title("Mel-spectrogram", color="#e6e9ef")

	        rows = [[k, ("PASS" if m['pass'] else "FAIL"), str(m['value'])]
	                for k, m in report['metrics'].items()]
	        axes[2].axis("off")
	        tbl = axes[2].table(cellText=rows,
	                            colLabels=["metric", "result", "value"],
	                            loc="center", cellLoc="left")
	        tbl.auto_set_font_size(False)
	        tbl.set_fontsize(9)
	        for (r, c), cell in tbl.get_celld().items():
	            cell.set_edgecolor("#2a2f37")
	            if r == 0:
	                # header row
	                cell.set_facecolor("#22272e")
	                cell.set_text_props(color="#e6e9ef")
	            else:
	                # table row r maps to rows[r - 1]; colour by that metric's result
	                ok = report['metrics'][rows[r - 1][0]]['pass']
	                cell.set_facecolor("#14171c")
	                cell.set_text_props(color="#5fd38d" if ok else "#ff6b6b")
	        fig.suptitle(f"CODA verify — {'PASS' if report['passed'] else 'FAIL'}",
	                     color="#e6e9ef", fontsize=14)
	        fig.tight_layout()
	        fig.savefig(out_png, dpi=110, facecolor="#0b0d10")
	    finally:
	        # release the figure even when rendering or saving raises, so
	        # repeated calls don't accumulate open figures
	        plt.close(fig)
	    print(f" wrote {out_png}")


	if __name__ == "__main__":
	    # CLI entry: verify.py <original> <finished> <source_seconds> [out.png]
	    # exit status: 0 = all metrics passed, 2 = at least one failed, 1 = bad usage
	    cli_args = sys.argv[1:]
	    if len(cli_args) < 3:
	        print("usage: python verify.py <original> <finished> "
	              "<source_seconds> [out.png]")
	        sys.exit(1)
	    original_arg, finished_arg, seconds_arg = cli_args[:3]
	    report = verify(original_arg, finished_arg, float(seconds_arg))
	    print_report(report)
	    if len(cli_args) > 3:
	        # the optional fourth argument is the PNG output path
	        plot_report(report, cli_args[3])
	    sys.exit(0 if report['passed'] else 2)