Spaces:
Running on Zero
Running on Zero
| """verify.py — mathematical audio QA for CODA continuations (dev/test tool). | |
| Not imported by the app at runtime. Given the user's original clip, the finished | |
| track, and the splice boundary, it measures whether the generated region is real, | |
| continuous music — the things an ear-check can miss between sessions: | |
| duration · silence-collapse · clipping · loudness continuity · seam click · | |
| tempo continuity · key continuity · spectral rolloff · stereo width | |
| Each metric returns a value, a pass/fail against a threshold, and a short reason. | |
| `verify(...)` returns a structured report; `plot_report(...)` writes a diagnostic | |
| PNG (waveform + mel-spectrogram + seam-RMS overlay + a pass/fail table). | |
| CLI: python verify.py <original> <finished> <source_seconds> [out.png] | |
| """ | |
| import sys | |
| import librosa | |
| import numpy as np | |
| from analyze import _key_from_audio, _scalar_tempo | |
SR = 44100  # sample rate (Hz) passed to librosa.load; all sample counts below derive from it
# thresholds (plan §4)
SILENCE_WINDOW_S = 2.0  # length (s) of each window scanned for silence collapse
SILENCE_MIN_RATIO = 0.30  # every 2s window > 0.3x source RMS
CLIP_MAX_FRACTION = 1e-4  # < 0.01% of samples at full scale (|x| >= 0.999)
LOUDNESS_TOL_DB = 6.0  # seam RMS step within +/- 6 dB (3 s either side of the seam)
SEAM_JUMP_FACTOR = 6.0  # seam jump < 6x the 99.9th-pct local delta
TEMPO_TOL = 0.08  # generated tempo within +/-8% of source (half/double time also accepted)
ROLLOFF_MIN_RATIO = 0.70  # new rolloff >= 0.7x source
STEREO_CORR_RANGE = (0.10, 0.98)  # not mono-collapsed, not decorrelated noise
| def _mono(y): | |
| return y.mean(axis=0) if y.ndim == 2 else y | |
| def _rms(x): | |
| x = np.asarray(x, dtype=np.float64) | |
| return float(np.sqrt(np.mean(x ** 2)) + 1e-12) | |
| def _db(ratio): | |
| return 20.0 * np.log10(max(ratio, 1e-12)) | |
| def _relative_keys(key): | |
| """key string + its relative major/minor, for continuity matching.""" | |
| names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'] | |
| try: | |
| root, mode = key.split() | |
| except ValueError: | |
| return {key} | |
| i = names.index(root) | |
| out = {key} | |
| if mode == 'major': | |
| out.add(f'{names[(i + 9) % 12]} minor') # relative minor | |
| else: | |
| out.add(f'{names[(i + 3) % 12]} major') # relative major | |
| return out | |
def verify(original_path, finished_path, source_seconds, fade_seconds=4.0):
    """Run the QA metrics on a finished track and return a report dict.

    Parameters:
        original_path: path of the user's original clip. NOTE(review): this
            argument is accepted but never read here -- the "source" region is
            taken from the first `source_seconds` of the finished track.
        finished_path: path of the finished track, loaded at SR with channels
            preserved.
        source_seconds: position of the splice boundary, in seconds.
        fade_seconds: length of the stitch closing fade; that tail (plus one
            extra second) is excluded from the silence-collapse scan so the
            intentional ending isn't flagged.

    Returns:
        {'metrics': {name: {'value', 'pass', 'reason'}}, 'passed': bool,
         'finished': the loaded (2, n) audio array, 'boundary': splice index
         in samples, 'source_seconds': ..., 'total_seconds': ...}.
        Every 'pass' entry is a plain Python bool.
    """
    fin, _ = librosa.load(finished_path, sr=SR, mono=False)
    # Remember whether the file itself was mono: once it is duplicated into
    # two identical channels the stereo metric can no longer tell.
    mono_input = fin.ndim == 1
    if mono_input:
        fin = np.stack([fin, fin])
    fin_m = _mono(fin)
    total_s = fin.shape[-1] / SR
    boundary = int(round(source_seconds * SR))
    boundary = max(0, min(boundary, fin.shape[-1] - 1))
    src_region = fin_m[:boundary]
    new_region = fin_m[boundary:]
    src_rms = _rms(src_region) if len(src_region) else _rms(fin_m)
    metrics = {}

    # 1. duration sane
    metrics['duration'] = {
        'value': round(total_s, 2),
        'pass': bool(total_s > source_seconds + 1.0),
        'reason': f'{total_s:.1f}s total, source {source_seconds:.1f}s',
    }

    # 2. silence collapse — every window in the new region carries energy.
    # exclude the intentional closing fade (last fade_seconds) so a real ending
    # isn't read as a collapse.
    win = int(SILENCE_WINDOW_S * SR)
    body = new_region[:max(0, len(new_region) - int((fade_seconds + 1.0) * SR))]
    worst = 1.0  # stays at 1.0 (pass) when the body is shorter than one window
    if len(body) >= win:
        ratios = [_rms(body[i:i + win]) / src_rms
                  for i in range(0, len(body) - win + 1, win)]
        worst = min(ratios) if ratios else 1.0
    metrics['no_silence_collapse'] = {
        'value': round(worst, 3),
        'pass': bool(worst > SILENCE_MIN_RATIO),
        'reason': f'quietest {SILENCE_WINDOW_S:g}s window = {worst:.2f}x source RMS '
                  f'(need > {SILENCE_MIN_RATIO})',
    }

    # 3. clipping
    clip_frac = float(np.mean(np.abs(fin) >= 0.999))
    metrics['no_clipping'] = {
        'value': clip_frac,
        'pass': bool(clip_frac < CLIP_MAX_FRACTION),
        'reason': f'{clip_frac*100:.4f}% samples at full scale',
    }

    # 4. loudness continuity across the seam
    w = int(3.0 * SR)
    pre = fin_m[max(0, boundary - w):boundary]
    post = fin_m[boundary:boundary + w]
    step_db = _db(_rms(post) / _rms(pre)) if len(pre) and len(post) else 0.0
    metrics['loudness_continuity'] = {
        'value': float(round(step_db, 2)),
        'pass': bool(abs(step_db) < LOUDNESS_TOL_DB),
        'reason': f'seam RMS step {step_db:+.1f} dB '
                  f'(tol +/-{LOUDNESS_TOL_DB:.0f})',
    }

    # 5. seam discontinuity — no audible click at the splice point. reference
    # the seam's largest sample step against a HIGH QUANTILE of the surrounding
    # |delta| distribution, not its std: on percussive/bright music the biggest
    # single delta is heavy-tailed and dwarfs the std with no click present, so
    # an std-based gate false-positives on drums/electronic continuations.
    g = int(0.05 * SR)
    seg = fin_m[max(0, boundary - g):boundary + g]
    if len(seg) > 4:
        diffs = np.abs(np.diff(seg))
        local = np.abs(np.diff(fin_m[max(0, boundary - SR):boundary + SR]))
        local_ref = float(np.quantile(local, 0.999) + 1e-9) if len(local) else 1.0
        jump = float(diffs.max()) / local_ref
    else:
        jump = 0.0
    metrics['no_seam_click'] = {
        'value': round(jump, 2),
        'pass': bool(jump < SEAM_JUMP_FACTOR),
        'reason': f'max seam jump {jump:.1f}x the 99.9th-pct local delta '
                  f'(need < {SEAM_JUMP_FACTOR})',
    }

    # 6/7. tempo + key continuity (source region vs new region). regions of
    # one second or less are not analysed: the source falls back to the whole
    # track and the new region inherits the source's value.
    src_for_analysis = src_region if len(src_region) > SR else fin_m
    new_is_long_enough = len(new_region) > SR
    src_tempo = _scalar_tempo(librosa.beat.beat_track(y=src_for_analysis, sr=SR)[0])
    if new_is_long_enough:
        new_tempo = _scalar_tempo(librosa.beat.beat_track(y=new_region, sr=SR)[0])
    else:
        new_tempo = src_tempo
    tempo_dev = abs(new_tempo - src_tempo) / max(src_tempo, 1e-6)
    # allow half/double-time relationship (common + musically valid)
    half_double = min(tempo_dev,
                      abs(new_tempo - 2 * src_tempo) / max(2 * src_tempo, 1e-6),
                      abs(new_tempo - 0.5 * src_tempo) / max(0.5 * src_tempo, 1e-6))
    metrics['tempo_continuity'] = {
        'value': f'{src_tempo:.0f}->{new_tempo:.0f} bpm',
        'pass': bool(half_double < TEMPO_TOL),
        'reason': f'{half_double*100:.1f}% deviation (tol {TEMPO_TOL*100:.0f}%, '
                  f'half/double allowed)',
    }
    src_key = _key_from_audio(src_for_analysis, SR)
    new_key = _key_from_audio(new_region, SR) if new_is_long_enough else src_key
    metrics['key_continuity'] = {
        'value': f'{src_key} -> {new_key}',
        'pass': bool(new_key in _relative_keys(src_key)),
        'reason': f'generated key {new_key} vs source {src_key} '
                  f'(relative maj/min ok)',
    }

    # 8. spectral rolloff — confirm the tail isn't band-limited
    src_roll = float(np.mean(librosa.feature.spectral_rolloff(
        y=src_for_analysis, sr=SR))) + 1e-6
    new_roll = float(np.mean(librosa.feature.spectral_rolloff(
        y=new_region, sr=SR))) if new_is_long_enough else src_roll
    metrics['spectral_rolloff'] = {
        'value': f'{src_roll/1000:.1f}->{new_roll/1000:.1f} kHz',
        'pass': bool(new_roll >= ROLLOFF_MIN_RATIO * src_roll),
        'reason': f'new {new_roll/1000:.1f}kHz vs source {src_roll/1000:.1f}kHz '
                  f'(need >= {ROLLOFF_MIN_RATIO:.0%})',
    }

    # 9. stereo width in the new region. corr defaults to 1.0 ("mono") when
    # the file was mono, the region is too short, or a channel is silent.
    corr = 1.0
    if not mono_input and fin.shape[0] == 2 and fin.shape[-1] - boundary > SR:
        left, right = fin[0, boundary:], fin[1, boundary:]
        if np.std(left) > 1e-6 and np.std(right) > 1e-6:
            corr = float(np.corrcoef(left, right)[0, 1])
    lo, hi = STEREO_CORR_RANGE
    # np.corrcoef of two identical channels can land one ulp below 1.0, so an
    # exact `== 1.0` test would fail dual-mono material; compare with a small
    # tolerance instead.
    is_mono = corr >= 1.0 - 1e-9
    metrics['stereo_width'] = {
        'value': round(corr, 3),
        'pass': bool(lo <= corr <= hi or is_mono),
        'reason': f'L/R correlation {corr:.2f} (target {lo}-{hi}; '
                  f'1.0 = intentional mono)',
    }

    passed = all(m['pass'] for m in metrics.values())
    return {
        'metrics': metrics,
        'passed': passed,
        'finished': fin,
        'boundary': boundary,
        'source_seconds': source_seconds,
        'total_seconds': total_s,
    }
def print_report(report):
    """Print one PASS/FAIL line per metric, then the overall verdict.

    *report* is the dict returned by `verify`: it must carry a 'metrics'
    mapping of name -> {'pass', 'reason', ...} and a 'passed' flag. Metric
    names are left-aligned to the longest name. An empty 'metrics' mapping
    prints just the header and verdict instead of raising from max().
    """
    print("\n==== CODA VERIFY ====")
    # default=0 keeps max() from raising ValueError when there are no metrics
    width = max((len(k) for k in report['metrics']), default=0)
    for name, m in report['metrics'].items():
        tag = 'PASS' if m['pass'] else 'FAIL'
        print(f" [{tag}] {name:<{width}} {m['reason']}")
    print(f" ----\n OVERALL: {'PASS' if report['passed'] else 'FAIL'}\n")
def plot_report(report, out_png):
    """Write a diagnostic PNG for a report produced by `verify`.

    The figure has three stacked panels: the mono waveform with the seam
    marked, a mel-spectrogram with the seam marked, and a table of each
    metric's PASS/FAIL result and value. The image is saved to *out_png*.

    matplotlib is imported lazily and switched to the non-interactive "Agg"
    backend. The figure is closed even if plotting or saving raises, so a
    failed call does not leave an open figure registered with pyplot.
    """
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    import librosa.display  # submodule isn't pulled in by `import librosa`

    fin = report['finished']
    fin_m = _mono(fin)
    b = report['boundary']
    sr = SR
    t = np.arange(fin_m.shape[-1]) / sr  # time axis in seconds

    fig, axes = plt.subplots(3, 1, figsize=(12, 9), facecolor="#0b0d10")
    try:
        for ax in axes:
            ax.set_facecolor("#14171c")
            ax.tick_params(colors="#9aa4b2")
            for s in ax.spines.values():
                s.set_color("#2a2f37")

        # panel 1: waveform with the splice point marked
        axes[0].plot(t, fin_m, color="#39d0d8", lw=0.4)
        axes[0].axvline(b / sr, color="#ffb347", lw=1.5, label="seam")
        axes[0].set_title("Finished waveform (seam marked)", color="#e6e9ef")
        axes[0].legend(facecolor="#14171c", labelcolor="#e6e9ef")

        # panel 2: mel-spectrogram in dB relative to its peak
        S = librosa.feature.melspectrogram(y=fin_m, sr=sr, n_mels=128)
        Sdb = librosa.power_to_db(S, ref=np.max)
        librosa.display.specshow(Sdb, sr=sr, x_axis="time", y_axis="mel", ax=axes[1])
        axes[1].axvline(b / sr, color="#ffb347", lw=1.5)
        axes[1].set_title("Mel-spectrogram", color="#e6e9ef")

        # panel 3: pass/fail table, one row per metric
        rows = [[k, ("PASS" if m['pass'] else "FAIL"), str(m['value'])]
                for k, m in report['metrics'].items()]
        axes[2].axis("off")
        tbl = axes[2].table(cellText=rows,
                            colLabels=["metric", "result", "value"],
                            loc="center", cellLoc="left")
        tbl.auto_set_font_size(False)
        tbl.set_fontsize(9)
        for (r, c), cell in tbl.get_celld().items():
            cell.set_edgecolor("#2a2f37")
            if r == 0:
                # header row
                cell.set_facecolor("#22272e")
                cell.set_text_props(color="#e6e9ef")
            else:
                # table row r maps to rows[r - 1]; colour text by its verdict
                ok = report['metrics'][rows[r - 1][0]]['pass']
                cell.set_facecolor("#14171c")
                cell.set_text_props(color="#5fd38d" if ok else "#ff6b6b")

        fig.suptitle(f"CODA verify — {'PASS' if report['passed'] else 'FAIL'}",
                     color="#e6e9ef", fontsize=14)
        fig.tight_layout()
        fig.savefig(out_png, dpi=110, facecolor="#0b0d10")
    finally:
        plt.close(fig)
    print(f" wrote {out_png}")
if __name__ == "__main__":
    # CLI entry: <original> <finished> <source_seconds> [out.png]
    cli_args = sys.argv[1:]
    if len(cli_args) < 3:
        print("usage: python verify.py <original> <finished> "
              "<source_seconds> [out.png]")
        sys.exit(1)
    original, finished, seconds, *extra = cli_args
    rep = verify(original, finished, float(seconds))
    print_report(rep)
    if extra:
        # optional fourth argument: where to write the diagnostic PNG
        plot_report(rep, extra[0])
    # exit status 2 signals that at least one metric failed
    sys.exit(2 if not rep['passed'] else 0)