Spaces:

rikhoffbauer2
/

ai-techno-dj

Sleeping

Rik Hoffbauer

Implement waveform cue editor and feedback-to-learning path

6362e08 29 days ago

11.2 kB

	"""Waveform-backed manual cue editing helpers.

	The previous manual editor only exposed numeric inputs. This module adds a
	visual, audio-derived cue editor: it renders waveform overviews for the two
	tracks in a transition, overlays selected cue positions and alternative cue
	candidates, and returns stable cue-choice strings that can be applied back to
	TransitionPlan objects.

	The UI remains deliberately simple because Gradio event/click APIs vary across
	versions. The backend is still real: the waveform image is computed from the
	actual audio files, candidate lists are built from analysis cue objects, and
	manual edits become explicit cue overrides that can later be exported as
	training examples.
	"""

	from __future__ import annotations

	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any, Iterable, Mapping
	import hashlib
	import math
	import tempfile

	import librosa
	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	import numpy as np


	@dataclass(frozen=True)
	class CueChoice:
	    """One selectable cue candidate for a transition role.

	    ``value`` is the string handed to the UI as the choice value and is
	    reversed by ``parse_cue_choice``; ``display`` is the human-readable label.
	    """

	    role: str  # role the cue is offered for (e.g. "a_out", "b_in", "b_drop")
	    time: float  # cue position; formatted with an "s" suffix in ``display``
	    label: str  # free-text cue name
	    confidence: float  # formatted as a percentage in ``display``
	    source: str = ""  # evidence source; empty when unknown

	    @property
	    def value(self) -> str:
	        """Return ``role|time|confidence|label|source`` with 3-decimal numbers."""
	        # "|" separates the fields, so it is replaced inside the free-text ones.
	        safe_label = self.label.replace("|", "/")
	        safe_source = self.source.replace("|", "/")
	        return f"{self.role}|{self.time:.3f}|{self.confidence:.3f}|{safe_label}|{safe_source}"

	    @property
	    def display(self) -> str:
	        """Return the human-readable label shown for this choice."""
	        source = f" · {self.source}" if self.source else ""
	        return f"{self.role} @ {self.time:.2f}s · {self.confidence:.0%} · {self.label}{source}"


	def parse_cue_choice(value: str | None) -> CueChoice | None:
	    """Parse a string produced by ``CueChoice.value`` back into a ``CueChoice``.

	    Returns ``None`` for empty input, for strings with fewer than four
	    ``|``-separated fields, and when time or confidence is not a finite number.
	    A missing fifth field yields an empty ``source``.
	    """
	    if not value:
	        return None
	    # maxsplit=4 keeps any further "|" inside the trailing source field.
	    parts = str(value).split("|", 4)
	    if len(parts) < 4:
	        return None
	    role, time_text, confidence_text, label = parts[:4]
	    source = parts[4] if len(parts) > 4 else ""
	    try:
	        time_s = float(time_text)
	        confidence = float(confidence_text)
	    except ValueError:
	        return None
	    # float() accepts "nan"/"inf"; such values are useless as cue positions.
	    if not (math.isfinite(time_s) and math.isfinite(confidence)):
	        return None
	    return CueChoice(role=role, time=time_s, confidence=confidence, label=label, source=source)


	def _cue_source(cue: Mapping[str, Any]) -> str:
	    """Return the cue's ``evidence["source"]`` as a string, or ``""`` if absent."""
	    evidence = cue.get("evidence")
	    if not isinstance(evidence, Mapping):
	        return ""
	    source = evidence.get("source")
	    # A present-but-null source must not reach the UI as the text "None".
	    return "" if source is None else str(source)


	def cue_choices(track: Any, role: str, *, limit: int = 12) -> list[tuple[str, str]]:
	    """Return Gradio-compatible `(label, value)` cue choices for a role.

	    Cues are read from ``track.cue_points``, kept when their ``kind`` (or
	    ``type``) matches the role's aliases, ranked by descending confidence and
	    then ascending time, and truncated to ``limit`` entries. A missing or
	    ``None`` cue list and non-mapping entries yield no choices instead of
	    raising.
	    """
	    role_aliases = {
	        "a_out": {"mix_out", "loopable", "drop"},
	        "b_in": {"mix_in", "loopable"},
	        "b_drop": {"first_drop", "drop"},
	    }
	    # Unknown roles match only cues whose kind equals the role itself.
	    accepted = role_aliases.get(role, {role})

	    def kind_of(cue: Mapping[str, Any]) -> str:
	        return str(cue.get("kind", cue.get("type", "")))

	    def number(cue: Mapping[str, Any], key: str) -> float:
	        # Missing, None and 0 all collapse to 0.0.
	        return float(cue.get(key, 0.0) or 0.0)

	    candidates = [
	        cue
	        for cue in (getattr(track, "cue_points", None) or [])
	        if isinstance(cue, Mapping) and kind_of(cue) in accepted
	    ]
	    candidates.sort(key=lambda cue: (-number(cue, "confidence"), number(cue, "time")))

	    choices: list[tuple[str, str]] = []
	    for cue in candidates[:limit]:
	        choice = CueChoice(
	            role=role,
	            time=number(cue, "time"),
	            # Without an explicit label, reuse the kind/type the filter matched on.
	            label=str(cue.get("label", kind_of(cue) or "cue")),
	            confidence=number(cue, "confidence"),
	            source=_cue_source(cue),
	        )
	        choices.append((choice.display, choice.value))
	    return choices


	def default_choice(track: Any, role: str, time_s: float) -> str | None:
	    """Return the choice value whose cue time is closest to ``time_s``.

	    Candidates come from ``cue_choices(track, role)``. Returns ``None`` when
	    there are no candidates or none of them parses back into a cue.
	    """
	    usable = []
	    for _label, value in cue_choices(track, role):
	        cue = parse_cue_choice(value)
	        if cue is not None:
	            usable.append((value, cue))
	    if not usable:
	        return None
	    target = float(time_s)
	    # min() keeps the first of equally distant candidates, i.e. the one that
	    # cue_choices ranked higher.
	    best_value, _best_cue = min(usable, key=lambda pair: abs(pair[1].time - target))
	    return best_value


	def _load_preview(path: str, *, max_duration: float = 300.0, sr: int = 12000) -> tuple[np.ndarray, int]:
	    """Load a mono, peak-normalised preview of ``path`` for waveform drawing.

	    At most ``max_duration`` seconds are requested from ``librosa.load`` at
	    sample rate ``sr``. If loading raises, or yields no samples, ``sr`` zero
	    samples are returned instead so the caller still has something to draw.

	    Returns ``(samples, sample_rate)`` with float32 samples.
	    """
	    try:
	        samples, rate = librosa.load(path, sr=sr, mono=True, duration=max_duration)
	    except Exception:
	        # A missing/corrupt file should not kill the editor. Return a visible flatline.
	        samples, rate = np.zeros(sr, dtype=np.float32), sr
	    samples = np.asarray(samples, dtype=np.float32)
	    if samples.size == 0:
	        samples = np.zeros(sr, dtype=np.float32)
	    peak = float(np.max(np.abs(samples)))
	    # Silence (peak 0) is left untouched to avoid dividing by zero.
	    if peak > 0:
	        samples = samples / peak
	    return samples, rate


	def _amplitude_envelope(y: np.ndarray, sr: int, *, bins: int = 1800) -> tuple[np.ndarray, np.ndarray]:
	    """Reduce ``y`` to a per-bin peak envelope for plotting.

	    The samples are split into ``bins`` contiguous slices (clamped to at most
	    ``len(y)`` and at least 64) and each slice contributes its maximum absolute
	    value; empty slices contribute 0.

	    Returns ``(times, env)``: ``times`` is evenly spaced from 0 to
	    ``len(y) / sr`` and ``env`` is a float32 array of the same length. Empty
	    input returns two single-element zero arrays.
	    """
	    total = len(y)
	    if total == 0:
	        return np.array([0.0]), np.array([0.0])
	    bins = max(64, min(bins, total))
	    edges = np.linspace(0, total, bins + 1, dtype=int)
	    # Rectify once up front instead of once per bin.
	    magnitude = np.abs(y)
	    env = np.zeros(bins, dtype=np.float32)
	    for i, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
	        # With fewer samples than bins some slices are empty and stay at 0.
	        if hi > lo:
	            env[i] = magnitude[lo:hi].max()
	    times = np.linspace(0.0, total / sr, bins)
	    return times, env


	def _draw_track(ax: Any, track: Any, *, selected: dict[str, float], title: str) -> None:
	    """Draw one track's waveform, segments, cue candidates and selected markers on ``ax``.

	    ``track`` is read through ``getattr`` for ``path``, ``duration``,
	    ``segments`` and ``cue_points``. ``selected`` maps a marker name to the
	    time at which a bold black marker is drawn.
	    """
	    preview_limit = float(getattr(track, "duration", 300.0) or 300.0)
	    y, sr = _load_preview(getattr(track, "path", ""), max_duration=preview_limit)
	    times, env = _amplitude_envelope(y, sr)
	    ax.fill_between(times, -env, env, alpha=0.35, linewidth=0)
	    ax.plot(times, env, linewidth=0.35)
	    ax.plot(times, -env, linewidth=0.35)

	    has_waveform = len(times) > 0
	    waveform_end = float(times[-1]) if has_waveform else 0.0
	    # A missing, None or zero duration falls back to the rendered waveform
	    # length; collapsing it to 0 would skip every cue candidate below.
	    duration = float(getattr(track, "duration", None) or waveform_end)
	    visible_end = min(duration, waveform_end) if has_waveform else duration
	    ax.set_xlim(0, max(1.0, visible_end))
	    ax.set_ylim(-1.05, 1.05)
	    ax.set_yticks([])
	    ax.set_title(title, loc="left", fontsize=10)
	    ax.set_xlabel("seconds")

	    # Segment spans give the user context beyond the raw waveform.
	    for seg in (getattr(track, "segments", None) or [])[:40]:
	        if not isinstance(seg, Mapping):
	            continue
	        start = float(seg.get("start", 0.0) or 0.0)
	        end = float(seg.get("end", start) or start)
	        label = str(seg.get("label", "section"))
	        if end <= start:
	            continue
	        ax.axvspan(start, end, alpha=0.04)
	        # Only spans longer than 5 get a text label.
	        if end - start > 5:
	            ax.text(start + 0.15, 0.82, label, fontsize=7, alpha=0.65)

	    cue_palette = {
	        "mix_in": (0.2, 0.7, 0.2),
	        "mix_out": (0.8, 0.25, 0.2),
	        "first_drop": (0.55, 0.2, 0.8),
	        "drop": (0.55, 0.2, 0.8),
	        "loopable": (0.2, 0.45, 0.85),
	    }
	    fallback_color = (0.3, 0.3, 0.3)
	    for cue in (getattr(track, "cue_points", None) or [])[:60]:
	        # Same guard as for segments: ignore entries that are not mappings.
	        if not isinstance(cue, Mapping):
	            continue
	        kind = str(cue.get("kind", cue.get("type", "cue")))
	        t = float(cue.get("time", 0.0) or 0.0)
	        if t < 0 or t > duration:
	            continue
	        conf = float(cue.get("confidence", 0.0) or 0.0)
	        # Line opacity scales with confidence, clamped to the 0.12-0.55 range.
	        opacity = max(0.12, min(0.55, conf * 0.55))
	        ax.axvline(t, color=cue_palette.get(kind, fallback_color), alpha=opacity, linewidth=0.8)

	    for name, t in selected.items():
	        ax.axvline(float(t), color="black", linewidth=2.0, alpha=0.95)
	        ax.text(float(t), -0.92, name, rotation=90, va="bottom", ha="right", fontsize=8, fontweight="bold")


	def render_transition_cue_editor(track_a: Any, track_b: Any, plan: Any, *, output_dir: str | Path | None = None) -> tuple[str, str]:
	    """Render a two-track waveform/cue overview and return `(png_path, markdown)`.

	    The PNG is written to ``output_dir`` (the system temp directory when
	    omitted; created if missing) under a name derived from the two track paths
	    and the plan's mix points, duration and transition type. Selected cue times
	    come from ``plan.selected_cues`` and fall back to the plan's mix points.
	    """
	    output_dir = Path(output_dir or tempfile.gettempdir())
	    output_dir.mkdir(parents=True, exist_ok=True)
	    fingerprint_fields = (
	        getattr(track_a, "path", ""),
	        getattr(track_b, "path", ""),
	        getattr(plan, "mix_out_point", 0),
	        getattr(plan, "mix_in_point", 0),
	        getattr(plan, "duration_seconds", 0),
	        getattr(plan, "transition_type", ""),
	    )
	    # The hash only keeps the file name short and stable; it is not a security measure.
	    fingerprint = hashlib.sha1("|".join(f"{field}" for field in fingerprint_fields).encode()).hexdigest()[:12]
	    out = output_dir / f"ai-dj-cue-editor-{fingerprint}.png"

	    selected = getattr(plan, "selected_cues", {}) or {}
	    a_out = float(selected.get("a_out", {}).get("time", getattr(plan, "mix_out_point", 0.0)))
	    b_in = float(selected.get("b_in", {}).get("time", getattr(plan, "mix_in_point", 0.0)))
	    b_drop = float(selected.get("b_drop", {}).get("time", b_in + getattr(plan, "duration_seconds", 0.0)))

	    fig, axes = plt.subplots(2, 1, figsize=(15, 5.2), constrained_layout=True)
	    try:
	        _draw_track(axes[0], track_a, selected={"A OUT": a_out}, title=f"A: {getattr(track_a, 'filename', 'track A')}")
	        _draw_track(axes[1], track_b, selected={"B IN": b_in, "B DROP": b_drop}, title=f"B: {getattr(track_b, 'filename', 'track B')}")
	        fig.suptitle(f"Transition cue editor · {getattr(plan, 'transition_type', 'transition')} · {getattr(plan, 'duration_beats', '?')} beats", fontsize=12)
	        fig.savefig(out, dpi=150)
	    finally:
	        # pyplot keeps figures registered until closed, so close even when
	        # drawing or saving fails.
	        plt.close(fig)

	    summary = [
	        "### Waveform cue editor",
	        "The black markers are the currently selected transition anchors. Thin colored lines are ranked cue candidates from analysis.",
	        f"- A mix-out: {a_out:.2f}s",
	        f"- B mix-in: {b_in:.2f}s",
	        f"- B drop: {b_drop:.2f}s",
	        f"- Transition type: `{getattr(plan, 'transition_type', 'unknown')}`",
	        f"- Duration: {float(getattr(plan, 'duration_seconds', 0.0)):.2f}s / {int(getattr(plan, 'duration_beats', 0))} beats",
	    ]
	    return str(out), "\n".join(summary)


	def choices_for_transition(track_a: Any, track_b: Any, plan: Any) -> dict[str, Any]:
	    """Return choice lists and defaults for the UI/backend tests.

	    The ``*_choices`` entries are ``cue_choices`` lists for track A's mix-out
	    and track B's mix-in and drop. Each ``*_default`` is the choice value
	    closest to the plan's current time for that role, or ``None`` when the
	    role has no candidates.
	    """
	    # Missing or None plan fields count as 0.0 for all three anchors.
	    mix_out = float(getattr(plan, "mix_out_point", 0.0) or 0.0)
	    mix_in = float(getattr(plan, "mix_in_point", 0.0) or 0.0)
	    duration = float(getattr(plan, "duration_seconds", 0.0) or 0.0)
	    return {
	        "a_choices": cue_choices(track_a, "a_out"),
	        "b_in_choices": cue_choices(track_b, "b_in"),
	        "b_drop_choices": cue_choices(track_b, "b_drop"),
	        "a_default": default_choice(track_a, "a_out", mix_out),
	        "b_in_default": default_choice(track_b, "b_in", mix_in),
	        # The drop is expected one transition length after the mix-in point.
	        "b_drop_default": default_choice(track_b, "b_drop", mix_in + duration),
	    }


	def _selected_cue_entry(kind: str, choice: CueChoice | None, time_s: float) -> dict[str, Any]:
	    """Build one ``selected_cues`` entry from a parsed choice, or a manual placeholder when it is ``None``."""
	    if choice is None:
	        label, confidence, source = "manual waveform value", 1.0, "waveform_editor"
	    else:
	        label, confidence, source = choice.label, choice.confidence, choice.source
	    return {
	        "kind": kind,
	        "label": label,
	        "time": round(time_s, 3),
	        "confidence": confidence,
	        "evidence": {"source": source},
	    }


	def apply_choices_to_plan(plan: Any, *, a_choice: str | None, b_in_choice: str | None, b_drop_choice: str | None, transition_type: str | None = None) -> tuple[float, float, float, dict[str, Any]]:
	    """Resolve cue-choice strings against a TransitionPlan-like object.

	    Each choice string is parsed with ``parse_cue_choice``; a missing or
	    unparseable choice falls back to the plan's current value. The only
	    attribute written on ``plan`` is ``transition_type``, and only when a
	    non-empty ``transition_type`` is given.

	    Returns `(mix_out, mix_in, duration_seconds, selected_cues)` so callers can
	    update additional derived fields such as beat count.
	    """
	    a = parse_cue_choice(a_choice)
	    b = parse_cue_choice(b_in_choice)
	    d = parse_cue_choice(b_drop_choice)
	    mix_out = float(a.time if a is not None else getattr(plan, "mix_out_point", 0.0) or 0.0)
	    mix_in = float(b.time if b is not None else getattr(plan, "mix_in_point", 0.0) or 0.0)
	    if d is not None:
	        drop = float(d.time)
	    else:
	        drop = mix_in + float(getattr(plan, "duration_seconds", 0.0) or 0.0)
	    # Never report a transition shorter than 0.25, even if the drop is at or
	    # before the mix-in point.
	    duration = max(0.25, drop - mix_in)
	    if transition_type:
	        setattr(plan, "transition_type", transition_type)
	    selected = {
	        "a_out": _selected_cue_entry("mix_out", a, mix_out),
	        "b_in": _selected_cue_entry("mix_in", b, mix_in),
	        "b_drop": _selected_cue_entry("drop", d, drop),
	    }
	    return mix_out, mix_in, duration, selected