exp-beat-tracking-task / exp /data /audio.py

Upload folder using huggingface_hub

31bf74c unverified 22 days ago

8.51 kB

	"""
	Audio synthesis utilities for beat tracking evaluation.

	This module provides functions to:
	- Generate click sounds for beats and downbeats
	- Mix click tracks with original audio
	- Save audio files with beat annotations

	Example usage:
	    from exp.data.audio import create_click_track, mix_audio, save_audio

	    # Create click track
	    clicks = create_click_track(
	        beat_times=pred_beats,
	        downbeat_times=pred_downbeats,
	        duration=30.0,
	        sr=16000,
	    )

	    # Mix with original audio
	    mixed = mix_audio(original_audio, clicks, click_volume=0.5)

	    # Save to file
	    save_audio(mixed, "output.wav", sr=16000)
	"""

	import numpy as np
	from pathlib import Path


	def generate_click(
	    frequency: float = 1000.0,
	    duration: float = 0.02,
	    sr: int = 16000,
	    attack: float = 0.002,
	    decay: float = 0.018,
	) -> np.ndarray:
	    """
	    Generate a single click sound: a sine tone shaped by an attack-decay envelope.

	    The envelope ramps linearly from 0 to 1 over the first ``attack`` seconds
	    and from 1 to 0 over the last ``decay`` seconds, and is 1 in between.
	    Both ramps are clamped to the click length, so an ``attack`` or ``decay``
	    longer than ``duration`` still produces a valid, fully faded click.
	    Where the two ramps overlap, the decay ramp takes precedence.

	    Args:
	        frequency: Frequency of the click tone in Hz
	        duration: Duration of the click in seconds
	        sr: Sample rate
	        attack: Attack time in seconds
	        decay: Decay time in seconds

	    Returns:
	        Click waveform as a 1-D numpy array of ``int(duration * sr)`` samples
	        (empty when that count is zero or negative)
	    """
	    n_samples = max(int(duration * sr), 0)
	    t = np.arange(n_samples) / sr

	    # Generate sine wave
	    wave = np.sin(2 * np.pi * frequency * t)

	    # Clamp ramp lengths to the click length. Without the clamp an over-long
	    # attack fails on a shape mismatch and an over-long decay is never applied.
	    attack_samples = min(max(int(attack * sr), 0), n_samples)
	    decay_samples = min(max(int(decay * sr), 0), n_samples)

	    # Apply envelope (attack-decay); decay is written last so it wins on overlap
	    envelope = np.ones_like(t)
	    if attack_samples > 0:
	        envelope[:attack_samples] = np.linspace(0, 1, attack_samples)
	    if decay_samples > 0:
	        envelope[n_samples - decay_samples:] = np.linspace(1, 0, decay_samples)

	    return wave * envelope


	def create_click_track(
	beat_times: list[float] \| np.ndarray,
	downbeat_times: list[float] \| np.ndarray \| None = None,
	duration: float \| None = None,
	sr: int = 16000,
	beat_freq: float = 1000.0,
	downbeat_freq: float = 1500.0,
	click_duration: float = 0.03,
	) -> np.ndarray:
	"""
	Create a click track from beat and downbeat times.

	Args:
	beat_times: List of beat times in seconds
	downbeat_times: List of downbeat times in seconds (optional)
	duration: Total duration in seconds (auto-detected if None)
	sr: Sample rate
	beat_freq: Frequency for beat clicks (Hz)
	downbeat_freq: Frequency for downbeat clicks (Hz)
	click_duration: Duration of each click in seconds

	Returns:
	Click track as numpy array
	"""
	beat_times = np.array(beat_times) if len(beat_times) > 0 else np.array([])
	if downbeat_times is not None:
	downbeat_times = (
	np.array(downbeat_times) if len(downbeat_times) > 0 else np.array([])
	)
	else:
	downbeat_times = np.array([])

	# Determine duration
	if duration is None:
	all_times = np.concatenate([beat_times, downbeat_times])
	if len(all_times) == 0:
	return np.array([])
	duration = float(np.max(all_times)) + 1.0

	# Create output array
	total_samples = int(duration * sr)
	output = np.zeros(total_samples, dtype=np.float32)

	# Generate click templates
	beat_click = generate_click(frequency=beat_freq, duration=click_duration, sr=sr)
	downbeat_click = generate_click(
	frequency=downbeat_freq, duration=click_duration, sr=sr
	)

	# Convert downbeat times to set for fast lookup
	downbeat_set = set(np.round(downbeat_times, 3))

	# Add beat clicks
	for t in beat_times:
	sample_idx = int(t * sr)
	if sample_idx < 0 or sample_idx >= total_samples:
	continue

	# Use downbeat click if this is also a downbeat
	is_downbeat = np.round(t, 3) in downbeat_set
	click = downbeat_click if is_downbeat else beat_click

	# Add click to output
	end_idx = min(sample_idx + len(click), total_samples)
	click_len = end_idx - sample_idx
	output[sample_idx:end_idx] += click[:click_len]

	# Add downbeat clicks (for downbeats not already in beats)
	beat_set = set(np.round(beat_times, 3))
	for t in downbeat_times:
	if np.round(t, 3) in beat_set:
	continue # Already added as beat

	sample_idx = int(t * sr)
	if sample_idx < 0 or sample_idx >= total_samples:
	continue

	end_idx = min(sample_idx + len(downbeat_click), total_samples)
	click_len = end_idx - sample_idx
	output[sample_idx:end_idx] += downbeat_click[:click_len]

	return output


	def mix_audio(
	    audio: np.ndarray,
	    click_track: np.ndarray,
	    click_volume: float = 0.5,
	) -> np.ndarray:
	    """
	    Mix original audio with a click track.

	    The shorter input is zero-padded to the length of the longer one. The
	    audio is peak-normalized to 0.8 and the clicks to ``click_volume * 0.8``
	    (an all-zero signal is left as is). If the sum exceeds 1.0 in absolute
	    value, the mix is rescaled to a peak of 0.95.

	    Args:
	        audio: Original audio waveform
	        click_track: Click track to overlay
	        click_volume: Volume of clicks relative to audio (0.0 to 1.0)

	    Returns:
	        Mixed audio as a float32 array; empty when both inputs are empty
	    """
	    max_len = max(len(audio), len(click_track))
	    if max_len == 0:
	        # Nothing to mix; the peak computations below cannot reduce an
	        # empty array, so return early.
	        return np.zeros(0, dtype=np.float32)

	    # Ensure same length
	    audio_padded = np.zeros(max_len, dtype=np.float32)
	    click_padded = np.zeros(max_len, dtype=np.float32)
	    audio_padded[: len(audio)] = audio
	    click_padded[: len(click_track)] = click_track

	    # Normalize audio (skipped for silence to avoid dividing by zero)
	    audio_max = np.abs(audio_padded).max()
	    if audio_max > 0:
	        audio_padded = audio_padded / audio_max * 0.8

	    # Normalize clicks
	    click_max = np.abs(click_padded).max()
	    if click_max > 0:
	        click_padded = click_padded / click_max * click_volume * 0.8

	    # Mix
	    mixed = audio_padded + click_padded

	    # Prevent clipping
	    max_val = np.abs(mixed).max()
	    if max_val > 1.0:
	        mixed = mixed / max_val * 0.95

	    return mixed.astype(np.float32)


	def create_comparison_audio(
	    audio: np.ndarray,
	    pred_beats: list[float],
	    pred_downbeats: list[float],
	    gt_beats: list[float],
	    gt_downbeats: list[float],
	    sr: int = 16000,
	    click_volume: float = 0.5,
	) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
	    """
	    Build three comparison mixes of the audio: with prediction clicks, with
	    ground truth clicks, and with both click tracks together.

	    Args:
	        audio: Original audio waveform
	        pred_beats: Predicted beat times
	        pred_downbeats: Predicted downbeat times
	        gt_beats: Ground truth beat times
	        gt_downbeats: Ground truth downbeat times
	        sr: Sample rate
	        click_volume: Volume of clicks

	    Returns:
	        Tuple of (audio_with_pred_clicks, audio_with_gt_clicks, audio_with_both)
	    """
	    # Both click tracks are built to span exactly the length of the audio
	    total_seconds = len(audio) / sr

	    predicted_track = create_click_track(
	        pred_beats,
	        pred_downbeats,
	        duration=total_seconds,
	        sr=sr,
	        beat_freq=1000.0,
	        downbeat_freq=1500.0,
	    )

	    # Ground truth uses different (lower) click frequencies than the predictions
	    reference_track = create_click_track(
	        gt_beats,
	        gt_downbeats,
	        duration=total_seconds,
	        sr=sr,
	        beat_freq=800.0,
	        downbeat_freq=1200.0,
	    )

	    # Mix each overlay with the audio: predictions, ground truth, then both
	    return (
	        mix_audio(audio, predicted_track, click_volume),
	        mix_audio(audio, reference_track, click_volume),
	        mix_audio(audio, predicted_track + reference_track, click_volume),
	    )


	def save_audio(
	audio: np.ndarray,
	path: str \| Path,
	sr: int = 16000,
	) -> None:
	"""
	Save audio to a WAV file.

	Args:
	audio: Audio waveform
	path: Output file path
	sr: Sample rate
	"""
	import scipy.io.wavfile as wavfile

	path = Path(path)
	path.parent.mkdir(parents=True, exist_ok=True)

	# Convert to int16
	audio_int16 = (audio * 32767).astype(np.int16)
	wavfile.write(str(path), sr, audio_int16)


	if __name__ == "__main__":
	    # Demo: lay a synthetic click track over a plain tone and write the result to disk.
	    print("Audio synthesis demo...")

	    sr = 16000
	    duration = 10.0

	    # A 220 Hz sine at amplitude 0.3 stands in for the "music"
	    t = np.arange(int(duration * sr)) / sr
	    music = 0.3 * np.sin(2 * np.pi * 220 * t)

	    # One beat every 0.5 s, one downbeat every 2 s
	    beats = np.arange(0, duration, 0.5).tolist()
	    downbeats = np.arange(0, duration, 2.0).tolist()

	    # Build the click track and blend it with the tone
	    clicks = create_click_track(beats, downbeats, duration=duration, sr=sr)
	    mixed = mix_audio(music, clicks, click_volume=0.6)

	    print(f"Created mixed audio: {len(mixed)} samples ({len(mixed) / sr:.2f}s)")
	    print(f"Beats: {len(beats)}, Downbeats: {len(downbeats)}")

	    # Write the demo mix to disk
	    save_audio(mixed, "/tmp/beat_click_demo.wav", sr=sr)
	    print("Saved demo to /tmp/beat_click_demo.wav")