""" Audio synthesis utilities for beat tracking evaluation. This module provides functions to: - Generate click sounds for beats and downbeats - Mix click tracks with original audio - Save audio files with beat annotations Example usage: from exp.data.audio import create_click_track, mix_audio, save_audio # Create click track clicks = create_click_track( beat_times=pred_beats, downbeat_times=pred_downbeats, duration=30.0, sr=16000 ) # Mix with original audio mixed = mix_audio(original_audio, clicks, click_volume=0.5) # Save to file save_audio(mixed, "output.wav", sr=16000) """ import numpy as np from pathlib import Path def generate_click( frequency: float = 1000.0, duration: float = 0.02, sr: int = 16000, attack: float = 0.002, decay: float = 0.018, ) -> np.ndarray: """ Generate a single click sound. Args: frequency: Frequency of the click tone in Hz duration: Duration of the click in seconds sr: Sample rate attack: Attack time in seconds decay: Decay time in seconds Returns: Click waveform as numpy array """ t = np.arange(int(duration * sr)) / sr # Generate sine wave wave = np.sin(2 * np.pi * frequency * t) # Apply envelope (attack-decay) envelope = np.ones_like(t) attack_samples = int(attack * sr) decay_samples = int(decay * sr) if attack_samples > 0: envelope[:attack_samples] = np.linspace(0, 1, attack_samples) if decay_samples > 0: decay_start = len(t) - decay_samples if decay_start > 0: envelope[decay_start:] = np.linspace(1, 0, decay_samples) return wave * envelope def create_click_track( beat_times: list[float] | np.ndarray, downbeat_times: list[float] | np.ndarray | None = None, duration: float | None = None, sr: int = 16000, beat_freq: float = 1000.0, downbeat_freq: float = 1500.0, click_duration: float = 0.03, ) -> np.ndarray: """ Create a click track from beat and downbeat times. Args: beat_times: List of beat times in seconds downbeat_times: List of downbeat times in seconds (optional) duration: Total duration in seconds (auto-detected if None) sr: Sample rate beat_freq: Frequency for beat clicks (Hz) downbeat_freq: Frequency for downbeat clicks (Hz) click_duration: Duration of each click in seconds Returns: Click track as numpy array """ beat_times = np.array(beat_times) if len(beat_times) > 0 else np.array([]) if downbeat_times is not None: downbeat_times = ( np.array(downbeat_times) if len(downbeat_times) > 0 else np.array([]) ) else: downbeat_times = np.array([]) # Determine duration if duration is None: all_times = np.concatenate([beat_times, downbeat_times]) if len(all_times) == 0: return np.array([]) duration = float(np.max(all_times)) + 1.0 # Create output array total_samples = int(duration * sr) output = np.zeros(total_samples, dtype=np.float32) # Generate click templates beat_click = generate_click(frequency=beat_freq, duration=click_duration, sr=sr) downbeat_click = generate_click( frequency=downbeat_freq, duration=click_duration, sr=sr ) # Convert downbeat times to set for fast lookup downbeat_set = set(np.round(downbeat_times, 3)) # Add beat clicks for t in beat_times: sample_idx = int(t * sr) if sample_idx < 0 or sample_idx >= total_samples: continue # Use downbeat click if this is also a downbeat is_downbeat = np.round(t, 3) in downbeat_set click = downbeat_click if is_downbeat else beat_click # Add click to output end_idx = min(sample_idx + len(click), total_samples) click_len = end_idx - sample_idx output[sample_idx:end_idx] += click[:click_len] # Add downbeat clicks (for downbeats not already in beats) beat_set = set(np.round(beat_times, 3)) for t in downbeat_times: if np.round(t, 3) in beat_set: continue # Already added as beat sample_idx = int(t * sr) if sample_idx < 0 or sample_idx >= total_samples: continue end_idx = min(sample_idx + len(downbeat_click), total_samples) click_len = end_idx - sample_idx output[sample_idx:end_idx] += downbeat_click[:click_len] return output def mix_audio( audio: np.ndarray, click_track: np.ndarray, click_volume: float = 0.5, ) -> np.ndarray: """ Mix original audio with a click track. Args: audio: Original audio waveform click_track: Click track to overlay click_volume: Volume of clicks relative to audio (0.0 to 1.0) Returns: Mixed audio """ # Ensure same length max_len = max(len(audio), len(click_track)) audio_padded = np.zeros(max_len, dtype=np.float32) click_padded = np.zeros(max_len, dtype=np.float32) audio_padded[: len(audio)] = audio click_padded[: len(click_track)] = click_track # Normalize audio audio_max = np.abs(audio_padded).max() if audio_max > 0: audio_padded = audio_padded / audio_max * 0.8 # Normalize clicks click_max = np.abs(click_padded).max() if click_max > 0: click_padded = click_padded / click_max * click_volume * 0.8 # Mix mixed = audio_padded + click_padded # Prevent clipping max_val = np.abs(mixed).max() if max_val > 1.0: mixed = mixed / max_val * 0.95 return mixed.astype(np.float32) def create_comparison_audio( audio: np.ndarray, pred_beats: list[float], pred_downbeats: list[float], gt_beats: list[float], gt_downbeats: list[float], sr: int = 16000, click_volume: float = 0.5, ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Create audio files for comparison: prediction clicks, ground truth clicks, and combined. Args: audio: Original audio waveform pred_beats: Predicted beat times pred_downbeats: Predicted downbeat times gt_beats: Ground truth beat times gt_downbeats: Ground truth downbeat times sr: Sample rate click_volume: Volume of clicks Returns: Tuple of (audio_with_pred_clicks, audio_with_gt_clicks, audio_with_both) """ duration = len(audio) / sr # Create click tracks pred_clicks = create_click_track( pred_beats, pred_downbeats, duration=duration, sr=sr, beat_freq=1000.0, downbeat_freq=1500.0, ) gt_clicks = create_click_track( gt_beats, gt_downbeats, duration=duration, sr=sr, beat_freq=800.0, # Different frequency for GT downbeat_freq=1200.0, ) # Mix audio_pred = mix_audio(audio, pred_clicks, click_volume) audio_gt = mix_audio(audio, gt_clicks, click_volume) audio_both = mix_audio(audio, pred_clicks + gt_clicks, click_volume) return audio_pred, audio_gt, audio_both def save_audio( audio: np.ndarray, path: str | Path, sr: int = 16000, ) -> None: """ Save audio to a WAV file. Args: audio: Audio waveform path: Output file path sr: Sample rate """ import scipy.io.wavfile as wavfile path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) # Convert to int16 audio_int16 = (audio * 32767).astype(np.int16) wavfile.write(str(path), sr, audio_int16) if __name__ == "__main__": # Demo print("Audio synthesis demo...") # Create a simple sine wave as "music" sr = 16000 duration = 10.0 t = np.arange(int(duration * sr)) / sr music = np.sin(2 * np.pi * 220 * t) * 0.3 # 220 Hz tone # Beats every 0.5s, downbeats every 2s beats = np.arange(0, duration, 0.5).tolist() downbeats = np.arange(0, duration, 2.0).tolist() # Create click track clicks = create_click_track(beats, downbeats, duration=duration, sr=sr) # Mix mixed = mix_audio(music, clicks, click_volume=0.6) print(f"Created mixed audio: {len(mixed)} samples ({len(mixed) / sr:.2f}s)") print(f"Beats: {len(beats)}, Downbeats: {len(downbeats)}") # Save demo save_audio(mixed, "/tmp/beat_click_demo.wav", sr=sr) print("Saved demo to /tmp/beat_click_demo.wav")