|
|
""" |
|
|
Audio synthesis utilities for beat tracking evaluation. |
|
|
|
|
|
This module provides functions to: |
|
|
- Generate click sounds for beats and downbeats |
|
|
- Mix click tracks with original audio |
|
|
- Save audio files with beat annotations |
|
|
|
|
|
Example usage: |
|
|
from exp.data.audio import create_click_track, mix_audio, save_audio |
|
|
|
|
|
# Create click track |
|
|
clicks = create_click_track( |
|
|
beat_times=pred_beats, |
|
|
downbeat_times=pred_downbeats, |
|
|
duration=30.0, |
|
|
sr=16000 |
|
|
) |
|
|
|
|
|
# Mix with original audio |
|
|
mixed = mix_audio(original_audio, clicks, click_volume=0.5) |
|
|
|
|
|
# Save to file |
|
|
save_audio(mixed, "output.wav", sr=16000) |
|
|
""" |
|
|
|
|
|
import numpy as np |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
def generate_click(
    frequency: float = 1000.0,
    duration: float = 0.02,
    sr: int = 16000,
    attack: float = 0.002,
    decay: float = 0.018,
) -> np.ndarray:
    """
    Generate a single click sound: a sine tone shaped by a linear envelope.

    The envelope rises linearly from 0 to 1 over ``attack`` seconds at the
    start of the click and falls linearly from 1 to 0 over ``decay`` seconds
    at its end. A ramp that is longer than the click itself is truncated to
    fit (the attack keeps its beginning, the decay keeps its end, so the
    click still starts and finishes at zero amplitude). Where the two ramps
    overlap they are multiplied together.

    Args:
        frequency: Frequency of the click tone in Hz
        duration: Duration of the click in seconds
        sr: Sample rate
        attack: Attack time in seconds
        decay: Decay time in seconds

    Returns:
        Click waveform as a 1-D numpy array with ``int(duration * sr)``
        samples (empty when that count is not positive)
    """
    t = np.arange(int(duration * sr)) / sr
    num_samples = len(t)

    wave = np.sin(2 * np.pi * frequency * t)

    envelope = np.ones_like(t)
    attack_samples = int(attack * sr)
    decay_samples = int(decay * sr)

    # Clamp each ramp to the click length so the slice assignment below can
    # never see mismatched shapes, even for very short clicks.
    attack_len = min(attack_samples, num_samples)
    if attack_len > 0:
        ramp_up = np.linspace(0, 1, attack_samples)[:attack_len]
        envelope[:attack_len] *= ramp_up

    decay_len = min(decay_samples, num_samples)
    if decay_len > 0:
        # Keep the tail of the ramp so the final sample always reaches zero.
        ramp_down = np.linspace(1, 0, decay_samples)[decay_samples - decay_len:]
        envelope[num_samples - decay_len:] *= ramp_down

    return wave * envelope
|
|
|
|
|
|
|
|
def create_click_track( |
|
|
beat_times: list[float] | np.ndarray, |
|
|
downbeat_times: list[float] | np.ndarray | None = None, |
|
|
duration: float | None = None, |
|
|
sr: int = 16000, |
|
|
beat_freq: float = 1000.0, |
|
|
downbeat_freq: float = 1500.0, |
|
|
click_duration: float = 0.03, |
|
|
) -> np.ndarray: |
|
|
""" |
|
|
Create a click track from beat and downbeat times. |
|
|
|
|
|
Args: |
|
|
beat_times: List of beat times in seconds |
|
|
downbeat_times: List of downbeat times in seconds (optional) |
|
|
duration: Total duration in seconds (auto-detected if None) |
|
|
sr: Sample rate |
|
|
beat_freq: Frequency for beat clicks (Hz) |
|
|
downbeat_freq: Frequency for downbeat clicks (Hz) |
|
|
click_duration: Duration of each click in seconds |
|
|
|
|
|
Returns: |
|
|
Click track as numpy array |
|
|
""" |
|
|
beat_times = np.array(beat_times) if len(beat_times) > 0 else np.array([]) |
|
|
if downbeat_times is not None: |
|
|
downbeat_times = ( |
|
|
np.array(downbeat_times) if len(downbeat_times) > 0 else np.array([]) |
|
|
) |
|
|
else: |
|
|
downbeat_times = np.array([]) |
|
|
|
|
|
|
|
|
if duration is None: |
|
|
all_times = np.concatenate([beat_times, downbeat_times]) |
|
|
if len(all_times) == 0: |
|
|
return np.array([]) |
|
|
duration = float(np.max(all_times)) + 1.0 |
|
|
|
|
|
|
|
|
total_samples = int(duration * sr) |
|
|
output = np.zeros(total_samples, dtype=np.float32) |
|
|
|
|
|
|
|
|
beat_click = generate_click(frequency=beat_freq, duration=click_duration, sr=sr) |
|
|
downbeat_click = generate_click( |
|
|
frequency=downbeat_freq, duration=click_duration, sr=sr |
|
|
) |
|
|
|
|
|
|
|
|
downbeat_set = set(np.round(downbeat_times, 3)) |
|
|
|
|
|
|
|
|
for t in beat_times: |
|
|
sample_idx = int(t * sr) |
|
|
if sample_idx < 0 or sample_idx >= total_samples: |
|
|
continue |
|
|
|
|
|
|
|
|
is_downbeat = np.round(t, 3) in downbeat_set |
|
|
click = downbeat_click if is_downbeat else beat_click |
|
|
|
|
|
|
|
|
end_idx = min(sample_idx + len(click), total_samples) |
|
|
click_len = end_idx - sample_idx |
|
|
output[sample_idx:end_idx] += click[:click_len] |
|
|
|
|
|
|
|
|
beat_set = set(np.round(beat_times, 3)) |
|
|
for t in downbeat_times: |
|
|
if np.round(t, 3) in beat_set: |
|
|
continue |
|
|
|
|
|
sample_idx = int(t * sr) |
|
|
if sample_idx < 0 or sample_idx >= total_samples: |
|
|
continue |
|
|
|
|
|
end_idx = min(sample_idx + len(downbeat_click), total_samples) |
|
|
click_len = end_idx - sample_idx |
|
|
output[sample_idx:end_idx] += downbeat_click[:click_len] |
|
|
|
|
|
return output |
|
|
|
|
|
|
|
|
def mix_audio(
    audio: np.ndarray,
    click_track: np.ndarray,
    click_volume: float = 0.5,
) -> np.ndarray:
    """
    Mix original audio with a click track.

    Both signals are zero-padded to the longer of the two lengths. The audio
    is peak-normalized to 0.8 and the clicks to ``0.8 * click_volume`` (an
    all-zero signal is left untouched), then the two are summed. If the sum
    peaks above 1.0 it is rescaled to a peak of 0.95.

    Args:
        audio: Original audio waveform
        click_track: Click track to overlay
        click_volume: Volume of clicks relative to audio (0.0 to 1.0);
            the value is not validated

    Returns:
        Mixed audio as a float32 array; empty when both inputs are empty
    """
    max_len = max(len(audio), len(click_track))
    if max_len == 0:
        # .max() on a zero-size array raises, so handle "nothing to mix" here.
        return np.zeros(0, dtype=np.float32)

    audio_padded = np.zeros(max_len, dtype=np.float32)
    click_padded = np.zeros(max_len, dtype=np.float32)
    audio_padded[: len(audio)] = audio
    click_padded[: len(click_track)] = click_track

    # Peak-normalize each signal independently; the 0.8 target leaves
    # headroom before the two are summed.
    audio_max = np.abs(audio_padded).max()
    if audio_max > 0:
        audio_padded = audio_padded / audio_max * 0.8

    click_max = np.abs(click_padded).max()
    if click_max > 0:
        click_padded = click_padded / click_max * click_volume * 0.8

    mixed = audio_padded + click_padded

    # Only rescale when the sum would actually exceed full scale.
    max_val = np.abs(mixed).max()
    if max_val > 1.0:
        mixed = mixed / max_val * 0.95

    return mixed.astype(np.float32)
|
|
|
|
|
|
|
|
def create_comparison_audio(
    audio: np.ndarray,
    pred_beats: list[float],
    pred_downbeats: list[float],
    gt_beats: list[float],
    gt_downbeats: list[float],
    sr: int = 16000,
    click_volume: float = 0.5,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Build three click-annotated versions of ``audio`` for listening comparison.

    Predicted events are rendered with 1000 Hz beat / 1500 Hz downbeat
    clicks and ground-truth events with 800 Hz beat / 1200 Hz downbeat
    clicks, so the two sets can be told apart by pitch. Both click tracks
    are given the duration of ``audio`` (``len(audio) / sr`` seconds).

    Args:
        audio: Original audio waveform
        pred_beats: Predicted beat times
        pred_downbeats: Predicted downbeat times
        gt_beats: Ground truth beat times
        gt_downbeats: Ground truth downbeat times
        sr: Sample rate
        click_volume: Volume of clicks

    Returns:
        Tuple of (audio_with_pred_clicks, audio_with_gt_clicks, audio_with_both)
    """
    # Settings shared by both click tracks: same length and rate as the audio.
    shared = {"duration": len(audio) / sr, "sr": sr}

    predicted_track = create_click_track(
        pred_beats, pred_downbeats, beat_freq=1000.0, downbeat_freq=1500.0, **shared
    )
    reference_track = create_click_track(
        gt_beats, gt_downbeats, beat_freq=800.0, downbeat_freq=1200.0, **shared
    )

    return (
        mix_audio(audio, predicted_track, click_volume),
        mix_audio(audio, reference_track, click_volume),
        mix_audio(audio, predicted_track + reference_track, click_volume),
    )
|
|
|
|
|
|
|
|
def save_audio( |
|
|
audio: np.ndarray, |
|
|
path: str | Path, |
|
|
sr: int = 16000, |
|
|
) -> None: |
|
|
""" |
|
|
Save audio to a WAV file. |
|
|
|
|
|
Args: |
|
|
audio: Audio waveform |
|
|
path: Output file path |
|
|
sr: Sample rate |
|
|
""" |
|
|
import scipy.io.wavfile as wavfile |
|
|
|
|
|
path = Path(path) |
|
|
path.parent.mkdir(parents=True, exist_ok=True) |
|
|
|
|
|
|
|
|
audio_int16 = (audio * 32767).astype(np.int16) |
|
|
wavfile.write(str(path), sr, audio_int16) |
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Small self-contained demo: synthesize a tone, overlay clicks, save it.
    print("Audio synthesis demo...")

    sr = 16000
    length_seconds = 10.0

    # A beat every half second, with a downbeat every fourth beat (2 s).
    beats = np.arange(0, length_seconds, 0.5).tolist()
    downbeats = np.arange(0, length_seconds, 2.0).tolist()

    # A quiet 220 Hz sine stands in for real music.
    timeline = np.arange(int(length_seconds * sr)) / sr
    tone = np.sin(2 * np.pi * 220 * timeline) * 0.3

    click_layer = create_click_track(beats, downbeats, duration=length_seconds, sr=sr)
    mixed = mix_audio(tone, click_layer, click_volume=0.6)

    print(f"Created mixed audio: {len(mixed)} samples ({len(mixed) / sr:.2f}s)")
    print(f"Beats: {len(beats)}, Downbeats: {len(downbeats)}")

    save_audio(mixed, "/tmp/beat_click_demo.wav", sr=sr)
    print("Saved demo to /tmp/beat_click_demo.wav")
|
|
|