JacobLinCool's picture
Upload folder using huggingface_hub
31bf74c unverified
"""
Audio synthesis utilities for beat tracking evaluation.

This module provides functions to:
- Generate click sounds for beats and downbeats
- Mix click tracks with original audio
- Save audio files with beat annotations

Example usage:
    from exp.data.audio import create_click_track, mix_audio, save_audio

    # Create click track
    clicks = create_click_track(
        beat_times=pred_beats,
        downbeat_times=pred_downbeats,
        duration=30.0,
        sr=16000,
    )

    # Mix with original audio
    mixed = mix_audio(original_audio, clicks, click_volume=0.5)

    # Save to file
    save_audio(mixed, "output.wav", sr=16000)
"""
import numpy as np
from pathlib import Path
def generate_click(
    frequency: float = 1000.0,
    duration: float = 0.02,
    sr: int = 16000,
    attack: float = 0.002,
    decay: float = 0.018,
) -> np.ndarray:
    """
    Generate a single click sound.

    The click is a sine tone shaped by a linear attack-decay envelope: the
    gain ramps from 0 to 1 over the first ``attack`` seconds and from 1 to 0
    over the last ``decay`` seconds.

    Args:
        frequency: Frequency of the click tone in Hz
        duration: Duration of the click in seconds
        sr: Sample rate
        attack: Attack time in seconds. An attack longer than the click is
            shortened to the click length.
        decay: Decay time in seconds. The decay ramp is only applied when it
            is strictly shorter than the click.

    Returns:
        Click waveform as numpy array with ``int(duration * sr)`` samples
        (empty when that count is not positive)
    """
    t = np.arange(int(duration * sr)) / sr
    n_samples = len(t)

    # Generate sine wave
    wave = np.sin(2 * np.pi * frequency * t)

    # Apply envelope (attack-decay)
    envelope = np.ones_like(t)

    # Clamp the attack to the click length: assigning a longer ramp into the
    # shorter envelope slice would raise a broadcasting ValueError.
    attack_samples = min(int(attack * sr), n_samples)
    decay_samples = int(decay * sr)

    if attack_samples > 0:
        envelope[:attack_samples] = np.linspace(0, 1, attack_samples)

    # The decay is written after the attack, so where the two ramps overlap
    # the decay values win.
    if 0 < decay_samples < n_samples:
        envelope[n_samples - decay_samples:] = np.linspace(1, 0, decay_samples)

    return wave * envelope
def create_click_track(
beat_times: list[float] | np.ndarray,
downbeat_times: list[float] | np.ndarray | None = None,
duration: float | None = None,
sr: int = 16000,
beat_freq: float = 1000.0,
downbeat_freq: float = 1500.0,
click_duration: float = 0.03,
) -> np.ndarray:
"""
Create a click track from beat and downbeat times.
Args:
beat_times: List of beat times in seconds
downbeat_times: List of downbeat times in seconds (optional)
duration: Total duration in seconds (auto-detected if None)
sr: Sample rate
beat_freq: Frequency for beat clicks (Hz)
downbeat_freq: Frequency for downbeat clicks (Hz)
click_duration: Duration of each click in seconds
Returns:
Click track as numpy array
"""
beat_times = np.array(beat_times) if len(beat_times) > 0 else np.array([])
if downbeat_times is not None:
downbeat_times = (
np.array(downbeat_times) if len(downbeat_times) > 0 else np.array([])
)
else:
downbeat_times = np.array([])
# Determine duration
if duration is None:
all_times = np.concatenate([beat_times, downbeat_times])
if len(all_times) == 0:
return np.array([])
duration = float(np.max(all_times)) + 1.0
# Create output array
total_samples = int(duration * sr)
output = np.zeros(total_samples, dtype=np.float32)
# Generate click templates
beat_click = generate_click(frequency=beat_freq, duration=click_duration, sr=sr)
downbeat_click = generate_click(
frequency=downbeat_freq, duration=click_duration, sr=sr
)
# Convert downbeat times to set for fast lookup
downbeat_set = set(np.round(downbeat_times, 3))
# Add beat clicks
for t in beat_times:
sample_idx = int(t * sr)
if sample_idx < 0 or sample_idx >= total_samples:
continue
# Use downbeat click if this is also a downbeat
is_downbeat = np.round(t, 3) in downbeat_set
click = downbeat_click if is_downbeat else beat_click
# Add click to output
end_idx = min(sample_idx + len(click), total_samples)
click_len = end_idx - sample_idx
output[sample_idx:end_idx] += click[:click_len]
# Add downbeat clicks (for downbeats not already in beats)
beat_set = set(np.round(beat_times, 3))
for t in downbeat_times:
if np.round(t, 3) in beat_set:
continue # Already added as beat
sample_idx = int(t * sr)
if sample_idx < 0 or sample_idx >= total_samples:
continue
end_idx = min(sample_idx + len(downbeat_click), total_samples)
click_len = end_idx - sample_idx
output[sample_idx:end_idx] += downbeat_click[:click_len]
return output
def mix_audio(
    audio: np.ndarray,
    click_track: np.ndarray,
    click_volume: float = 0.5,
) -> np.ndarray:
    """
    Mix original audio with a click track.

    Both signals are zero-padded to the longer length. The audio is
    peak-normalized to 0.8 and the clicks to ``click_volume * 0.8`` (an
    all-zero signal is left untouched). If the sum exceeds 1.0 in absolute
    value, the mix is rescaled to a peak of 0.95.

    Args:
        audio: Original audio waveform
        click_track: Click track to overlay
        click_volume: Volume of clicks relative to audio (0.0 to 1.0)

    Returns:
        Mixed audio as a float32 array; empty when both inputs are empty
    """
    # Ensure same length
    max_len = max(len(audio), len(click_track))
    if max_len == 0:
        # .max() on a zero-size array raises ValueError; nothing to mix.
        return np.zeros(0, dtype=np.float32)

    audio_padded = np.zeros(max_len, dtype=np.float32)
    click_padded = np.zeros(max_len, dtype=np.float32)
    audio_padded[: len(audio)] = audio
    click_padded[: len(click_track)] = click_track

    # Normalize audio
    audio_max = np.abs(audio_padded).max()
    if audio_max > 0:
        audio_padded = audio_padded / audio_max * 0.8

    # Normalize clicks
    click_max = np.abs(click_padded).max()
    if click_max > 0:
        click_padded = click_padded / click_max * click_volume * 0.8

    # Mix
    mixed = audio_padded + click_padded

    # Prevent clipping
    max_val = np.abs(mixed).max()
    if max_val > 1.0:
        mixed = mixed / max_val * 0.95

    return mixed.astype(np.float32)
def create_comparison_audio(
    audio: np.ndarray,
    pred_beats: list[float],
    pred_downbeats: list[float],
    gt_beats: list[float],
    gt_downbeats: list[float],
    sr: int = 16000,
    click_volume: float = 0.5,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Build three mixes of the audio for listening comparison.

    Prediction clicks use 1000 Hz beats / 1500 Hz downbeats, ground-truth
    clicks use 800 Hz beats / 1200 Hz downbeats, so the two can be told apart
    when both are overlaid.

    Args:
        audio: Original audio waveform
        pred_beats: Predicted beat times
        pred_downbeats: Predicted downbeat times
        gt_beats: Ground truth beat times
        gt_downbeats: Ground truth downbeat times
        sr: Sample rate
        click_volume: Volume of clicks

    Returns:
        Tuple of (audio_with_pred_clicks, audio_with_gt_clicks, audio_with_both)
    """
    # Both click tracks span exactly the length of the audio.
    track_args = {"duration": len(audio) / sr, "sr": sr}

    predicted = create_click_track(
        pred_beats,
        pred_downbeats,
        beat_freq=1000.0,
        downbeat_freq=1500.0,
        **track_args,
    )
    # Ground truth gets lower-pitched clicks than the predictions.
    reference = create_click_track(
        gt_beats,
        gt_downbeats,
        beat_freq=800.0,
        downbeat_freq=1200.0,
        **track_args,
    )

    overlays = (predicted, reference, predicted + reference)
    with_pred, with_gt, with_both = (
        mix_audio(audio, overlay, click_volume) for overlay in overlays
    )
    return with_pred, with_gt, with_both
def save_audio(
audio: np.ndarray,
path: str | Path,
sr: int = 16000,
) -> None:
"""
Save audio to a WAV file.
Args:
audio: Audio waveform
path: Output file path
sr: Sample rate
"""
import scipy.io.wavfile as wavfile
path = Path(path)
path.parent.mkdir(parents=True, exist_ok=True)
# Convert to int16
audio_int16 = (audio * 32767).astype(np.int16)
wavfile.write(str(path), sr, audio_int16)
if __name__ == "__main__":
    # Demo: overlay a metronome on a synthetic tone and write it to disk.
    print("Audio synthesis demo...")

    sr = 16000
    duration = 10.0

    # Stand-in "music": a quiet 220 Hz sine covering the whole duration
    timeline = np.arange(int(duration * sr)) / sr
    music = 0.3 * np.sin(2 * np.pi * 220 * timeline)

    # A beat every 0.5 s; every fourth beat (every 2 s) is a downbeat
    beats = np.arange(0, duration, 0.5).tolist()
    downbeats = np.arange(0, duration, 2.0).tolist()

    # Render the clicks and blend them with the tone
    clicks = create_click_track(beats, downbeats, duration=duration, sr=sr)
    mixed = mix_audio(music, clicks, click_volume=0.6)

    print(f"Created mixed audio: {len(mixed)} samples ({len(mixed) / sr:.2f}s)")
    print(f"Beats: {len(beats)}, Downbeats: {len(downbeats)}")

    # Write the result
    save_audio(mixed, "/tmp/beat_click_demo.wav", sr=sr)
    print("Saved demo to /tmp/beat_click_demo.wav")