# File size: 8,508 Bytes

# 31bf74c

"""
Audio synthesis utilities for beat tracking evaluation.

This module provides functions to:
- Generate click sounds for beats and downbeats
- Mix click tracks with original audio
- Save audio files with beat annotations

Example usage:
    from exp.data.audio import create_click_track, mix_audio, save_audio

    # Create click track
    clicks = create_click_track(
        beat_times=pred_beats,
        downbeat_times=pred_downbeats,
        duration=30.0,
        sr=16000
    )

    # Mix with original audio
    mixed = mix_audio(original_audio, clicks, click_volume=0.5)

    # Save to file
    save_audio(mixed, "output.wav", sr=16000)
"""

import numpy as np
from pathlib import Path


def generate_click(
    frequency: float = 1000.0,
    duration: float = 0.02,
    sr: int = 16000,
    attack: float = 0.002,
    decay: float = 0.018,
) -> np.ndarray:
    """
    Generate a single click sound: a sine tone shaped by an attack-decay envelope.

    The envelope rises linearly from 0 to 1 over the first ``attack`` seconds
    and falls linearly from 1 to 0 over the last ``decay`` seconds. If either
    ramp is longer than the click itself it is truncated to fit (the attack
    keeps its start, the decay keeps its end), and where the two ramps overlap
    they are multiplied together, so the click always starts and ends at zero
    amplitude without jumps.

    Args:
        frequency: Frequency of the click tone in Hz
        duration: Duration of the click in seconds
        sr: Sample rate
        attack: Attack time in seconds
        decay: Decay time in seconds

    Returns:
        Click waveform as numpy array with ``int(duration * sr)`` samples
        (empty if that is not positive)
    """
    n_samples = max(int(duration * sr), 0)
    t = np.arange(n_samples) / sr

    # Generate sine wave
    wave = np.sin(2 * np.pi * frequency * t)

    # Build the envelope by multiplying ramps into an all-ones array; for
    # non-overlapping ramps this equals plain assignment.
    envelope = np.ones_like(t)
    attack_samples = int(attack * sr)
    decay_samples = int(decay * sr)

    if attack_samples > 0:
        # Keep the head of the ramp so the click still starts at zero even
        # when the attack is longer than the click.
        ramp_up = np.linspace(0, 1, attack_samples)[:n_samples]
        envelope[: len(ramp_up)] *= ramp_up
    if decay_samples > 0 and n_samples > 0:
        # Keep the tail of the ramp so the click still ends at zero even
        # when the decay is longer than the click. (The n_samples guard
        # matters: a "[-0:]" slice would select the whole ramp.)
        ramp_down = np.linspace(1, 0, decay_samples)[-n_samples:]
        envelope[n_samples - len(ramp_down):] *= ramp_down

    return wave * envelope


def create_click_track(
    beat_times: list[float] | np.ndarray,
    downbeat_times: list[float] | np.ndarray | None = None,
    duration: float | None = None,
    sr: int = 16000,
    beat_freq: float = 1000.0,
    downbeat_freq: float = 1500.0,
    click_duration: float = 0.03,
) -> np.ndarray:
    """
    Create a click track from beat and downbeat times.

    Args:
        beat_times: List of beat times in seconds
        downbeat_times: List of downbeat times in seconds (optional)
        duration: Total duration in seconds (auto-detected if None)
        sr: Sample rate
        beat_freq: Frequency for beat clicks (Hz)
        downbeat_freq: Frequency for downbeat clicks (Hz)
        click_duration: Duration of each click in seconds

    Returns:
        Click track as numpy array
    """
    beat_times = np.array(beat_times) if len(beat_times) > 0 else np.array([])
    if downbeat_times is not None:
        downbeat_times = (
            np.array(downbeat_times) if len(downbeat_times) > 0 else np.array([])
        )
    else:
        downbeat_times = np.array([])

    # Determine duration
    if duration is None:
        all_times = np.concatenate([beat_times, downbeat_times])
        if len(all_times) == 0:
            return np.array([])
        duration = float(np.max(all_times)) + 1.0

    # Create output array
    total_samples = int(duration * sr)
    output = np.zeros(total_samples, dtype=np.float32)

    # Generate click templates
    beat_click = generate_click(frequency=beat_freq, duration=click_duration, sr=sr)
    downbeat_click = generate_click(
        frequency=downbeat_freq, duration=click_duration, sr=sr
    )

    # Convert downbeat times to set for fast lookup
    downbeat_set = set(np.round(downbeat_times, 3))

    # Add beat clicks
    for t in beat_times:
        sample_idx = int(t * sr)
        if sample_idx < 0 or sample_idx >= total_samples:
            continue

        # Use downbeat click if this is also a downbeat
        is_downbeat = np.round(t, 3) in downbeat_set
        click = downbeat_click if is_downbeat else beat_click

        # Add click to output
        end_idx = min(sample_idx + len(click), total_samples)
        click_len = end_idx - sample_idx
        output[sample_idx:end_idx] += click[:click_len]

    # Add downbeat clicks (for downbeats not already in beats)
    beat_set = set(np.round(beat_times, 3))
    for t in downbeat_times:
        if np.round(t, 3) in beat_set:
            continue  # Already added as beat

        sample_idx = int(t * sr)
        if sample_idx < 0 or sample_idx >= total_samples:
            continue

        end_idx = min(sample_idx + len(downbeat_click), total_samples)
        click_len = end_idx - sample_idx
        output[sample_idx:end_idx] += downbeat_click[:click_len]

    return output


def mix_audio(
    audio: np.ndarray,
    click_track: np.ndarray,
    click_volume: float = 0.5,
) -> np.ndarray:
    """
    Mix original audio with a click track.

    Both signals are zero-padded to the longer of the two lengths. The audio
    is then peak-normalized to 0.8 and the clicks to ``click_volume * 0.8``
    before being summed; if the sum exceeds full scale it is rescaled to a
    peak of 0.95. Silent (all-zero) inputs are left unscaled.

    Args:
        audio: Original audio waveform (one-dimensional)
        click_track: Click track to overlay (one-dimensional)
        click_volume: Volume of clicks relative to audio (0.0 to 1.0)

    Returns:
        Mixed audio as a float32 array; empty if both inputs are empty
    """
    max_len = max(len(audio), len(click_track))
    if max_len == 0:
        # Peak detection below would fail on a zero-size array.
        return np.zeros(0, dtype=np.float32)

    # Ensure same length
    audio_padded = np.zeros(max_len, dtype=np.float32)
    click_padded = np.zeros(max_len, dtype=np.float32)

    audio_padded[: len(audio)] = audio
    click_padded[: len(click_track)] = click_track

    # Normalize audio
    audio_max = np.abs(audio_padded).max()
    if audio_max > 0:
        audio_padded = audio_padded / audio_max * 0.8

    # Normalize clicks
    click_max = np.abs(click_padded).max()
    if click_max > 0:
        click_padded = click_padded / click_max * click_volume * 0.8

    # Mix
    mixed = audio_padded + click_padded

    # Prevent clipping
    max_val = np.abs(mixed).max()
    if max_val > 1.0:
        mixed = mixed / max_val * 0.95

    return mixed.astype(np.float32)


def create_comparison_audio(
    audio: np.ndarray,
    pred_beats: list[float],
    pred_downbeats: list[float],
    gt_beats: list[float],
    gt_downbeats: list[float],
    sr: int = 16000,
    click_volume: float = 0.5,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Build three listening mixes: audio with predicted clicks, with ground-truth clicks, and with both.

    Predicted beats/downbeats click at 1000/1500 Hz and ground-truth
    beats/downbeats at 800/1200 Hz, so the two annotation sets can be told
    apart by ear in the combined mix.

    Args:
        audio: Original audio waveform
        pred_beats: Predicted beat times
        pred_downbeats: Predicted downbeat times
        gt_beats: Ground truth beat times
        gt_downbeats: Ground truth downbeat times
        sr: Sample rate
        click_volume: Volume of clicks

    Returns:
        Tuple of (audio_with_pred_clicks, audio_with_gt_clicks, audio_with_both)
    """
    total_seconds = len(audio) / sr

    def _render(beats, downbeats, beat_hz, downbeat_hz):
        # Every click layer spans the full length of the audio.
        return create_click_track(
            beats,
            downbeats,
            duration=total_seconds,
            sr=sr,
            beat_freq=beat_hz,
            downbeat_freq=downbeat_hz,
        )

    predicted_layer = _render(pred_beats, pred_downbeats, 1000.0, 1500.0)
    # Ground truth uses lower pitches than the prediction
    reference_layer = _render(gt_beats, gt_downbeats, 800.0, 1200.0)

    layers = (predicted_layer, reference_layer, predicted_layer + reference_layer)
    audio_pred, audio_gt, audio_both = (
        mix_audio(audio, layer, click_volume) for layer in layers
    )
    return audio_pred, audio_gt, audio_both


def save_audio(
    audio: np.ndarray,
    path: str | Path,
    sr: int = 16000,
) -> None:
    """
    Save audio to a WAV file.

    Args:
        audio: Audio waveform
        path: Output file path
        sr: Sample rate
    """
    import scipy.io.wavfile as wavfile

    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Convert to int16
    audio_int16 = (audio * 32767).astype(np.int16)
    wavfile.write(str(path), sr, audio_int16)


if __name__ == "__main__":
    # Demo: overlay a click track on a synthetic tone and write it to disk
    print("Audio synthesis demo...")

    demo_sr = 16000
    demo_seconds = 10.0

    # Beat grid: one beat every 0.5s, one downbeat every 2s
    beat_grid = np.arange(0, demo_seconds, 0.5).tolist()
    downbeat_grid = np.arange(0, demo_seconds, 2.0).tolist()

    # Stand-in "music": a quiet 220 Hz sine tone
    timeline = np.arange(int(demo_seconds * demo_sr)) / demo_sr
    tone = 0.3 * np.sin(2 * np.pi * 220 * timeline)

    # Render the clicks and blend them with the tone
    click_layer = create_click_track(
        beat_grid, downbeat_grid, duration=demo_seconds, sr=demo_sr
    )
    demo_mix = mix_audio(tone, click_layer, click_volume=0.6)

    print(
        f"Created mixed audio: {len(demo_mix)} samples ({len(demo_mix) / demo_sr:.2f}s)"
    )
    print(f"Beats: {len(beat_grid)}, Downbeats: {len(downbeat_grid)}")

    # Write the result
    save_audio(demo_mix, "/tmp/beat_click_demo.wav", sr=demo_sr)
    print("Saved demo to /tmp/beat_click_demo.wav")