Spaces:

frascuchon
/

music-mcp

Running on CPU Upgrade

File size: 8,869 Bytes

import os
import subprocess
import tempfile
from pathlib import Path
from typing import Tuple

import librosa
import numpy as np
import soundfile as sf


def _load_audio(audio_path: str, mono: bool = False) -> Tuple[np.ndarray, float]:
    """
    Decode an audio file with librosa at its native sample rate.

    Args:
        audio_path: Path to audio file or URL
        mono: When True, ask librosa to return a single mixed-down channel;
            when False (default), keep the file's channels

    Returns:
        Tuple of (audio_data, sample_rate) exactly as returned by librosa.load
    """
    # sr=None asks librosa not to resample to a fixed target rate.
    samples, native_rate = librosa.load(
        audio_path,
        sr=None,
        mono=mono,
        res_type="soxr_vhq",
    )
    return samples, native_rate


def estimate_key(audio_path: str) -> str:
    """
    Estimate the musical key of an audio file from its chroma features.

    The audio is loaded with librosa.load's default sample rate and channel
    handling (only the resampling quality is overridden), a constant-Q
    chromagram is computed, each chroma bin is averaged over time, and the
    name of the strongest bin is returned.

    Args:
        audio_path: Path to audio file or URL (supports common formats: WAV, MP3, FLAC)

    Returns:
        Estimated key as string (one of 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#',
        'G', 'G#', 'A', 'A#', 'B')

    Raises:
        RuntimeError: If loading or analysis fails; the original exception is
            attached as ``__cause__``

    Note:
        Uses medium quality resampling for faster analysis
        The result is the pitch class with the highest average chroma energy;
        no major/minor mode is estimated
        Most accurate for music with clear harmonic content
        May be less accurate for atonal or highly percussive music
    """
    try:
        y, sr = librosa.load(
            audio_path, res_type="soxr_mq"
        )  # Medium quality for faster processing

        # Extract chroma features
        chroma = librosa.feature.chroma_cqt(y=y, sr=sr)

        # Average every chroma bin over time and pick the strongest one
        chroma_mean = np.mean(chroma, axis=1)
        key_index = np.argmax(chroma_mean)

        # Map the bin index onto sharp-spelled key names, starting at C
        keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
        return keys[key_index]

    except Exception as e:
        # Chain the cause so the underlying failure stays visible to callers.
        raise RuntimeError(f"Error estimating key: {str(e)}") from e


def key_to_semitones(key: str, target_key: str = "C") -> int:
    """
    Return the shortest signed semitone shift that moves ``key`` onto ``target_key``.

    Args:
        key: Name of the starting key, sharp-spelled (e.g. 'C', 'F#')
        target_key: Name of the key to reach (default: 'C')

    Returns:
        Shift in semitones, between -5 and 6 inclusive (positive means upward)

    Raises:
        ValueError: If either name is not one of the twelve sharp-spelled keys
    """
    chromatic_scale = (
        "C", "C#", "D", "D#", "E", "F",
        "F#", "G", "G#", "A", "A#", "B",
    )

    both_known = key in chromatic_scale and target_key in chromatic_scale
    if not both_known:
        raise ValueError("Invalid key name")

    # Upward distance from source to target, folded into 0..11
    upward = (chromatic_scale.index(target_key) - chromatic_scale.index(key)) % 12

    # Anything above a tritone is a shorter trip going downward
    return upward - 12 if upward > 6 else upward


def align_songs_by_key(
    audio1_path: str,
    audio2_path: str,
    target_key: str = "C",
    output_path: str = "output",
    output_format: str = "wav",
) -> Tuple[str, str]:
    """
    Align two songs to the same musical key by pitch shifting.

    Each file is handed to ``shift_to_key`` independently, which estimates its
    key, computes the semitone shift, loads the audio and writes the result.

    Args:
        audio1_path: Path to first audio file (supports WAV, MP3, FLAC)
        audio2_path: Path to second audio file (supports WAV, MP3, FLAC)
        target_key: Target key to align both songs to (default: 'C')
        output_path: Directory to save the aligned audio files
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Tuple of (aligned_audio1_path, aligned_audio2_path) - paths to processed files

    Raises:
        RuntimeError: If processing either file fails; the original exception
            is attached as ``__cause__``

    Note:
        Output names are derived from each input's file stem, so two inputs
        that share a stem resolve to the same output path.
    """
    try:
        # No audio is loaded here: shift_to_key performs its own loading, so
        # decoding the files in this function would only duplicate that work.
        aligned1_path = shift_to_key(
            audio1_path, target_key, output_path, output_format
        )
        aligned2_path = shift_to_key(
            audio2_path, target_key, output_path, output_format
        )

        return aligned1_path, aligned2_path

    except Exception as e:
        raise RuntimeError(f"Error aligning audio keys: {str(e)}") from e


def shift_to_key(
    audio_path: str,
    target_key: str,
    output_path: str = "output",
    output_format: str = "wav",
) -> str:
    """
    Shift an audio file to a specific musical key.

    The current key is estimated with ``estimate_key``, the semitone offset to
    ``target_key`` is computed with ``key_to_semitones``, and the audio is
    pitch-shifted by that amount and written to
    ``<output_path>/<input stem>_shifted_to_<target_key>.<output_format>``.

    Args:
        audio_path: Path to audio file or URL (supports WAV, MP3, FLAC)
        target_key: Target key to shift to
        output_path: Directory to save the shifted audio file (created if missing)
        output_format: Output format ('wav' or 'mp3', default: 'wav');
            matched case-insensitively. 'mp3' is produced by converting an
            intermediate WAV with the ``ffmpeg`` executable; any other value
            is written as 16-bit PCM WAV data

    Returns:
        Path to the pitch-shifted audio file

    Raises:
        RuntimeError: If any step fails (key estimation, invalid key name,
            loading, shifting, writing or ffmpeg conversion); the original
            exception is attached as ``__cause__``
    """
    try:
        # Estimate current key
        current_key = estimate_key(audio_path)

        # Calculate semitone shift
        semitones = key_to_semitones(current_key, target_key)

        # Load and shift audio
        y, sr = _load_audio(audio_path)
        y_shifted = librosa.effects.pitch_shift(
            y, n_steps=semitones, scale=True, sr=sr, res_type="soxr_vhq"
        )

        # Build the destination path inside the output directory
        audio_filename = Path(audio_path).stem
        os.makedirs(output_path, exist_ok=True)

        # Multi-channel data is transposed before writing: soundfile takes
        # 2-D input as (frames, channels).
        if y_shifted.ndim == 2:
            y_shifted = y_shifted.T

        fmt = output_format.lower()
        final_audio_path = os.path.join(
            output_path,
            f"{audio_filename}_shifted_to_{target_key}.{fmt}",
        )

        if fmt == "mp3":
            # For MP3, save as WAV first then convert. Only reserve the name
            # here and close the handle right away, so the file is not held
            # open while soundfile and ffmpeg access it by path.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
                temp_wav_path = temp_wav.name

            try:
                sf.write(temp_wav_path, y_shifted, sr, format="wav", subtype="PCM_16")

                # Convert to MP3 using ffmpeg
                cmd = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    temp_wav_path,
                    "-c:a",
                    "libmp3lame",
                    "-b:a",
                    "192k",
                    final_audio_path,
                ]
                subprocess.run(cmd, capture_output=True, check=True)
            finally:
                # Remove the intermediate WAV even when writing or ffmpeg fails
                os.unlink(temp_wav_path)
        else:
            sf.write(final_audio_path, y_shifted, sr, format="wav", subtype="PCM_16")

        return final_audio_path

    except Exception as e:
        # Chain the cause so the underlying failure stays visible to callers.
        raise RuntimeError(f"Error shifting key: {str(e)}") from e


if __name__ == "__main__":
    # Command-line entry point exposing three sub-commands:
    #   estimate <audio>                      print the estimated key
    #   align <audio1> <audio2> [options]     shift both files to one key
    #   shift <audio> <target_key> [options]  shift a single file
    import argparse

    parser = argparse.ArgumentParser(
        description="Pitch alignment tools for audio files"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Estimate key of a single file
    estimate_parser = subparsers.add_parser(
        "estimate", help="Estimate the key of an audio file"
    )
    estimate_parser.add_argument("audio", help="Path to audio file")

    # Align two songs by key
    align_parser = subparsers.add_parser("align", help="Align two songs to same key")
    align_parser.add_argument("audio1", help="Path to first audio file")
    align_parser.add_argument("audio2", help="Path to second audio file")
    align_parser.add_argument(
        "--target-key", default="C", help="Target key to align to (default: C)"
    )
    align_parser.add_argument(
        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
    )

    # Shift single file to key
    shift_parser = subparsers.add_parser("shift", help="Shift audio to specific key")
    shift_parser.add_argument("audio", help="Path to audio file")
    shift_parser.add_argument("target_key", help="Target key to shift to")
    shift_parser.add_argument(
        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
    )

    args = parser.parse_args()

    try:
        if args.command == "estimate":
            key = estimate_key(args.audio)
            print(f"Estimated key: {key}")
        elif args.command == "align":
            aligned1, aligned2 = align_songs_by_key(
                args.audio1, args.audio2, args.target_key, output_format=args.format
            )
            print(f"Aligned audio 1: {aligned1}")
            print(f"Aligned audio 2: {aligned2}")
        elif args.command == "shift":
            output = shift_to_key(
                args.audio, args.target_key, output_format=args.format
            )
            print(f"Shifted audio saved to: {output}")
        else:
            # No sub-command given
            parser.print_help()
    except Exception as e:
        print(f"Error: {e}")
        # Re-raise so the traceback is shown and the interpreter exits with a
        # non-zero status; nothing placed after this line would ever run.
        raise