"""Key estimation and pitch-shifting helpers for aligning audio files by key."""

import argparse
import os
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Optional, Sequence, Tuple

import librosa
import numpy as np
import soundfile as sf

# Pitch-class names in chroma-bin order (index 0 is C); sharps only.
_KEY_NAMES = ("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B")

# Semitone offset of each natural note above C.
_NATURAL_SEMITONES = {"C": 0, "D": 2, "E": 4, "F": 5, "G": 7, "A": 9, "B": 11}

# Offset contributed by the (optional) accidental that follows the note letter.
_ACCIDENTAL_OFFSETS = {"": 0, "#": 1, "b": -1}

# Output containers shift_to_key knows how to write.
_SUPPORTED_FORMATS = ("wav", "mp3")


def _load_audio(audio_path: str, mono: bool = False) -> Tuple[np.ndarray, float]:
    """
    Load an audio file at its native sample rate.

    Args:
        audio_path: Path to audio file or URL
        mono: Whether to mix down to mono (default: False, keep all channels)

    Returns:
        Tuple of (audio_data, sample_rate)
    """
    # sr=None keeps the file's own sample rate instead of resampling.
    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
    return y, sr


def estimate_key(audio_path: str) -> str:
    """
    Estimate the dominant pitch class of an audio file from its chroma features.

    The audio is loaded with librosa's default sample rate and mono mix-down
    (medium-quality "soxr_mq" resampling, for speed), a CQT chromagram is
    computed, and the pitch class with the highest mean energy over time is
    returned. This is a simple tonic heuristic: it does not distinguish major
    from minor and can be misled by atonal or highly percussive material.

    Args:
        audio_path: Path to audio file or URL

    Returns:
        One of 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'

    Raises:
        RuntimeError: If the file cannot be loaded or analysed (the original
            exception is chained as the cause).
    """
    try:
        y, sr = librosa.load(audio_path, res_type="soxr_mq")

        # Average each of the 12 chroma bins over time and pick the strongest.
        chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
        chroma_mean = np.mean(chroma, axis=1)
        return _KEY_NAMES[int(np.argmax(chroma_mean))]
    except Exception as e:
        raise RuntimeError(f"Error estimating key: {str(e)}") from e


def _key_index(key: str) -> int:
    """Return the pitch-class index (0-11, C == 0) for a key name.

    Accepts a note letter in either case, optionally followed by '#' or 'b'
    (e.g. 'C', 'f#', 'Bb'). Surrounding whitespace is ignored.

    Raises:
        ValueError: If the name is not a recognisable key.
    """
    if not isinstance(key, str):
        raise ValueError("Invalid key name")
    name = key.strip()
    if not name:
        raise ValueError("Invalid key name")
    # Only the letter is case-folded: a trailing lower-case 'b' means "flat".
    letter, accidental = name[0].upper(), name[1:]
    if letter not in _NATURAL_SEMITONES or accidental not in _ACCIDENTAL_OFFSETS:
        raise ValueError("Invalid key name")
    return (_NATURAL_SEMITONES[letter] + _ACCIDENTAL_OFFSETS[accidental]) % 12


def key_to_semitones(key: str, target_key: str = "C") -> int:
    """
    Calculate the smallest semitone shift that moves one key onto another.

    Args:
        key: Source key (e.g. 'A', 'F#', 'Bb')
        target_key: Target key to align to

    Returns:
        Number of semitones to shift, in the range -5..6 (a tritone is
        reported as +6)

    Raises:
        ValueError: If either key name is not recognised.
    """
    source_index = _key_index(key)
    target_index = _key_index(target_key)

    # Wrap into 0..11, then prefer shifting down when that is the shorter path.
    semitones = (target_index - source_index) % 12
    if semitones > 6:
        semitones -= 12
    return semitones


def align_songs_by_key(
    audio1_path: str,
    audio2_path: str,
    target_key: str = "C",
    output_path: str = "output",
    output_format: str = "wav",
) -> Tuple[str, str]:
    """
    Align two songs to the same musical key by pitch shifting.

    Each file is processed independently by shift_to_key, which estimates its
    key, computes the shift and writes the result. Output names are derived
    from the input file stems, so two inputs with the same stem will write to
    the same output path.

    Args:
        audio1_path: Path to first audio file
        audio2_path: Path to second audio file
        target_key: Target key to align both songs to (default: 'C')
        output_path: Directory to save the aligned audio files
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Tuple of (aligned_audio1_path, aligned_audio2_path) - paths to processed files

    Raises:
        RuntimeError: If either file cannot be processed.
    """
    try:
        aligned1_path = shift_to_key(
            audio1_path, target_key, output_path, output_format
        )
        aligned2_path = shift_to_key(
            audio2_path, target_key, output_path, output_format
        )
        return aligned1_path, aligned2_path
    except Exception as e:
        raise RuntimeError(f"Error aligning audio keys: {str(e)}") from e


def _write_mp3(mp3_path: str, samples: np.ndarray, sample_rate: float) -> None:
    """Encode samples to a 192 kbit/s MP3 by way of a temporary WAV and ffmpeg."""
    # mkstemp + close: the file must not be held open while soundfile and
    # ffmpeg access it by name (that fails on Windows).
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(wav_path, samples, sample_rate, format="wav", subtype="PCM_16")
        cmd = [
            "ffmpeg",
            "-y",
            "-i",
            wav_path,
            "-c:a",
            "libmp3lame",
            "-b:a",
            "192k",
            mp3_path,
        ]
        try:
            subprocess.run(cmd, capture_output=True, check=True)
        except subprocess.CalledProcessError as e:
            # Surface ffmpeg's own diagnostics; they are otherwise swallowed
            # by capture_output.
            stderr = e.stderr.decode(errors="replace").strip() if e.stderr else ""
            raise RuntimeError(
                f"ffmpeg failed with exit code {e.returncode}: {stderr}"
            ) from e
    finally:
        # Always remove the intermediate WAV, even when writing/encoding failed.
        try:
            os.unlink(wav_path)
        except OSError:
            pass


def shift_to_key(
    audio_path: str,
    target_key: str,
    output_path: str = "output",
    output_format: str = "wav",
) -> str:
    """
    Shift an audio file to a specific musical key.

    The current key is estimated with estimate_key, the audio is pitch shifted
    by the smallest interval that reaches target_key, and the result is written
    as 16-bit PCM WAV or (through ffmpeg) MP3 to
    "<output_path>/<input stem>_shifted_to_<target_key>.<format>".

    Args:
        audio_path: Path to audio file or URL
        target_key: Target key to shift to
        output_path: Directory to save the shifted audio file (created if missing)
        output_format: Output format ('wav' or 'mp3', case-insensitive, default: 'wav')

    Returns:
        Path to the pitch-shifted audio file

    Raises:
        RuntimeError: On any failure (unsupported format, invalid key, load,
            shift, write or ffmpeg error); the original exception is chained.
    """
    try:
        fmt = output_format.lower()
        # Reject unknown formats up front rather than writing WAV data under
        # a misleading extension.
        if fmt not in _SUPPORTED_FORMATS:
            raise ValueError(
                f"Unsupported output format: {output_format!r} "
                "(expected 'wav' or 'mp3')"
            )

        current_key = estimate_key(audio_path)
        semitones = key_to_semitones(current_key, target_key)

        y, sr = _load_audio(audio_path)
        # Already in the target key: skip the shift so the audio is not
        # needlessly reprocessed.
        if semitones:
            y = librosa.effects.pitch_shift(
                y, n_steps=semitones, scale=True, sr=sr, res_type="soxr_vhq"
            )

        # Multi-channel audio is transposed so channels become the last axis
        # before it is handed to soundfile.
        if y.ndim == 2:
            y = y.T

        os.makedirs(output_path, exist_ok=True)
        final_audio_path = os.path.join(
            output_path,
            f"{Path(audio_path).stem}_shifted_to_{target_key}.{fmt}",
        )

        if fmt == "mp3":
            _write_mp3(final_audio_path, y, sr)
        else:
            sf.write(final_audio_path, y, sr, format="wav", subtype="PCM_16")

        return final_audio_path
    except Exception as e:
        raise RuntimeError(f"Error shifting key: {str(e)}") from e


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with the estimate/align/shift subcommands."""
    parser = argparse.ArgumentParser(
        description="Pitch alignment tools for audio files"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Estimate key of a single file
    estimate_parser = subparsers.add_parser(
        "estimate", help="Estimate the key of an audio file"
    )
    estimate_parser.add_argument("audio", help="Path to audio file")

    # Align two songs by key
    align_parser = subparsers.add_parser("align", help="Align two songs to same key")
    align_parser.add_argument("audio1", help="Path to first audio file")
    align_parser.add_argument("audio2", help="Path to second audio file")
    align_parser.add_argument(
        "--target-key", default="C", help="Target key to align to (default: C)"
    )
    align_parser.add_argument(
        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
    )

    # Shift single file to key
    shift_parser = subparsers.add_parser("shift", help="Shift audio to specific key")
    shift_parser.add_argument("audio", help="Path to audio file")
    shift_parser.add_argument("target_key", help="Target key to shift to")
    shift_parser.add_argument(
        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
    )
    return parser


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the command-line interface.

    Args:
        argv: Argument list to parse; defaults to sys.argv[1:] when None.

    Returns:
        Process exit code: 0 on success (or when only help is printed),
        1 if the selected command failed.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    try:
        if args.command == "estimate":
            key = estimate_key(args.audio)
            print(f"Estimated key: {key}")
        elif args.command == "align":
            aligned1, aligned2 = align_songs_by_key(
                args.audio1, args.audio2, args.target_key, output_format=args.format
            )
            print(f"Aligned audio 1: {aligned1}")
            print(f"Aligned audio 2: {aligned2}")
        elif args.command == "shift":
            output = shift_to_key(
                args.audio, args.target_key, output_format=args.format
            )
            print(f"Shifted audio saved to: {output}")
        else:
            parser.print_help()
    except Exception as e:
        # Top-level boundary: report on stderr and signal failure via exit code.
        print(f"Error: {e}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())