File size: 8,869 Bytes
801ea60
0069183
 
 
801ea60
 
 
 
 
 
 
 
 
 
 
 
14e5437
801ea60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14e5437
801ea60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0069183
801ea60
 
 
 
 
0069183
 
801ea60
 
0069183
801ea60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0069183
 
 
 
 
 
801ea60
 
 
 
 
 
 
0069183
 
 
 
 
 
801ea60
 
 
 
14e5437
801ea60
 
0069183
801ea60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0069183
801ea60
 
 
 
 
 
0069183
 
801ea60
0069183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
801ea60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0069183
 
 
801ea60
 
 
 
 
0069183
 
 
801ea60
 
 
 
 
 
 
 
 
0069183
801ea60
 
 
 
0069183
 
 
801ea60
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Tuple

import librosa
import numpy as np
import soundfile as sf


def _load_audio(audio_path: str, mono: bool = False) -> Tuple[np.ndarray, float]:
    """
    Decode an audio file with librosa without changing its sample rate.

    ``sr=None`` is passed to ``librosa.load`` so the file's own sample rate is
    kept, and ``mono`` is forwarded unchanged, so with the default
    (``mono=False``) the channel layout of the file is preserved rather than
    mixed down.

    Args:
        audio_path: Path to audio file or URL
        mono: Mix down to a single channel when True (default: False)

    Returns:
        Tuple of (audio_data, sample_rate)
    """
    return librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")


def estimate_key(audio_path: str) -> str:
    """
    Estimate the tonal centre of an audio file from its chroma features.

    The audio is loaded, a constant-Q chromagram is computed, the chroma energy
    is averaged over time, and the pitch class with the highest mean energy is
    returned. No major/minor key profiles are applied, so the result is the
    dominant pitch class rather than a full key/mode estimate.

    Args:
        audio_path: Path to audio file or URL (supports common formats: WAV, MP3, FLAC)

    Returns:
        Estimated key as string, one of
        'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'

    Raises:
        RuntimeError: If loading or analysis fails for any reason. The original
            exception is attached as ``__cause__``.

    Note:
        Uses medium quality resampling ("soxr_mq") for faster analysis
        Most accurate for music with clear harmonic content
        May be less accurate for atonal or highly percussive music
    """
    try:
        # Medium quality resampling is enough for analysis and is faster
        # than the very-high-quality setting used when rendering output.
        y, sr = librosa.load(audio_path, res_type="soxr_mq")

        # One row per pitch class, one column per analysis frame
        chroma = librosa.feature.chroma_cqt(y=y, sr=sr)

        # Average over time and pick the strongest pitch class
        chroma_mean = np.mean(chroma, axis=1)
        key_index = int(np.argmax(chroma_mean))

        # Chroma rows are ordered starting at C
        keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
        return keys[key_index]

    except Exception as e:
        # Chain the cause so the underlying failure stays visible in tracebacks
        raise RuntimeError(f"Error estimating key: {str(e)}") from e


def key_to_semitones(key: str, target_key: str = "C") -> int:
    """
    Calculate the smallest semitone shift that moves one key onto another.

    Key names are matched case-insensitively, surrounding whitespace is
    ignored, and flat / enharmonic spellings (e.g. 'Bb', 'Db', 'E#') are
    accepted and treated as their sharp equivalents.

    Args:
        key: Source key
        target_key: Target key to align to

    Returns:
        Number of semitones to shift, in the range -5..6. A tritone is
        reported as +6.

    Raises:
        ValueError: If either name is not a recognised key.
    """
    keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
    enharmonic = {
        "Db": "C#",
        "Eb": "D#",
        "Fb": "E",
        "E#": "F",
        "Gb": "F#",
        "Ab": "G#",
        "Bb": "A#",
        "Cb": "B",
        "B#": "C",
    }

    def _pitch_class(name) -> int:
        # Non-strings keep raising ValueError, as the membership test used to
        if not isinstance(name, str):
            raise ValueError(f"Invalid key name: {name!r}")
        cleaned = name.strip()
        # Upper-case the note letter, lower-case the accidental ("bb" -> "Bb")
        canonical = cleaned[:1].upper() + cleaned[1:].lower()
        canonical = enharmonic.get(canonical, canonical)
        if canonical not in keys:
            raise ValueError(f"Invalid key name: {name!r}")
        return keys.index(canonical)

    key_index = _pitch_class(key)
    target_index = _pitch_class(target_key)

    # Wrap into 0..11, then fold the upper half down so the shift is minimal
    semitones = (target_index - key_index) % 12
    if semitones > 6:
        semitones -= 12

    return semitones


def align_songs_by_key(
    audio1_path: str,
    audio2_path: str,
    target_key: str = "C",
    output_path: str = "output",
    output_format: str = "wav",
) -> Tuple[str, str]:
    """
    Align two songs to the same musical key by pitch shifting.

    Each input is handed to ``shift_to_key``, which estimates the track's
    current key, computes the required shift and writes the result. Nothing is
    decoded here, so each file is only loaded by ``shift_to_key`` itself.

    Args:
        audio1_path: Path to first audio file (supports WAV, MP3, FLAC)
        audio2_path: Path to second audio file (supports WAV, MP3, FLAC)
        target_key: Target key to align both songs to (default: 'C')
        output_path: Directory to save the aligned audio files
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Tuple of (aligned_audio1_path, aligned_audio2_path) - paths to processed files

    Raises:
        RuntimeError: If either track fails to process. The original exception
            is attached as ``__cause__``.

    Note:
        Output names are derived from each input's file stem, so two inputs
        with the same stem in different directories map to the same output
        file and the second result overwrites the first.
    """
    try:
        aligned1_path = shift_to_key(
            audio1_path, target_key, output_path, output_format
        )
        aligned2_path = shift_to_key(
            audio2_path, target_key, output_path, output_format
        )
        return aligned1_path, aligned2_path

    except Exception as e:
        raise RuntimeError(f"Error aligning audio keys: {str(e)}") from e


def shift_to_key(
    audio_path: str,
    target_key: str,
    output_path: str = "output",
    output_format: str = "wav",
) -> str:
    """
    Shift an audio file to a specific musical key.

    The current key is estimated with ``estimate_key``, the semitone offset to
    ``target_key`` is computed with ``key_to_semitones``, and the audio is
    pitch-shifted by that amount and written to
    ``<output_path>/<input stem>_shifted_to_<target_key>.<output_format>``.

    Args:
        audio_path: Path to audio file or URL (supports WAV, MP3, FLAC)
        target_key: Target key to shift to
        output_path: Directory to save the shifted audio file (created if missing)
        output_format: Output format ('wav' or 'mp3', default: 'wav'). Matched
            case-insensitively. 'mp3' is encoded via an external ``ffmpeg``
            call; any other value is written as 16-bit PCM WAV data using the
            given value as the file extension.

    Returns:
        Path to the pitch-shifted audio file

    Raises:
        RuntimeError: If any step fails (loading, shifting, writing, or the
            ffmpeg conversion). The original exception is attached as
            ``__cause__``.
    """
    try:
        # Estimate current key and the shift needed to reach the target
        current_key = estimate_key(audio_path)
        semitones = key_to_semitones(current_key, target_key)

        # Load at native rate / channel layout and shift
        y, sr = _load_audio(audio_path)
        y_shifted = librosa.effects.pitch_shift(
            y, n_steps=semitones, scale=True, sr=sr, res_type="soxr_vhq"
        )

        # Multi-channel audio comes back channels-first; soundfile writes
        # frames-first, so transpose before saving.
        if y_shifted.ndim == 2:
            y_shifted = y_shifted.T

        os.makedirs(output_path, exist_ok=True)
        extension = output_format.lower()
        final_audio_path = os.path.join(
            output_path,
            f"{Path(audio_path).stem}_shifted_to_{target_key}.{extension}",
        )

        if extension == "mp3":
            # Reserve a temp path and close the handle right away so the file
            # can be written and read by path (an open handle blocks that on
            # some platforms).
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
                temp_wav_path = temp_wav.name

            try:
                # Intermediate WAV that ffmpeg will encode
                sf.write(temp_wav_path, y_shifted, sr, format="wav", subtype="PCM_16")

                # Convert to MP3 using ffmpeg
                cmd = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    temp_wav_path,
                    "-c:a",
                    "libmp3lame",
                    "-b:a",
                    "192k",
                    final_audio_path,
                ]
                subprocess.run(cmd, capture_output=True, check=True)
            finally:
                # Always remove the intermediate file, even if writing or
                # the ffmpeg conversion failed.
                if os.path.exists(temp_wav_path):
                    os.unlink(temp_wav_path)
        else:
            sf.write(final_audio_path, y_shifted, sr, format="wav", subtype="PCM_16")

        return final_audio_path

    except Exception as e:
        # Chain the cause so the underlying failure stays visible in tracebacks
        raise RuntimeError(f"Error shifting key: {str(e)}") from e


if __name__ == "__main__":
    # Command-line entry point: estimate a key, align two tracks, or shift one.
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Pitch alignment tools for audio files"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Estimate key of a single file
    estimate_parser = subparsers.add_parser(
        "estimate", help="Estimate the key of an audio file"
    )
    estimate_parser.add_argument("audio", help="Path to audio file")

    # Align two songs by key
    align_parser = subparsers.add_parser("align", help="Align two songs to same key")
    align_parser.add_argument("audio1", help="Path to first audio file")
    align_parser.add_argument("audio2", help="Path to second audio file")
    align_parser.add_argument(
        "--target-key", default="C", help="Target key to align to (default: C)"
    )
    align_parser.add_argument(
        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
    )
    align_parser.add_argument(
        "--output-dir",
        default="output",
        help="Directory to save the aligned files (default: output)",
    )

    # Shift single file to key
    shift_parser = subparsers.add_parser("shift", help="Shift audio to specific key")
    shift_parser.add_argument("audio", help="Path to audio file")
    shift_parser.add_argument("target_key", help="Target key to shift to")
    shift_parser.add_argument(
        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
    )
    shift_parser.add_argument(
        "--output-dir",
        default="output",
        help="Directory to save the shifted file (default: output)",
    )

    args = parser.parse_args()

    try:
        if args.command == "estimate":
            key = estimate_key(args.audio)
            print(f"Estimated key: {key}")
        elif args.command == "align":
            aligned1, aligned2 = align_songs_by_key(
                args.audio1,
                args.audio2,
                args.target_key,
                output_path=args.output_dir,
                output_format=args.format,
            )
            print(f"Aligned audio 1: {aligned1}")
            print(f"Aligned audio 2: {aligned2}")
        elif args.command == "shift":
            output = shift_to_key(
                args.audio,
                args.target_key,
                output_path=args.output_dir,
                output_format=args.format,
            )
            print(f"Shifted audio saved to: {output}")
        else:
            # No sub-command given
            parser.print_help()
    except Exception as e:
        # Report on stderr, then re-raise so the traceback and the non-zero
        # exit status are preserved (the previous trailing exit(1) could
        # never run because it followed the raise).
        print(f"Error: {e}", file=sys.stderr)
        raise