frascuchon HF Staff commited on
Commit
801ea60
·
1 Parent(s): f086c75

All the tools and gradio server

Browse files
.gitignore ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # PyInstaller
27
+ # Usually these files are written by a python script from a template
28
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
29
+ *.manifest
30
+ *.spec
31
+
32
+ # Installer logs
33
+ pip-log.txt
34
+ pip-delete-this-directory.txt
35
+
36
+ # Unit test / coverage reports
37
+ htmlcov/
38
+ .tox/
39
+ .nox/
40
+ .coverage
41
+ .coverage.*
42
+ .cache
43
+ nosetests.xml
44
+ coverage.xml
45
+ *.cover
46
+ .hypothesis/
47
+ .pytest_cache/
48
+
49
+ # Jupyter Notebook
50
+ .ipynb_checkpoints
51
+
52
+ # pyenv
53
+ .python-version
54
+
55
+ # mypy
56
+ .mypy_cache/
57
+ .dmypy.json
58
+ dmypy.json
59
+
60
+ # Pyre type checker
61
+ .pyre/
62
+
63
+ # VS Code
64
+ .vscode/
65
+
66
+ # Local env
67
+ .env
68
+ .venv
69
+ env/
70
+ venv/
71
+ ENV/
72
+ env.bak/
73
+ venv.bak/
74
+
75
+ # MacOS
76
+ .DS_Store
77
+
78
+ # IDEs
79
+ .idea/
80
+ *.iml
81
+ *.sublime-workspace
82
+ *.sublime-project
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
- app_file: app.py
9
  pinned: false
10
  ---
11
 
 
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
+ app_file: mcp_server.py
9
  pinned: false
10
  ---
11
 
mcp_server.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from typing import Dict, Tuple
3
+
4
+ from tools.combine_tracks import combine_tracks, create_medley
5
+ from tools.stems_separation import (
6
+ separate_audio,
7
+ extract_selected_stems,
8
+ extract_vocal_non_vocal,
9
+ create_karaoke_track,
10
+ )
11
+ from tools.time_strech import align_songs_by_bpm, stretch_to_bpm
12
+ from tools.youtube_extract import extract_audio_from_youtube
13
+
14
+
15
def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:
    """
    Shift the pitch of an audio file by a specified number of semitones.

    Uses librosa's pitch shifting to change the musical pitch of an audio file
    while maintaining its tempo and duration.

    Args:
        audio_path: Path to the input audio file (common formats: WAV, MP3, FLAC)
        semitones: Number of semitones to shift (positive = higher pitch,
            negative = lower pitch). Sensible range: -12 to +12 (one octave).

    Returns:
        Path to the pitch-shifted audio file in WAV format, or the original
        path unchanged when semitones == 0.

    Examples:
        - semitones=2: shift up by 2 semitones (1 whole tone)
        - semitones=-5: shift down by 5 semitones (1 perfect fourth)
        - semitones=0: no change (returns original file)

    Note:
        Creates a temporary WAV file that should be cleaned up by the caller.
    """
    if semitones == 0:
        # Fast path: nothing to do, hand back the original file untouched.
        return audio_path

    # Imports are deferred so the zero-shift fast path needs no audio stack.
    import tempfile

    import librosa
    import soundfile as sf

    # sr=None keeps the native sample rate; mono=False preserves channels.
    y, sr = librosa.load(audio_path, sr=None, mono=False)

    # Shift pitch while keeping tempo and duration.
    y_shifted = librosa.effects.pitch_shift(y, n_steps=semitones, sr=sr)

    # Close the temp-file handle BEFORE soundfile writes to the same path:
    # on Windows an open NamedTemporaryFile cannot be reopened by another
    # writer, so the previous open-handle pattern failed there.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp.close()
    if y_shifted.ndim == 2:
        # librosa uses (channels, samples); soundfile expects (samples, channels).
        y_shifted = y_shifted.T
    sf.write(tmp.name, y_shifted, sr, format="wav", subtype="PCM_16")
    return tmp.name
58
+
59
+
60
def stretch_audio_to_bpm_wrapper(audio_path: str, target_bpm: float) -> str:
    """Time-stretch an audio file to a target tempo while preserving pitch.

    Thin adapter over ``tools.time_strech.stretch_to_bpm`` so the Gradio/MCP
    layer exposes a documented, typed entry point. The underlying tool
    detects the source BPM and computes the stretch factor itself.

    Args:
        audio_path: Path to the input audio file (e.g. WAV, MP3, FLAC).
        target_bpm: Desired tempo in beats per minute. Typical range is
            60-200 BPM (90 slow, 120 standard pop/rock, 128 electronic,
            140 fast).

    Returns:
        Path to the time-stretched audio file in WAV format.
    """
    return stretch_to_bpm(audio_path, target_bpm)
86
+
87
+
88
def extract_selected_stems_wrapper(
    audio_path: str, vocals: bool, drums: bool, bass: bool, other: bool
) -> Dict[str, str]:
    """
    Extract only the user-selected stems from an audio file.

    Lets callers request a subset of the four Demucs stems instead of all of
    them, saving storage when only certain elements are needed. Demucs still
    separates all stems internally, so processing time matches a full
    separation.

    Args:
        audio_path: Path to the input audio file (e.g. WAV, MP3, FLAC, M4A)
        vocals: Whether to extract the vocals stem
        drums: Whether to extract the drums stem
        bass: Whether to extract the bass stem
        other: Whether to extract the other stem

    Returns:
        dict[str, str]: Mapping from stem name to extracted file path.

    Raises:
        ValueError: If no stem is selected.

    Examples:
        - vocals=True, drums=True: extract only vocals and drums
        - vocals=True only: isolate vocals for karaoke
        - drums=True, bass=True: extract the rhythm section
    """
    # Insertion order of this dict fixes the stem order: vocals, drums,
    # bass, other — the same order the original flag chain produced.
    selection = {"vocals": vocals, "drums": drums, "bass": bass, "other": other}
    requested = [stem for stem, wanted in selection.items() if wanted]

    if not requested:
        raise ValueError("At least one stem must be selected for extraction")

    return extract_selected_stems(audio_path, requested)
131
+
132
+
133
def extract_vocal_non_vocal_wrapper(audio_path: str) -> Tuple[str, str]:
    """Split a song into an isolated vocal track and an instrumental track.

    Thin adapter over ``tools.stems_separation.extract_vocal_non_vocal`` so
    the Gradio/MCP layer exposes a documented, typed entry point. Useful for
    karaoke creation, vocal isolation, or instrumental extraction.

    Args:
        audio_path: Path to the input audio file (e.g. WAV, MP3, FLAC, M4A).

    Returns:
        tuple[str, str]: ``(vocals_file, instrumental_file)`` paths. The
        instrumental file combines the drums, bass and other stems into a
        single mixed, normalized track.

    Examples:
        - extract_vocal_non_vocal_wrapper('song.mp3'): vocals + instrumental
        - extract_vocal_non_vocal_wrapper('song.wav'): vocal and backing tracks
    """
    return extract_vocal_non_vocal(audio_path)
159
+
160
+
161
def create_karaoke_track_wrapper(audio_path: str) -> str:
    """Produce a karaoke-ready instrumental by stripping the vocals.

    Convenience adapter over ``tools.stems_separation.create_karaoke_track``:
    the drums, bass and other stems are combined into one normalized
    instrumental track, using the same Demucs model as full separation.

    Args:
        audio_path: Path to the input audio file (e.g. WAV, MP3, FLAC, M4A).

    Returns:
        Path to the karaoke (instrumental) audio file.

    Examples:
        - create_karaoke_track_wrapper('song.mp3'): karaoke version
        - create_karaoke_track_wrapper('song.wav'): instrumental backing track
    """
    return create_karaoke_track(audio_path)
185
+
186
+
187
def create_interface():
    """Build the Gradio TabbedInterface exposing all audio tools.

    Returns:
        gr.TabbedInterface: Ten tabs — stem separation, track combination,
        pitch shift, time stretch, BPM alignment, selective stems,
        vocal/instrumental split, karaoke creation, medley creation, and
        YouTube extraction.
    """
    # NOTE: a duplicated second definition of the YouTube interface (which
    # re-created the same gr.Interface and immediately overwrote the first)
    # was removed; only one definition is needed.

    # Tab 1: Stem Separation
    stem_interface = gr.Interface(
        fn=separate_audio,
        inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
        outputs=[
            gr.Audio(label="Vocals", type="filepath"),
            gr.Audio(label="Drums", type="filepath"),
            gr.Audio(label="Bass", type="filepath"),
            gr.Audio(label="Other", type="filepath"),
        ],
        title="Audio Stem Separation",
        description="Upload an audio file to separate it into vocals, drums, bass, and other stems.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 2: Track Combination
    combine_interface = gr.Interface(
        fn=combine_tracks,
        inputs=[
            gr.Audio(type="filepath", label="First Audio Track", sources=["upload"]),
            gr.Audio(type="filepath", label="Second Audio Track", sources=["upload"]),
            gr.Slider(
                minimum=0.0, maximum=1.0, value=0.5, label="Weight for First Track"
            ),
            gr.Slider(
                minimum=0.0, maximum=1.0, value=0.5, label="Weight for Second Track"
            ),
            gr.Checkbox(value=True, label="Normalize Output"),
            gr.Number(value=0.0, label="Fade In Duration (seconds)"),
            gr.Number(value=0.0, label="Fade Out Duration (seconds)"),
        ],
        outputs=gr.Audio(label="Combined Track", type="filepath"),
        title="Combine Audio Tracks",
        description="Combine two audio tracks with adjustable weights and optional fade effects.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 3: Pitch Alignment
    pitch_interface = gr.Interface(
        fn=pitch_shift_with_semitones,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
            gr.Number(value=0, label="Semitones to Shift"),
        ],
        outputs=gr.Audio(label="Pitch Shifted Audio", type="filepath"),
        title="Pitch Shift Audio",
        description="Shift the pitch of an audio file by specified semitones.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 4: Time Stretching
    # Fixed: used deprecated allow_flagging="never"; switched to
    # flagging_mode="never" for consistency with every other tab (Gradio 5 API).
    stretch_interface = gr.Interface(
        fn=stretch_audio_to_bpm_wrapper,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
            gr.Number(value=120, label="Target BPM"),
        ],
        outputs=gr.Audio(label="Stretched Audio", type="filepath"),
        title="Stretch Audio to BPM",
        description="Stretch audio to match a specific BPM.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 5: BPM Alignment
    bpm_interface = gr.Interface(
        fn=align_songs_by_bpm,
        inputs=[
            gr.Audio(type="filepath", label="First Audio Track", sources=["upload"]),
            gr.Audio(type="filepath", label="Second Audio Track", sources=["upload"]),
        ],
        outputs=[
            gr.Audio(label="Aligned First Track", type="filepath"),
            gr.Audio(label="Aligned Second Track", type="filepath"),
        ],
        title="Align Songs by BPM",
        description="Align two songs to the same BPM by stretching the faster one to match the slower one.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 6: Selective Stem Extraction
    selective_interface = gr.Interface(
        fn=extract_selected_stems_wrapper,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
            gr.Checkbox(value=True, label="Extract Vocals"),
            gr.Checkbox(value=True, label="Extract Drums"),
            gr.Checkbox(value=True, label="Extract Bass"),
            gr.Checkbox(value=True, label="Extract Other"),
        ],
        outputs=gr.JSON(label="Extracted Stems"),
        title="Selective Stem Extraction",
        description="Extract only specific stems from an audio file to save processing time and storage.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 7: Vocal/Non-Vocal Separation
    vocal_nonvocal_interface = gr.Interface(
        fn=extract_vocal_non_vocal_wrapper,
        inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
        outputs=[
            gr.Audio(label="Vocals Track", type="filepath"),
            gr.Audio(label="Instrumental Track", type="filepath"),
        ],
        title="Vocal/Instrumental Separation",
        description="Separate audio into vocal and instrumental components for karaoke or vocal isolation.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 8: Karaoke Track Creation
    karaoke_interface = gr.Interface(
        fn=create_karaoke_track_wrapper,
        inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
        outputs=gr.Audio(label="Karaoke Track", type="filepath"),
        title="Create Karaoke Track",
        description="Create a karaoke-ready instrumental track by removing vocals from any song.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 9: Medley Creation
    medley_interface = gr.Interface(
        fn=create_medley,
        inputs=[
            gr.Audio(type="filepath", label="Vocals Stem", sources=["upload"]),
            gr.Audio(type="filepath", label="Instrumental Stem", sources=["upload"]),
            gr.Number(
                value=1.2, label="Vocals Gain", minimum=0.1, maximum=3.0, step=0.1
            ),
            gr.Number(
                value=0.9, label="Instrumental Gain", minimum=0.1, maximum=3.0, step=0.1
            ),
            gr.Textbox(
                value="threshold=-18dB:ratio=3:attack=50:release=200",
                label="Compressor Settings",
                placeholder="threshold=-18dB:ratio=3:attack=50:release=200",
            ),
            gr.Dropdown(
                choices=["libmp3lame", "aac", "flac", "pcm_s16le"],
                value="libmp3lame",
                label="Audio Codec",
            ),
            gr.Textbox(value="192k", label="Audio Bitrate", placeholder="192k"),
        ],
        outputs=gr.Audio(label="Medley Audio", type="filepath"),
        title="Create Vocal/Instrumental Medley",
        description="Mix vocals and instrumental stems into a polished medley with compression and gain control.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 10: YouTube Extraction
    youtube_interface = gr.Interface(
        fn=extract_audio_from_youtube,
        inputs=[
            gr.Textbox(
                label="YouTube URL", placeholder="https://www.youtube.com/watch?v=..."
            ),
            gr.Dropdown(
                choices=["wav", "mp3", "flac"], value="wav", label="Output Format"
            ),
            gr.Dropdown(choices=["best", "worst"], value="best", label="Audio Quality"),
        ],
        outputs=gr.Audio(label="Extracted Audio", type="filepath"),
        title="Extract Audio from YouTube",
        description="Extract audio from a YouTube video URL.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    return gr.TabbedInterface(
        [
            stem_interface,
            combine_interface,
            pitch_interface,
            stretch_interface,
            bpm_interface,
            selective_interface,
            vocal_nonvocal_interface,
            karaoke_interface,
            medley_interface,
            youtube_interface,
        ],
        [
            "Stem Separation",
            "Track Combination",
            "Pitch Alignment",
            "Time Stretching",
            "BPM Alignment",
            "Selective Stems",
            "Vocal/Instrumental",
            "Karaoke Creation",
            "Medley Creation",
            "YouTube Extraction",
        ],
    )
422
+
423
+
424
+ if __name__ == "__main__":
425
+ interface = create_interface()
426
+ interface.launch(server_name="0.0.0.0", server_port=7860, mcp_server=True)
mypy.ini ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [mypy-untyped_package.*]
2
+ follow_untyped_imports = True
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio[mcp]>=5.0.0
2
+ librosa>=0.10.0
3
+ numpy>=1.24.0
4
+ torch>=2.0.0
5
+ torchaudio>=2.0.0
6
+ transformers>=4.30.0
7
+ soundfile>=0.12.0
8
+ pydub>=0.25.0
9
+ demucs>=4.0.0
10
+ pytest>=7.0.0
11
+ ruff>=0.1.0
12
+ mypy>=1.0.0
13
+ smolagents[mcp]
14
+ yt_dlp>=2025.11.12
tools/__init__.py ADDED
File without changes
tools/combine_tracks.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import librosa
8
+ import numpy as np
9
+ import soundfile as sf
10
+
11
+
12
def combine_tracks(
    track1_path: str,
    track2_path: str,
    weight1: float = 0.5,
    weight2: float = 0.5,
    output_path: Optional[str] = None,
    normalize: bool = True,
    fade_in: float = 0.0,
    fade_out: float = 0.0,
) -> str:
    """
    Combine two audio tracks into a single stereo track with adjustable mixing weights.

    Mixes two audio files together with customizable balance, normalization,
    and fade effects. Useful for creating mashups, adding background music to
    vocals, or layering multiple audio sources.

    Args:
        track1_path: Path to first audio file (common formats: WAV, MP3, FLAC)
        track2_path: Path to second audio file (common formats: WAV, MP3, FLAC)
        weight1: Weight factor for first track (0.0-1.0, default: 0.5)
        weight2: Weight factor for second track (0.0-1.0, default: 0.5)
        output_path: Optional output *directory*; the mix is written to
            '<output_path>/stereo_combined.wav'. Defaults to a fresh temporary
            directory. (Note: this is a directory, not a file path.)
        normalize: Whether to normalize the final output to prevent clipping
        fade_in: Fade in duration in seconds (default: 0.0)
        fade_out: Fade out duration in seconds (default: 0.0)

    Returns:
        Path to the combined audio file in WAV format.

    Raises:
        RuntimeError: If loading, mixing, or writing fails; the original
            exception is preserved as the cause.

    Examples:
        - weight1=0.8, weight2=0.2: first track dominates the mix
        - weight1=0.5, weight2=0.5: equal balance between tracks
        - fade_in=2.0, fade_out=3.0: gradual volume ramp in and out

    Note:
        Tracks with different sample rates are resampled to the first track's
        rate; different lengths are padded with silence to the longer one.
    """
    try:
        # NOTE(review): no sr=None here, so librosa resamples both tracks to
        # its default target rate — confirm this is intended.
        y1, sr1 = librosa.load(track1_path, mono=False)
        y2, sr2 = librosa.load(track2_path, mono=False)

        # Promote mono input to stereo by duplicating the single channel.
        if y1.ndim == 1:
            y1 = np.stack([y1, y1])
        if y2.ndim == 1:
            y2 = np.stack([y2, y2])

        # Resample the second track to match the first track's rate.
        if sr1 != sr2:
            y2 = librosa.resample(y2, orig_sr=sr2, target_sr=sr1)

        # Pad the shorter track with trailing silence so shapes match.
        max_length = max(y1.shape[1], y2.shape[1])
        if y1.shape[1] < max_length:
            y1 = np.pad(y1, ((0, 0), (0, max_length - y1.shape[1])), mode="constant")
        if y2.shape[1] < max_length:
            y2 = np.pad(y2, ((0, 0), (0, max_length - y2.shape[1])), mode="constant")

        # Weighted sum of the two stereo signals.
        combined = weight1 * y1 + weight2 * y2

        # Linear fade-in over the first fade_in seconds.
        if fade_in > 0:
            fade_samples = int(fade_in * sr1)
            if fade_samples > 0:
                fade_curve = np.linspace(0, 1, fade_samples)
                combined[:, :fade_samples] *= fade_curve

        # Linear fade-out over the last fade_out seconds.
        if fade_out > 0:
            fade_samples = int(fade_out * sr1)
            if fade_samples > 0:
                fade_curve = np.linspace(1, 0, fade_samples)
                combined[:, -fade_samples:] *= fade_curve

        # Peak-normalize to 95% full scale to leave a little headroom.
        if normalize:
            max_val = np.max(np.abs(combined))
            if max_val > 0:
                combined = combined / max_val * 0.95

        # output_path is used as a directory (created if missing).
        if output_path:
            os.makedirs(output_path, exist_ok=True)
        else:
            output_path = tempfile.mkdtemp(suffix="_combined")

        final_audio_filename = os.path.join(output_path, "stereo_combined.wav")
        # soundfile expects (samples, channels), hence the transpose.
        sf.write(final_audio_filename, combined.T, sr1, format="wav", subtype="PCM_16")

        return final_audio_filename

    except Exception as e:
        # Chain the original exception so the real failure is not lost.
        raise RuntimeError(f"Error combining tracks: {str(e)}") from e
113
+
114
+
115
def create_stereo_mix(
    left_track_path: str,
    right_track_path: str,
    output_path: Optional[str] = None,
    normalize: bool = True,
) -> str:
    """
    Create a stereo track with one track in the left channel and another in the right.

    Args:
        left_track_path: Path to audio file for the left channel
        right_track_path: Path to audio file for the right channel
        output_path: Optional output *directory*; the mix is written to
            '<output_path>/stereo_mix.wav'. Defaults to a fresh temporary
            directory. (Note: this is a directory, not a file path.)
        normalize: Whether to peak-normalize the final output

    Returns:
        Path to the stereo audio file.

    Raises:
        RuntimeError: If loading, mixing, or writing fails; the original
            exception is preserved as the cause.
    """
    try:
        # Each side is loaded mono; together they form the two channels.
        # NOTE(review): no sr=None here, so librosa resamples to its default
        # target rate — confirm this is intended.
        y_left, sr_left = librosa.load(left_track_path, mono=True)
        y_right, sr_right = librosa.load(right_track_path, mono=True)

        # Resample the right channel to match the left channel's rate.
        if sr_left != sr_right:
            y_right = librosa.resample(y_right, orig_sr=sr_right, target_sr=sr_left)

        # Pad the shorter channel with trailing silence.
        max_length = max(len(y_left), len(y_right))
        if len(y_left) < max_length:
            y_left = np.pad(y_left, (0, max_length - len(y_left)), mode="constant")
        if len(y_right) < max_length:
            y_right = np.pad(y_right, (0, max_length - len(y_right)), mode="constant")

        # Stack as (channels, samples): row 0 = left, row 1 = right.
        stereo = np.array([y_left, y_right])

        # Peak-normalize to 95% full scale to leave headroom.
        if normalize:
            max_val = np.max(np.abs(stereo))
            if max_val > 0:
                stereo = stereo / max_val * 0.95

        # output_path is used as a directory (created if missing).
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_combined")
        else:
            os.makedirs(output_path, exist_ok=True)

        final_audio_filename = os.path.join(output_path, "stereo_mix.wav")
        # soundfile expects (samples, channels), hence the transpose.
        sf.write(
            final_audio_filename, stereo.T, sr_left, format="wav", subtype="PCM_16"
        )

        return final_audio_filename

    except Exception as e:
        # Chain the original exception so the real failure is not lost.
        raise RuntimeError(f"Error creating stereo mix: {str(e)}") from e
175
+
176
+
177
def create_medley(
    vocals_path: str,
    instrumental_path: str,
    *,
    output_path: Optional[str] = None,
    vocals_gain: float = 1.2,
    instrumental_gain: float = 0.9,
    compressor: str = "threshold=-18dB:ratio=3:attack=50:release=200",
    audio_codec: str = "libmp3lame",
    audio_bitrate: str = "192k",
) -> str:
    """Mix a vocal stem with an instrumental stem via an ffmpeg filter graph.

    Args:
        vocals_path: Absolute path (or MCP-accessible URI) to the vocals stem.
        instrumental_path: Absolute path (or MCP-accessible URI) to the
            instrumental/no-vocals stem.
        output_path: Where to write the medley; defaults to a file inside a
            fresh temporary directory.
        vocals_gain: Linear gain applied to the vocals stem (1.0 = unity).
        instrumental_gain: Linear gain applied to the instrumental stem.
        compressor: ffmpeg ``acompressor`` parameters applied after mixing
            for peak control.
        audio_codec: Codec passed to ffmpeg's ``-c:a`` flag.
        audio_bitrate: Bitrate passed to ffmpeg's ``-b:a`` flag.

    Returns:
        Path to the rendered medley file.

    Raises:
        FileNotFoundError: If either stem does not exist.
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    vocals = Path(vocals_path).expanduser().resolve()
    instrumental = Path(instrumental_path).expanduser().resolve()

    # Validate both stems up front so ffmpeg never sees a missing input.
    for label, stem in (("Vocals", vocals), ("Instrumental", instrumental)):
        if not stem.exists():
            raise FileNotFoundError(f"{label} stem not found: {stem}")

    if output_path is None:
        work_dir = tempfile.mkdtemp(prefix="mcp-medley-")
        output = Path(work_dir) / "unidos_hyper_medley.mp3"
    else:
        output = Path(output_path).expanduser().resolve()
        output.parent.mkdir(parents=True, exist_ok=True)

    # Filter graph: gain each stem, mix to one stream, then compress peaks.
    graph = ";".join(
        [
            f"[0:a]volume={vocals_gain}[v]",
            f"[1:a]volume={instrumental_gain}[i]",
            f"[v][i]amix=inputs=2:duration=longest:dropout_transition=2,"
            f"acompressor={compressor}",
        ]
    )

    command = [
        "ffmpeg",
        "-y",
        "-i",
        str(vocals),
        "-i",
        str(instrumental),
        "-filter_complex",
        graph,
        "-c:a",
        audio_codec,
        "-b:a",
        audio_bitrate,
        str(output),
    ]

    # shell=False (list form): arguments are never shell-interpreted.
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(
            f"ffmpeg failed ({result.returncode}):\n"
            f"STDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
        )

    return str(output)
260
+
261
+
262
+ if __name__ == "__main__":
263
+ import argparse
264
+
265
+ parser = argparse.ArgumentParser(description="Combine audio tracks")
266
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
267
+
268
+ # Combine tracks with weights
269
+ combine_parser = subparsers.add_parser(
270
+ "combine", help="Combine two tracks with weights"
271
+ )
272
+ combine_parser.add_argument("track1", help="Path to first audio file")
273
+ combine_parser.add_argument("track2", help="Path to second audio file")
274
+ combine_parser.add_argument(
275
+ "--weight1", type=float, default=0.5, help="Weight for first track (0.0-1.0)"
276
+ )
277
+ combine_parser.add_argument(
278
+ "--weight2", type=float, default=0.5, help="Weight for second track (0.0-1.0)"
279
+ )
280
+ combine_parser.add_argument(
281
+ "--fade-in", type=float, default=0.0, help="Fade in duration in seconds"
282
+ )
283
+ combine_parser.add_argument(
284
+ "--fade-out", type=float, default=0.0, help="Fade out duration in seconds"
285
+ )
286
+ combine_parser.add_argument(
287
+ "--no-normalize", action="store_true", help="Disable normalization"
288
+ )
289
+ combine_parser.add_argument(
290
+ "--output", type=str, default="output", help="Output file path"
291
+ )
292
+
293
+ # Create stereo mix
294
+ stereo_parser = subparsers.add_parser(
295
+ "stereo", help="Create stereo mix (left/right channels)"
296
+ )
297
+ stereo_parser.add_argument("left", help="Path to left channel audio file")
298
+ stereo_parser.add_argument("right", help="Path to right channel audio file")
299
+ stereo_parser.add_argument(
300
+ "--no-normalize", action="store_true", help="Disable normalization"
301
+ )
302
+ stereo_parser.add_argument(
303
+ "--output", type=str, default="stereo_output", help="Output file path"
304
+ )
305
+
306
+ # Create medley
307
+ medley_parser = subparsers.add_parser(
308
+ "medley", help="Create a vocal/instrumental medley using ffmpeg"
309
+ )
310
+ medley_parser.add_argument("vocals", help="Path to vocals stem audio file")
311
+ medley_parser.add_argument(
312
+ "instrumental", help="Path to instrumental stem audio file"
313
+ )
314
+ medley_parser.add_argument(
315
+ "--vocals-gain",
316
+ type=float,
317
+ default=1.2,
318
+ help="Linear gain for vocals (default: 1.2)",
319
+ )
320
+ medley_parser.add_argument(
321
+ "--instrumental-gain",
322
+ type=float,
323
+ default=0.9,
324
+ help="Linear gain for instrumental (default: 0.9)",
325
+ )
326
+ medley_parser.add_argument(
327
+ "--compressor",
328
+ type=str,
329
+ default="threshold=-18dB:ratio=3:attack=50:release=200",
330
+ help="FFmpeg acompressor parameters (default: threshold=-18dB:ratio=3:attack=50:release=200)",
331
+ )
332
+ medley_parser.add_argument(
333
+ "--audio-codec",
334
+ type=str,
335
+ default="libmp3lame",
336
+ help="Target audio codec (default: libmp3lame)",
337
+ )
338
+ medley_parser.add_argument(
339
+ "--audio-bitrate",
340
+ type=str,
341
+ default="192k",
342
+ help="Audio bitrate (default: 192k)",
343
+ )
344
+ medley_parser.add_argument(
345
+ "--output", type=str, help="Output file path (default: temporary file)"
346
+ )
347
+
348
+ args = parser.parse_args()
349
+
350
+ try:
351
+ if args.command == "combine":
352
+ output = combine_tracks(
353
+ args.track1,
354
+ args.track2,
355
+ weight1=args.weight1,
356
+ weight2=args.weight2,
357
+ normalize=not args.no_normalize,
358
+ fade_in=args.fade_in,
359
+ fade_out=args.fade_out,
360
+ output_path=args.output,
361
+ )
362
+ print(f"Combined audio saved to: {output}")
363
+ elif args.command == "stereo":
364
+ output = create_stereo_mix(
365
+ args.left,
366
+ args.right,
367
+ normalize=not args.no_normalize,
368
+ output_path=args.output,
369
+ )
370
+ print(f"Stereo mix saved to: {output}")
371
+ elif args.command == "medley":
372
+ output = create_medley(
373
+ args.vocals,
374
+ args.instrumental,
375
+ output_path=args.output,
376
+ vocals_gain=args.vocals_gain,
377
+ instrumental_gain=args.instrumental_gain,
378
+ compressor=args.compressor,
379
+ audio_codec=args.audio_codec,
380
+ audio_bitrate=args.audio_bitrate,
381
+ )
382
+ print(f"Medley saved to: {output}")
383
+ else:
384
+ parser.print_help()
385
+ except Exception as e:
386
+ print(f"Error: {e}")
387
+ exit(1)
tools/pitch_alignment.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Tuple
3
+
4
+ import librosa
5
+ import numpy as np
6
+ import soundfile as sf
7
+
8
+
9
def _load_audio(audio_path: str, mono: bool = False) -> Tuple[np.ndarray, float]:
    """Load an audio file at its native sample rate.

    Args:
        audio_path: Path to the audio file on disk.
        mono: If True, downmix to a single channel; otherwise keep the
            file's original channel layout (default: False).

    Returns:
        Tuple of (audio_data, sample_rate).
    """
    # sr=None preserves the native rate; soxr_vhq is librosa's
    # very-high-quality resampler setting.
    return librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
22
+
23
+
24
def estimate_key(audio_path: str) -> str:
    """
    Estimate the musical key of an audio file from its chroma content.

    The track is analyzed with constant-Q chroma features; the pitch class
    with the highest mean energy across the whole track is reported as the
    key (major/minor mode is not distinguished).

    Args:
        audio_path: Path to audio file (supports common formats: WAV, MP3, FLAC)

    Returns:
        Estimated key as a pitch-class name: one of
        'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'.

    Raises:
        RuntimeError: If the file cannot be loaded or analyzed.

    Note:
        Uses medium-quality resampling for speed. Most accurate for music
        with clear harmonic content; less reliable for atonal or highly
        percussive material.
    """
    try:
        # Medium-quality resampling keeps the analysis fast.
        samples, rate = librosa.load(audio_path, res_type="soxr_mq")

        # Average chroma energy per pitch class over the full track.
        chromagram = librosa.feature.chroma_cqt(y=samples, sr=rate)
        pitch_class_energy = np.mean(chromagram, axis=1)

        # The strongest pitch class is taken as the tonic.
        pitch_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
        return pitch_names[int(np.argmax(pitch_class_energy))]

    except Exception as e:
        raise RuntimeError(f"Error estimating key: {str(e)}")
67
+
68
+
69
def key_to_semitones(key: str, target_key: str = "C") -> int:
    """
    Calculate the smallest semitone shift from one key to another.

    Generalized to accept both sharp ("C#") and flat ("Db") spellings,
    case-insensitively; flats are normalized to their enharmonic sharp
    equivalents. All previously valid inputs behave exactly as before.

    Args:
        key: Source key (e.g. "C", "F#", "Bb").
        target_key: Target key to align to (default: "C").

    Returns:
        Number of semitones to shift, in the range [-5, 6], taking the
        shortest direction around the 12-key circle.

    Raises:
        ValueError: If either key name is not a valid pitch class.
    """
    keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
    # Enharmonic flat spellings mapped onto the sharp-based table above.
    flats = {"DB": "C#", "EB": "D#", "GB": "F#", "AB": "G#", "BB": "A#"}

    def _index(name: str) -> int:
        # Normalize case and whitespace, then fold flats into sharps.
        canonical = str(name).strip().upper()
        canonical = flats.get(canonical, canonical)
        if canonical not in keys:
            raise ValueError("Invalid key name")
        return keys.index(canonical)

    # Calculate semitone difference (wrapping around 12 semitones),
    # preferring the shorter of shifting up vs. shifting down.
    semitones = (_index(target_key) - _index(key)) % 12
    if semitones > 6:
        semitones -= 12

    return semitones
94
+
95
+
96
def align_songs_by_key(
    audio1_path: str,
    audio2_path: str,
    target_key: str = "C",
    output_path: str = "output",
) -> Tuple[str, str]:
    """
    Align two songs to the same musical key by pitch shifting.

    Each track's current key is estimated internally by shift_to_key(),
    which then pitch-shifts the track to the requested target key.

    Args:
        audio1_path: Path to first audio file
        audio2_path: Path to second audio file
        target_key: Target key to align both songs to (default: 'C')
        output_path: Directory to save the aligned audio files

    Returns:
        Tuple of (aligned_audio1_path, aligned_audio2_path) - paths to processed files

    Raises:
        RuntimeError: If key estimation or pitch shifting fails for either track.
    """
    try:
        # shift_to_key() estimates each track's key and loads the audio
        # itself, so no separate analysis or loading is done here.  (The
        # previous version loaded both full files into memory and then
        # discarded the data unused.)
        aligned1_path = shift_to_key(audio1_path, target_key, output_path)
        aligned2_path = shift_to_key(audio2_path, target_key, output_path)

        return aligned1_path, aligned2_path

    except Exception as e:
        raise RuntimeError(f"Error aligning audio keys: {str(e)}") from e
136
+
137
+
138
def shift_to_key(audio_path: str, target_key: str, output_path: str = "output") -> str:
    """
    Shift an audio file to a specific musical key.

    The file's current key is estimated automatically, then the audio is
    pitch-shifted by the semitone difference to the target key while
    preserving duration.

    Args:
        audio_path: Path to audio file
        target_key: Target key to shift to (e.g. 'C', 'F#')
        output_path: Directory to save the shifted audio file

    Returns:
        Path to the pitch-shifted audio file, named
        "<basename>_shifted_to_<target_key>.wav" inside output_path.

    Raises:
        RuntimeError: If key estimation, pitch shifting, or writing fails.
    """
    try:
        # Estimate current key and the shortest shift to the target.
        current_key = estimate_key(audio_path)
        semitones = key_to_semitones(current_key, target_key)

        # Load at native sample rate and apply a duration-preserving shift.
        y, sr = _load_audio(audio_path)
        y_shifted = librosa.effects.pitch_shift(
            y, n_steps=semitones, scale=True, sr=sr, res_type="soxr_vhq"
        )

        # Build the output name from the basename without its extension
        # (splitext handles any extension, not just ".wav"), and keep the
        # audio_path parameter intact instead of shadowing it.
        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        os.makedirs(output_path, exist_ok=True)

        # soundfile expects (frames, channels); librosa yields (channels, frames).
        if y_shifted.ndim == 2:
            y_shifted = y_shifted.T

        final_audio_path = os.path.join(
            output_path, f"{base_name}_shifted_to_{target_key}.wav"
        )
        sf.write(final_audio_path, y_shifted, sr, format="wav", subtype="PCM_16")

        return final_audio_path

    except Exception as e:
        raise RuntimeError(f"Error shifting key: {str(e)}") from e
179
+
180
+
181
if __name__ == "__main__":
    import argparse

    # Small CLI wrapper over the key-estimation / alignment helpers above.
    parser = argparse.ArgumentParser(
        description="Pitch alignment tools for audio files"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Estimate key of a single file
    estimate_parser = subparsers.add_parser(
        "estimate", help="Estimate the key of an audio file"
    )
    estimate_parser.add_argument("audio", help="Path to audio file")

    # Align two songs by key
    align_parser = subparsers.add_parser("align", help="Align two songs to same key")
    align_parser.add_argument("audio1", help="Path to first audio file")
    align_parser.add_argument("audio2", help="Path to second audio file")
    align_parser.add_argument(
        "--target-key", default="C", help="Target key to align to (default: C)"
    )

    # Shift single file to key
    shift_parser = subparsers.add_parser("shift", help="Shift audio to specific key")
    shift_parser.add_argument("audio", help="Path to audio file")
    shift_parser.add_argument("target_key", help="Target key to shift to")

    args = parser.parse_args()

    try:
        if args.command == "estimate":
            key = estimate_key(args.audio)
            print(f"Estimated key: {key}")
        elif args.command == "align":
            aligned1, aligned2 = align_songs_by_key(
                args.audio1, args.audio2, args.target_key
            )
            print(f"Aligned audio 1: {aligned1}")
            print(f"Aligned audio 2: {aligned2}")
        elif args.command == "shift":
            output = shift_to_key(args.audio, args.target_key)
            print(f"Shifted audio saved to: {output}")
        else:
            parser.print_help()
    except Exception as e:
        # Print and exit nonzero like the sibling CLI tools.  Previously a
        # bare `raise e` here dumped a traceback and made exit(1) unreachable.
        print(f"Error: {e}")
        exit(1)
tools/stems_separation.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import subprocess
4
+ from pathlib import Path
5
+ from typing import Tuple, List, Dict, Optional
6
+
7
+
8
class Error(Exception):
    """Base exception for stem-separation failures raised by this module."""

    pass
10
+
11
+
12
def separate_audio(
    audio_path: str, output_path: Optional[str] = None
) -> Tuple[str, str, str, str]:
    """
    Separate audio into vocals, drums, bass, and other stems using Demucs.

    Runs the Demucs neural network model (htdemucs) in a subprocess to split
    a mixed audio file into individual instrument stems.

    Args:
        audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
        output_path: Directory to save the separated stems (default: 'output' directory)

    Returns:
        tuple[str, str, str, str]: Paths to the separated audio files in order:
            - vocals: Isolated vocal track
            - drums: Isolated drum/percussion track
            - bass: Isolated bass track
            - other: Remaining instruments (guitars, keyboards, etc.)

    Raises:
        Error: If the Demucs subprocess fails or an expected stem file is
            missing afterwards.

    Note:
        Processing time depends on audio length and system performance.
        Output files are saved in WAV format for maximum quality.
    """
    try:
        # Prepare the output directory
        if not output_path:
            output_path = "output"

        output_dir = os.path.join(output_path, "separated")
        os.makedirs(output_dir, exist_ok=True)

        # Run Demucs separation in a subprocess (list form, no shell).
        cmd = [
            "python",
            "-m",
            "demucs.separate",
            "--out",
            output_dir,
            "--name",
            "htdemucs",
            audio_path,
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            raise Error(f"Demucs separation failed: {result.stderr}")

        # Demucs writes stems to <out>/htdemucs/<track_name>/<stem>.wav
        track_name = Path(audio_path).stem
        htdemucs_dir = os.path.join(output_dir, "htdemucs", track_name)

        vocals_path = os.path.join(htdemucs_dir, "vocals.wav")
        drums_path = os.path.join(htdemucs_dir, "drums.wav")
        bass_path = os.path.join(htdemucs_dir, "bass.wav")
        other_path = os.path.join(htdemucs_dir, "other.wav")

        # Verify all files exist
        for file_path in [vocals_path, drums_path, bass_path, other_path]:
            if not os.path.exists(file_path):
                raise Error(f"Separated file not found: {file_path}")

        return vocals_path, drums_path, bass_path, other_path

    except Error:
        # Our own errors are already descriptive; re-raise them as-is
        # instead of nesting them inside "Error processing audio: ...".
        raise
    except Exception as e:
        raise Error(f"Error processing audio: {str(e)}") from e
87
+
88
+
89
def extract_selected_stems(
    audio_path: str, stems_to_extract: List[str], output_path: Optional[str] = None
) -> Dict[str, str]:
    """
    Extract only specific stems from an audio file.

    Runs the full Demucs separation once, then returns just the requested
    subset of stems, so callers don't have to handle files they don't need.

    Args:
        audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
        stems_to_extract: Stems to keep; valid names are
            'vocals', 'drums', 'bass', 'other'.
        output_path: Directory to save the selected stems (default: 'output' directory)

    Returns:
        dict[str, str]: Mapping from stem name to its file path, one entry
        per requested (valid) stem.

    Raises:
        ValueError: If no valid stem name was requested.

    Note:
        Invalid stem names are ignored with a printed warning.
    """
    valid_stems = ["vocals", "drums", "bass", "other"]

    # Split the request into recognised and unrecognised names.
    invalid_stems = [stem for stem in stems_to_extract if stem not in valid_stems]
    if invalid_stems:
        print(f"Warning: Invalid stem names will be ignored: {invalid_stems}")

    valid_stems_to_extract = [stem for stem in stems_to_extract if stem in valid_stems]
    if not valid_stems_to_extract:
        raise ValueError("No valid stems specified for extraction")

    # Demucs always produces all four stems; map them by name, then keep
    # only the ones the caller asked for (in the order requested).
    vocals_path, drums_path, bass_path, other_path = separate_audio(
        audio_path, output_path
    )
    stem_mapping = {
        "vocals": vocals_path,
        "drums": drums_path,
        "bass": bass_path,
        "other": other_path,
    }
    return {stem: stem_mapping[stem] for stem in valid_stems_to_extract}
147
+
148
+
149
def extract_vocal_non_vocal(
    audio_path: str, output_path: Optional[str] = None
) -> Tuple[str, str]:
    """
    Extract vocals and non-vocals (instrumental) stems from an audio file.

    This function provides a simple interface to separate audio into vocal and
    non-vocal components, which is useful for karaoke creation, vocal isolation,
    or instrumental extraction.

    Args:
        audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
        output_path: Directory to save the separated stems (default: 'output' directory)

    Returns:
        tuple[str, str]: Paths to (vocals_file, non_vocals_file)
            - vocals_file: Path to the isolated vocal track
            - non_vocals_file: Path to the combined instrumental track (drums + bass + other)

    Raises:
        RuntimeError: If mixing/writing the instrumental track fails.

    Note:
        The non-vocals track combines drums, bass, and other stems into a single instrumental.
        Uses the same high-quality Demucs model as separate_audio.
        Non-vocals track is automatically mixed and peak-normalized to 95% full scale.
    """
    # Extract all stems via the Demucs pipeline.
    all_stems = separate_audio(audio_path, output_path)
    vocals_path, drums_path, bass_path, other_path = all_stems

    # Create non-vocals by combining drums, bass, and other
    try:
        # Local imports keep the heavy audio stack out of module import time.
        import librosa
        import numpy as np
        import soundfile as sf

        # Load each non-vocal stem at its native rate, preserving channels.
        y_drums, sr_drums = librosa.load(drums_path, sr=None, mono=False)
        y_bass, sr_bass = librosa.load(bass_path, sr=None, mono=False)
        y_other, sr_other = librosa.load(other_path, sr=None, mono=False)

        # Ensure same sample rate: resample everything up to the highest
        # rate among the three stems before summing.
        target_sr = max(sr_drums, sr_bass, sr_other)

        if sr_drums != target_sr:
            y_drums = librosa.resample(y_drums, orig_sr=sr_drums, target_sr=target_sr)
        if sr_bass != target_sr:
            y_bass = librosa.resample(y_bass, orig_sr=sr_bass, target_sr=target_sr)
        if sr_other != target_sr:
            y_other = librosa.resample(y_other, orig_sr=sr_other, target_sr=target_sr)

        # Ensure same length: zero-pad the shorter stems so the arrays can
        # be summed elementwise.
        max_length = max(y_drums.shape[-1], y_bass.shape[-1], y_other.shape[-1])

        def pad_to_length(y, target_length):
            # Pad the last (time) axis with trailing zeros; handles both
            # mono (1-D) and multi-channel (2-D, channels-first) arrays.
            if y.shape[-1] < target_length:
                if y.ndim == 1:
                    return np.pad(y, (0, target_length - y.shape[-1]), mode="constant")
                else:
                    return np.pad(
                        y, ((0, 0), (0, target_length - y.shape[-1])), mode="constant"
                    )
            return y

        y_drums = pad_to_length(y_drums, max_length)
        y_bass = pad_to_length(y_bass, max_length)
        y_other = pad_to_length(y_other, max_length)

        # Combine non-vocal stems by plain summation.
        non_vocals = y_drums + y_bass + y_other

        # Normalize to prevent clipping: scale peak to 0.95 full scale
        # (skipped for all-silent input to avoid division by zero).
        max_val = np.max(np.abs(non_vocals))
        if max_val > 0:
            non_vocals = non_vocals / max_val * 0.95

        # Save non-vocals file: into output_path if given, otherwise next to
        # the separated stems themselves.
        if output_path:
            os.makedirs(output_path, exist_ok=True)
            non_vocals_filename = os.path.join(output_path, "non_vocals.wav")
        else:
            non_vocals_filename = os.path.join(
                os.path.dirname(drums_path), "non_vocals.wav"
            )

        # soundfile expects (frames, channels); librosa yields (channels, frames).
        if non_vocals.ndim == 2:
            non_vocals = non_vocals.T

        sf.write(
            non_vocals_filename, non_vocals, target_sr, format="wav", subtype="PCM_16"
        )

        return vocals_path, non_vocals_filename

    except Exception as e:
        raise RuntimeError(f"Error creating non-vocals track: {str(e)}")
247
+
248
+
249
def create_karaoke_track(audio_path: str, output_path: Optional[str] = None) -> str:
    """
    Create a karaoke (instrumental) track by removing vocals from an audio file.

    Convenience wrapper around extract_vocal_non_vocal() that keeps only the
    instrumental (drums + bass + other) output.

    Args:
        audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
        output_path: Directory to save the karaoke track (default: 'output' directory)

    Returns:
        Path to the karaoke (instrumental) audio file.

    Note:
        Uses the same high-quality Demucs model as separate_audio; the
        instrumental mix is automatically normalized for consistent volume.
    """
    # Only the instrumental half is needed; the vocals path is discarded.
    _, karaoke_path = extract_vocal_non_vocal(audio_path, output_path)
    return karaoke_path
274
+
275
+
276
if __name__ == "__main__":
    # CLI entry point: four subcommands wrapping the separation helpers above.
    parser = argparse.ArgumentParser(
        description="Separate audio into stems using Demucs"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Original separate command: all four stems.
    separate_parser = subparsers.add_parser(
        "separate", help="Separate into all four stems"
    )
    separate_parser.add_argument("audio_path", help="Path to the input audio file")
    separate_parser.add_argument(
        "--output-dir", help="Directory to save separated stems (default: output)"
    )

    # New selective stems command: only the stems named on the command line.
    select_parser = subparsers.add_parser("select", help="Extract specific stems only")
    select_parser.add_argument("audio_path", help="Path to the input audio file")
    select_parser.add_argument(
        "stems",
        nargs="+",
        choices=["vocals", "drums", "bass", "other"],
        help="Stems to extract (choose from: vocals, drums, bass, other)",
    )
    select_parser.add_argument(
        "--output-dir", help="Directory to save separated stems (default: output)"
    )

    # New vocal/non-vocal command: vocals plus a combined instrumental.
    vocal_parser = subparsers.add_parser(
        "vocal-nonvocal", help="Extract vocals and instrumental only"
    )
    vocal_parser.add_argument("audio_path", help="Path to the input audio file")
    vocal_parser.add_argument(
        "--output-dir", help="Directory to save separated stems (default: output)"
    )

    # New karaoke command: instrumental only.
    karaoke_parser = subparsers.add_parser(
        "karaoke", help="Create karaoke (instrumental) track"
    )
    karaoke_parser.add_argument("audio_path", help="Path to the input audio file")
    karaoke_parser.add_argument(
        "--output-dir", help="Directory to save karaoke track (default: output)"
    )

    args = parser.parse_args()

    # No subcommand given: show usage and exit nonzero.
    if not args.command:
        parser.print_help()
        exit(1)

    try:
        if args.command == "separate":
            vocals, drums, bass, other = separate_audio(
                args.audio_path, args.output_dir
            )
            print(f"Vocals: {vocals}")
            print(f"Drums: {drums}")
            print(f"Bass: {bass}")
            print(f"Other: {other}")

        elif args.command == "select":
            selected_stems = extract_selected_stems(
                args.audio_path, args.stems, args.output_dir
            )
            for stem, path in selected_stems.items():
                print(f"{stem.capitalize()}: {path}")

        elif args.command == "vocal-nonvocal":
            vocals_path, non_vocals_path = extract_vocal_non_vocal(
                args.audio_path, args.output_dir
            )
            print(f"Vocals: {vocals_path}")
            print(f"Non-vocals (Instrumental): {non_vocals_path}")

        elif args.command == "karaoke":
            karaoke_path = create_karaoke_track(args.audio_path, args.output_dir)
            print(f"Karaoke track: {karaoke_path}")

    except Exception as e:
        # Report any failure and exit nonzero, like the sibling CLI tools.
        print(f"Error: {e}")
        exit(1)
tools/time_strech.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional, Tuple
3
+
4
+ import librosa
5
+ import soundfile as sf
6
+
7
+
8
def align_songs_by_bpm(
    audio1_path: str, audio2_path: str, output_path: Optional[str] = None
) -> Tuple[str, str]:
    """
    Align two songs to the same BPM by stretching both to the slower tempo.

    The tempo of each track is estimated with librosa's beat tracker, and
    both tracks are then time-stretched (pitch-preserving) to the slower of
    the two BPMs, making them suitable for mixing or mashups.

    Args:
        audio1_path: Path to first audio file (supports common formats: WAV, MP3, FLAC)
        audio2_path: Path to second audio file (supports common formats: WAV, MP3, FLAC)
        output_path: Optional output directory (default: None, stretch_to_bpm
            falls back to an 'output' directory)

    Returns:
        Tuple of (aligned_audio1_path, aligned_audio2_path): Paths to the processed audio files
        Both files will have the same BPM (the slower of the two original tempos)

    Examples:
        - Song A: 140 BPM, Song B: 128 BPM → Both become 128 BPM
        - Song A: 120 BPM, Song B: 130 BPM → Both become 120 BPM

    Raises:
        RuntimeError: If tempo estimation or stretching fails.
    """
    try:
        # Estimate the tempo of each track.
        y1, sr1 = librosa.load(audio1_path)
        y2, sr2 = librosa.load(audio2_path)
        tempo1, _ = librosa.beat.beat_track(y=y1, sr=sr1)
        tempo2, _ = librosa.beat.beat_track(y=y2, sr=sr2)

        # Align to the slower tempo.  min() replaces the previous duplicated
        # if/else whose two branches were identical except for which BPM was
        # passed through.
        target_bpm = min(float(tempo1), float(tempo2))

        # Both tracks go through stretch_to_bpm so both outputs land in the
        # same directory with consistent "_stretched_to_<bpm>_bpm" names.
        aligned1_path = stretch_to_bpm(audio1_path, target_bpm, output_path)
        aligned2_path = stretch_to_bpm(audio2_path, target_bpm, output_path)

        return aligned1_path, aligned2_path

    except Exception as e:
        raise RuntimeError(f"Error aligning audio files: {str(e)}") from e
61
+
62
+
63
def stretch_to_bpm(
    audio_path: str, target_bpm: float, output_path: Optional[str] = None
) -> str:
    """
    Stretch an audio file to a specific BPM without changing its pitch.

    Args:
        audio_path: Path to audio file
        target_bpm: Target BPM to stretch to (must be > 0)
        output_path: Directory for the output file (default: 'output')

    Returns:
        Path to the stretched audio file, named
        "<basename>_stretched_to_<bpm>_bpm.wav" inside the output directory.

    Raises:
        RuntimeError: If tempo estimation fails, no tempo can be detected,
            or the stretched audio cannot be written.
    """
    try:
        # Native-rate, multi-channel load used for the actual stretching.
        y, sr = librosa.load(audio_path, sr=None, mono=False)

        # Tempo is estimated on librosa's default mono/22.05 kHz load, which
        # the beat tracker expects.
        y_hat, sr_hat = librosa.load(audio_path)
        tempo, _ = librosa.beat.beat_track(y=y_hat, sr=sr_hat)
        current_bpm = float(tempo)

        # Guard against a zero tempo estimate, which would otherwise
        # surface as a confusing ZeroDivisionError.
        if current_bpm <= 0:
            raise ValueError(f"Could not detect tempo of {audio_path}")

        # rate > 1 speeds up, rate < 1 slows down.
        stretch_factor = target_bpm / current_bpm
        y_stretched = librosa.effects.time_stretch(y, rate=stretch_factor)

        if not output_path:
            output_path = "output"
        os.makedirs(output_path, exist_ok=True)

        # Strip the extension for the output name (splitext handles any
        # extension, not just ".wav" as the old replace() did).
        original_audio_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_file_path = os.path.join(
            output_path,
            f"{original_audio_filename}_stretched_to_{int(target_bpm)}_bpm.wav",
        )

        # soundfile expects (frames, channels); librosa yields (channels, frames).
        if y_stretched.ndim == 2:
            y_stretched = y_stretched.T

        sf.write(output_file_path, y_stretched, sr)

        return output_file_path

    except Exception as e:
        raise RuntimeError(f"Error stretching audio: {str(e)}") from e
111
+
112
+
113
if __name__ == "__main__":
    import argparse

    # CLI entry point: two subcommands wrapping the tempo helpers above.
    parser = argparse.ArgumentParser(description="Time stretch audio files")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Align two songs by BPM (both stretched to the slower tempo).
    align_parser = subparsers.add_parser("align", help="Align two songs to same BPM")
    align_parser.add_argument("audio1", help="Path to first audio file")
    align_parser.add_argument("audio2", help="Path to second audio file")

    # Stretch a single file to a specific BPM.
    stretch_parser = subparsers.add_parser(
        "stretch", help="Stretch audio to specific BPM"
    )
    stretch_parser.add_argument("audio", help="Path to audio file")
    stretch_parser.add_argument("target_bpm", type=float, help="Target BPM")

    args = parser.parse_args()

    try:
        if args.command == "align":
            aligned1, aligned2 = align_songs_by_bpm(args.audio1, args.audio2)
            print(f"Aligned audio 1: {aligned1}")
            print(f"Aligned audio 2: {aligned2}")
        elif args.command == "stretch":
            output = stretch_to_bpm(args.audio, args.target_bpm)
            print(f"Stretched audio saved to: {output}")
        else:
            # No/unknown subcommand: show usage (exit code stays 0 here).
            parser.print_help()
    except Exception as e:
        # Report any failure and exit nonzero.
        print(f"Error: {e}")
        exit(1)
tools/youtube_extract.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import yt_dlp
4
+
5
+
6
def extract_audio_from_youtube(
    youtube_url: str,
    audio_format: str = "wav",
    quality: str = "best",
    output_path: str = "output",
) -> str:
    """
    Extract high-quality audio from a YouTube video URL using yt-dlp.

    Downloads the best available audio stream and converts it to the
    requested format via yt-dlp's FFmpeg post-processor.

    Args:
        youtube_url: YouTube video URL (e.g. https://www.youtube.com/watch?v=...)
        audio_format: Output audio format (default: 'wav').
            Supported: 'wav' (uncompressed), 'mp3' (compressed), 'flac' (lossless)
        quality: Audio quality selection (default: 'best').
            'best' requests 192 kbps from the extractor; anything else 128 kbps.
        output_path: Directory to save the extracted audio (default: 'output').
            Created if it does not exist.

    Returns:
        Path to the extracted audio file in the specified format.

    Raises:
        RuntimeError: If the download fails or no output file can be located.

    Note:
        Requires an internet connection and FFmpeg on PATH; processing time
        depends on video length and connection speed.
    """
    try:
        # Fall back to "output" if the caller passed an empty string/None.
        output_path = output_path or "output"
        os.makedirs(output_path, exist_ok=True)

        # yt-dlp configuration: grab the best audio stream and transcode it
        # to the requested format with FFmpeg.
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": os.path.join(output_path, "%(title)s.%(ext)s"),
            "postprocessors": [
                {
                    "key": "FFmpegExtractAudio",
                    "preferredcodec": audio_format,
                    "preferredquality": "192" if quality == "best" else "128",
                }
            ],
            "quiet": True,
            "no_warnings": True,
        }

        # Download and extract the audio.
        with yt_dlp.YoutubeDL(params=ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)
            video_title = info.get("title", "audio")

            ydl.download([youtube_url])

        # Expected location based on the raw title.
        expected_filename = f"{video_title}.{audio_format}"
        audio_path = os.path.join(output_path, expected_filename)

        if not os.path.exists(audio_path):
            # yt-dlp sanitizes titles containing characters that are invalid
            # in filenames, so "<title>.<ext>" may not exist verbatim. Fall
            # back to the most recently written matching file — the one this
            # call just produced — rather than an arbitrary directory entry.
            candidates = [
                os.path.join(output_path, f)
                for f in os.listdir(output_path)
                if f.endswith(f".{audio_format}")
            ]
            if not candidates:
                raise RuntimeError("Audio file not found after download")
            audio_path = max(candidates, key=os.path.getmtime)

        return audio_path

    except Exception as e:
        # Chain the original exception so the full traceback is preserved.
        raise RuntimeError(f"Error extracting audio from YouTube: {str(e)}") from e
86
+
87
+
88
def get_video_info(youtube_url: str) -> dict:
    """
    Get information about a YouTube video without downloading it.

    Args:
        youtube_url: YouTube video URL.

    Returns:
        Dictionary with video metadata: title, duration (seconds), uploader,
        upload_date, view_count, description, and thumbnail URL. Missing
        fields are None.

    Raises:
        RuntimeError: If the metadata lookup fails (bad URL, network error).
    """
    try:
        # Metadata-only configuration: never download the media stream.
        ydl_opts = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
        }

        with yt_dlp.YoutubeDL(params=ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)

        return {
            "title": info.get("title"),
            "duration": info.get("duration"),
            "uploader": info.get("uploader"),
            "upload_date": info.get("upload_date"),
            "view_count": info.get("view_count"),
            "description": info.get("description"),
            "thumbnail": info.get("thumbnail"),
        }

    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"Error getting video info: {str(e)}") from e
120
+
121
+
122
if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Extract audio from YouTube videos")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Sub-command: download and extract the audio track.
    extract_parser = subparsers.add_parser(
        "extract", help="Extract audio from YouTube URL"
    )
    extract_parser.add_argument("url", help="YouTube video URL")
    extract_parser.add_argument(
        "--format",
        default="wav",
        choices=["wav", "mp3", "flac", "m4a"],
        help="Output audio format (default: wav)",
    )
    extract_parser.add_argument(
        "--quality",
        default="best",
        choices=["best", "worst"],
        help="Audio quality (default: best)",
    )

    # Sub-command: print metadata without downloading.
    info_parser = subparsers.add_parser("info", help="Get video information")
    info_parser.add_argument("url", help="YouTube video URL")

    args = parser.parse_args()

    try:
        if args.command == "extract":
            audio_path = extract_audio_from_youtube(args.url, args.format, args.quality)
            print(f"Audio extracted to: {audio_path}")
        elif args.command == "info":
            info = get_video_info(args.url)
            print(f"Title: {info['title']}")
            print(f"Duration: {info['duration']} seconds")
            print(f"Uploader: {info['uploader']}")
            print(f"Upload date: {info['upload_date']}")
            print(f"Views: {info['view_count']}")
        else:
            # No sub-command given: show usage instead of failing silently.
            parser.print_help()
    except Exception as e:
        # Report failures on stderr and use sys.exit (not the site builtin
        # `exit`, which is absent under `python -S` and frozen interpreters)
        # so the non-zero exit code is reliable for shell pipelines.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)