frascuchon HF Staff commited on
Commit
801ea60
·
1 Parent(s): f086c75

All the tools and gradio server

Browse files
.gitignore ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # PyInstaller
27
+ # Usually these files are written by a python script from a template
28
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
29
+ *.manifest
30
+ *.spec
31
+
32
+ # Installer logs
33
+ pip-log.txt
34
+ pip-delete-this-directory.txt
35
+
36
+ # Unit test / coverage reports
37
+ htmlcov/
38
+ .tox/
39
+ .nox/
40
+ .coverage
41
+ .coverage.*
42
+ .cache
43
+ nosetests.xml
44
+ coverage.xml
45
+ *.cover
46
+ .hypothesis/
47
+ .pytest_cache/
48
+
49
+ # Jupyter Notebook
50
+ .ipynb_checkpoints
51
+
52
+ # pyenv
53
+ .python-version
54
+
55
+ # mypy
56
+ .mypy_cache/
57
+ .dmypy.json
58
+ dmypy.json
59
+
60
+ # Pyre type checker
61
+ .pyre/
62
+
63
+ # VS Code
64
+ .vscode/
65
+
66
+ # Local env
67
+ .env
68
+ .venv
69
+ env/
70
+ venv/
71
+ ENV/
72
+ env.bak/
73
+ venv.bak/
74
+
75
+ # MacOS
76
+ .DS_Store
77
+
78
+ # IDEs
79
+ .idea/
80
+ *.iml
81
+ *.sublime-workspace
82
+ *.sublime-project
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
- app_file: app.py
9
  pinned: false
10
  ---
11
 
 
5
  colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
+ app_file: mcp_server.py
9
  pinned: false
10
  ---
11
 
mcp_server.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from typing import Dict, Tuple
3
+
4
+ from tools.combine_tracks import combine_tracks, create_medley
5
+ from tools.stems_separation import (
6
+ separate_audio,
7
+ extract_selected_stems,
8
+ extract_vocal_non_vocal,
9
+ create_karaoke_track,
10
+ )
11
+ from tools.time_strech import align_songs_by_bpm, stretch_to_bpm
12
+ from tools.youtube_extract import extract_audio_from_youtube
13
+
14
+
15
def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:
    """
    Shift the pitch of an audio file by a specified number of semitones.

    Uses librosa's pitch shifting to change the musical pitch of an audio file
    while maintaining its tempo and duration.

    Args:
        audio_path: Path to the input audio file (common formats: WAV, MP3, FLAC)
        semitones: Number of semitones to shift (positive = higher pitch,
            negative = lower pitch). Sensible range: -12 to +12 (one octave).

    Returns:
        Path to the pitch-shifted audio file in WAV format, or the original
        path unchanged when semitones == 0.

    Examples:
        - semitones=2: shift up by 2 semitones (1 whole tone)
        - semitones=-5: shift down by 5 semitones (1 perfect fourth)
        - semitones=0: no change (returns original file)

    Note:
        Creates a temporary WAV file that should be cleaned up by the caller.
    """
    if semitones == 0:
        # Fast path: nothing to do, hand back the original file untouched.
        return audio_path

    # Imports are deferred so the zero-shift fast path needs no audio stack.
    import tempfile

    import librosa
    import soundfile as sf

    # sr=None keeps the native sample rate; mono=False preserves channels.
    y, sr = librosa.load(audio_path, sr=None, mono=False)

    # Shift pitch while keeping tempo and duration.
    y_shifted = librosa.effects.pitch_shift(y, n_steps=semitones, sr=sr)

    # Close the temp-file handle BEFORE soundfile writes to the same path:
    # on Windows an open NamedTemporaryFile cannot be reopened by another
    # writer, so the previous open-handle pattern failed there.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp.close()
    if y_shifted.ndim == 2:
        # librosa uses (channels, samples); soundfile expects (samples, channels).
        y_shifted = y_shifted.T
    sf.write(tmp.name, y_shifted, sr, format="wav", subtype="PCM_16")
    return tmp.name
58
+
59
+
60
def stretch_audio_to_bpm_wrapper(audio_path: str, target_bpm: float) -> str:
    """Time-stretch an audio file to a target tempo while preserving pitch.

    Thin adapter over ``tools.time_strech.stretch_to_bpm`` so the Gradio/MCP
    layer exposes a documented, typed entry point. The underlying tool
    detects the source BPM and computes the stretch factor itself.

    Args:
        audio_path: Path to the input audio file (e.g. WAV, MP3, FLAC).
        target_bpm: Desired tempo in beats per minute. Typical range is
            60-200 BPM (90 slow, 120 standard pop/rock, 128 electronic,
            140 fast).

    Returns:
        Path to the time-stretched audio file in WAV format.
    """
    return stretch_to_bpm(audio_path, target_bpm)
86
+
87
+
88
def extract_selected_stems_wrapper(
    audio_path: str, vocals: bool, drums: bool, bass: bool, other: bool
) -> Dict[str, str]:
    """
    Extract only the user-selected stems from an audio file.

    Lets callers request a subset of the four Demucs stems instead of all of
    them, saving storage when only certain elements are needed. Demucs still
    separates all stems internally, so processing time matches a full
    separation.

    Args:
        audio_path: Path to the input audio file (e.g. WAV, MP3, FLAC, M4A)
        vocals: Whether to extract the vocals stem
        drums: Whether to extract the drums stem
        bass: Whether to extract the bass stem
        other: Whether to extract the other stem

    Returns:
        dict[str, str]: Mapping from stem name to extracted file path.

    Raises:
        ValueError: If no stem is selected.

    Examples:
        - vocals=True, drums=True: extract only vocals and drums
        - vocals=True only: isolate vocals for karaoke
        - drums=True, bass=True: extract the rhythm section
    """
    # Insertion order of this dict fixes the stem order: vocals, drums,
    # bass, other — the same order the original flag chain produced.
    selection = {"vocals": vocals, "drums": drums, "bass": bass, "other": other}
    requested = [stem for stem, wanted in selection.items() if wanted]

    if not requested:
        raise ValueError("At least one stem must be selected for extraction")

    return extract_selected_stems(audio_path, requested)
131
+
132
+
133
def extract_vocal_non_vocal_wrapper(audio_path: str) -> Tuple[str, str]:
    """Split a song into an isolated vocal track and an instrumental track.

    Thin adapter over ``tools.stems_separation.extract_vocal_non_vocal`` so
    the Gradio/MCP layer exposes a documented, typed entry point. Useful for
    karaoke creation, vocal isolation, or instrumental extraction.

    Args:
        audio_path: Path to the input audio file (e.g. WAV, MP3, FLAC, M4A).

    Returns:
        tuple[str, str]: ``(vocals_file, instrumental_file)`` paths. The
        instrumental file combines the drums, bass and other stems into a
        single mixed, normalized track.

    Examples:
        - extract_vocal_non_vocal_wrapper('song.mp3'): vocals + instrumental
        - extract_vocal_non_vocal_wrapper('song.wav'): vocal and backing tracks
    """
    return extract_vocal_non_vocal(audio_path)
159
+
160
+
161
def create_karaoke_track_wrapper(audio_path: str) -> str:
    """Produce a karaoke-ready instrumental by stripping the vocals.

    Convenience adapter over ``tools.stems_separation.create_karaoke_track``:
    the drums, bass and other stems are combined into one normalized
    instrumental track, using the same Demucs model as full separation.

    Args:
        audio_path: Path to the input audio file (e.g. WAV, MP3, FLAC, M4A).

    Returns:
        Path to the karaoke (instrumental) audio file.

    Examples:
        - create_karaoke_track_wrapper('song.mp3'): karaoke version
        - create_karaoke_track_wrapper('song.wav'): instrumental backing track
    """
    return create_karaoke_track(audio_path)
185
+
186
+
187
def create_interface():
    """Build the Gradio TabbedInterface exposing all audio tools.

    Returns:
        gr.TabbedInterface: Ten tabs — stem separation, track combination,
        pitch shift, time stretch, BPM alignment, selective stems,
        vocal/instrumental split, karaoke creation, medley creation, and
        YouTube extraction.
    """
    # NOTE: a duplicated second definition of the YouTube interface (which
    # re-created the same gr.Interface and immediately overwrote the first)
    # was removed; only one definition is needed.

    # Tab 1: Stem Separation
    stem_interface = gr.Interface(
        fn=separate_audio,
        inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
        outputs=[
            gr.Audio(label="Vocals", type="filepath"),
            gr.Audio(label="Drums", type="filepath"),
            gr.Audio(label="Bass", type="filepath"),
            gr.Audio(label="Other", type="filepath"),
        ],
        title="Audio Stem Separation",
        description="Upload an audio file to separate it into vocals, drums, bass, and other stems.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 2: Track Combination
    combine_interface = gr.Interface(
        fn=combine_tracks,
        inputs=[
            gr.Audio(type="filepath", label="First Audio Track", sources=["upload"]),
            gr.Audio(type="filepath", label="Second Audio Track", sources=["upload"]),
            gr.Slider(
                minimum=0.0, maximum=1.0, value=0.5, label="Weight for First Track"
            ),
            gr.Slider(
                minimum=0.0, maximum=1.0, value=0.5, label="Weight for Second Track"
            ),
            gr.Checkbox(value=True, label="Normalize Output"),
            gr.Number(value=0.0, label="Fade In Duration (seconds)"),
            gr.Number(value=0.0, label="Fade Out Duration (seconds)"),
        ],
        outputs=gr.Audio(label="Combined Track", type="filepath"),
        title="Combine Audio Tracks",
        description="Combine two audio tracks with adjustable weights and optional fade effects.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 3: Pitch Alignment
    pitch_interface = gr.Interface(
        fn=pitch_shift_with_semitones,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
            gr.Number(value=0, label="Semitones to Shift"),
        ],
        outputs=gr.Audio(label="Pitch Shifted Audio", type="filepath"),
        title="Pitch Shift Audio",
        description="Shift the pitch of an audio file by specified semitones.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 4: Time Stretching
    # Fixed: used deprecated allow_flagging="never"; switched to
    # flagging_mode="never" for consistency with every other tab (Gradio 5 API).
    stretch_interface = gr.Interface(
        fn=stretch_audio_to_bpm_wrapper,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
            gr.Number(value=120, label="Target BPM"),
        ],
        outputs=gr.Audio(label="Stretched Audio", type="filepath"),
        title="Stretch Audio to BPM",
        description="Stretch audio to match a specific BPM.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 5: BPM Alignment
    bpm_interface = gr.Interface(
        fn=align_songs_by_bpm,
        inputs=[
            gr.Audio(type="filepath", label="First Audio Track", sources=["upload"]),
            gr.Audio(type="filepath", label="Second Audio Track", sources=["upload"]),
        ],
        outputs=[
            gr.Audio(label="Aligned First Track", type="filepath"),
            gr.Audio(label="Aligned Second Track", type="filepath"),
        ],
        title="Align Songs by BPM",
        description="Align two songs to the same BPM by stretching the faster one to match the slower one.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 6: Selective Stem Extraction
    selective_interface = gr.Interface(
        fn=extract_selected_stems_wrapper,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
            gr.Checkbox(value=True, label="Extract Vocals"),
            gr.Checkbox(value=True, label="Extract Drums"),
            gr.Checkbox(value=True, label="Extract Bass"),
            gr.Checkbox(value=True, label="Extract Other"),
        ],
        outputs=gr.JSON(label="Extracted Stems"),
        title="Selective Stem Extraction",
        description="Extract only specific stems from an audio file to save processing time and storage.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 7: Vocal/Non-Vocal Separation
    vocal_nonvocal_interface = gr.Interface(
        fn=extract_vocal_non_vocal_wrapper,
        inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
        outputs=[
            gr.Audio(label="Vocals Track", type="filepath"),
            gr.Audio(label="Instrumental Track", type="filepath"),
        ],
        title="Vocal/Instrumental Separation",
        description="Separate audio into vocal and instrumental components for karaoke or vocal isolation.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 8: Karaoke Track Creation
    karaoke_interface = gr.Interface(
        fn=create_karaoke_track_wrapper,
        inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
        outputs=gr.Audio(label="Karaoke Track", type="filepath"),
        title="Create Karaoke Track",
        description="Create a karaoke-ready instrumental track by removing vocals from any song.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 9: Medley Creation
    medley_interface = gr.Interface(
        fn=create_medley,
        inputs=[
            gr.Audio(type="filepath", label="Vocals Stem", sources=["upload"]),
            gr.Audio(type="filepath", label="Instrumental Stem", sources=["upload"]),
            gr.Number(
                value=1.2, label="Vocals Gain", minimum=0.1, maximum=3.0, step=0.1
            ),
            gr.Number(
                value=0.9, label="Instrumental Gain", minimum=0.1, maximum=3.0, step=0.1
            ),
            gr.Textbox(
                value="threshold=-18dB:ratio=3:attack=50:release=200",
                label="Compressor Settings",
                placeholder="threshold=-18dB:ratio=3:attack=50:release=200",
            ),
            gr.Dropdown(
                choices=["libmp3lame", "aac", "flac", "pcm_s16le"],
                value="libmp3lame",
                label="Audio Codec",
            ),
            gr.Textbox(value="192k", label="Audio Bitrate", placeholder="192k"),
        ],
        outputs=gr.Audio(label="Medley Audio", type="filepath"),
        title="Create Vocal/Instrumental Medley",
        description="Mix vocals and instrumental stems into a polished medley with compression and gain control.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    # Tab 10: YouTube Extraction
    youtube_interface = gr.Interface(
        fn=extract_audio_from_youtube,
        inputs=[
            gr.Textbox(
                label="YouTube URL", placeholder="https://www.youtube.com/watch?v=..."
            ),
            gr.Dropdown(
                choices=["wav", "mp3", "flac"], value="wav", label="Output Format"
            ),
            gr.Dropdown(choices=["best", "worst"], value="best", label="Audio Quality"),
        ],
        outputs=gr.Audio(label="Extracted Audio", type="filepath"),
        title="Extract Audio from YouTube",
        description="Extract audio from a YouTube video URL.",
        examples=None,
        cache_examples=False,
        flagging_mode="never",
    )

    return gr.TabbedInterface(
        [
            stem_interface,
            combine_interface,
            pitch_interface,
            stretch_interface,
            bpm_interface,
            selective_interface,
            vocal_nonvocal_interface,
            karaoke_interface,
            medley_interface,
            youtube_interface,
        ],
        [
            "Stem Separation",
            "Track Combination",
            "Pitch Alignment",
            "Time Stretching",
            "BPM Alignment",
            "Selective Stems",
            "Vocal/Instrumental",
            "Karaoke Creation",
            "Medley Creation",
            "YouTube Extraction",
        ],
    )
422
+
423
+
424
+ if __name__ == "__main__":
425
+ interface = create_interface()
426
+ interface.launch(server_name="0.0.0.0", server_port=7860, mcp_server=True)
mypy.ini ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [mypy-untyped_package.*]
2
+ follow_untyped_imports = True
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio[mcp]>=5.0.0
2
+ librosa>=0.10.0
3
+ numpy>=1.24.0
4
+ torch>=2.0.0
5
+ torchaudio>=2.0.0
6
+ transformers>=4.30.0
7
+ soundfile>=0.12.0
8
+ pydub>=0.25.0
9
+ demucs>=4.0.0
10
+ pytest>=7.0.0
11
+ ruff>=0.1.0
12
+ mypy>=1.0.0
13
+ smolagents[mcp]
14
+ yt_dlp>=2025.11.12
tools/__init__.py ADDED
File without changes
tools/combine_tracks.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import librosa
8
+ import numpy as np
9
+ import soundfile as sf
10
+
11
+
12
def combine_tracks(
    track1_path: str,
    track2_path: str,
    weight1: float = 0.5,
    weight2: float = 0.5,
    output_path: Optional[str] = None,
    normalize: bool = True,
    fade_in: float = 0.0,
    fade_out: float = 0.0,
) -> str:
    """
    Combine two audio tracks into a single stereo track with adjustable mixing weights.

    Mixes two audio files together with customizable balance, normalization,
    and fade effects. Useful for creating mashups, adding background music to
    vocals, or layering multiple audio sources.

    Args:
        track1_path: Path to first audio file (common formats: WAV, MP3, FLAC)
        track2_path: Path to second audio file (common formats: WAV, MP3, FLAC)
        weight1: Weight factor for first track (0.0-1.0, default: 0.5)
        weight2: Weight factor for second track (0.0-1.0, default: 0.5)
        output_path: Optional output *directory*; the mix is written to
            '<output_path>/stereo_combined.wav'. Defaults to a fresh temporary
            directory. (Note: this is a directory, not a file path.)
        normalize: Whether to normalize the final output to prevent clipping
        fade_in: Fade in duration in seconds (default: 0.0)
        fade_out: Fade out duration in seconds (default: 0.0)

    Returns:
        Path to the combined audio file in WAV format.

    Raises:
        RuntimeError: If loading, mixing, or writing fails; the original
            exception is preserved as the cause.

    Examples:
        - weight1=0.8, weight2=0.2: first track dominates the mix
        - weight1=0.5, weight2=0.5: equal balance between tracks
        - fade_in=2.0, fade_out=3.0: gradual volume ramp in and out

    Note:
        Tracks with different sample rates are resampled to the first track's
        rate; different lengths are padded with silence to the longer one.
    """
    try:
        # NOTE(review): no sr=None here, so librosa resamples both tracks to
        # its default target rate — confirm this is intended.
        y1, sr1 = librosa.load(track1_path, mono=False)
        y2, sr2 = librosa.load(track2_path, mono=False)

        # Promote mono input to stereo by duplicating the single channel.
        if y1.ndim == 1:
            y1 = np.stack([y1, y1])
        if y2.ndim == 1:
            y2 = np.stack([y2, y2])

        # Resample the second track to match the first track's rate.
        if sr1 != sr2:
            y2 = librosa.resample(y2, orig_sr=sr2, target_sr=sr1)

        # Pad the shorter track with trailing silence so shapes match.
        max_length = max(y1.shape[1], y2.shape[1])
        if y1.shape[1] < max_length:
            y1 = np.pad(y1, ((0, 0), (0, max_length - y1.shape[1])), mode="constant")
        if y2.shape[1] < max_length:
            y2 = np.pad(y2, ((0, 0), (0, max_length - y2.shape[1])), mode="constant")

        # Weighted sum of the two stereo signals.
        combined = weight1 * y1 + weight2 * y2

        # Linear fade-in over the first fade_in seconds.
        if fade_in > 0:
            fade_samples = int(fade_in * sr1)
            if fade_samples > 0:
                fade_curve = np.linspace(0, 1, fade_samples)
                combined[:, :fade_samples] *= fade_curve

        # Linear fade-out over the last fade_out seconds.
        if fade_out > 0:
            fade_samples = int(fade_out * sr1)
            if fade_samples > 0:
                fade_curve = np.linspace(1, 0, fade_samples)
                combined[:, -fade_samples:] *= fade_curve

        # Peak-normalize to 95% full scale to leave a little headroom.
        if normalize:
            max_val = np.max(np.abs(combined))
            if max_val > 0:
                combined = combined / max_val * 0.95

        # output_path is used as a directory (created if missing).
        if output_path:
            os.makedirs(output_path, exist_ok=True)
        else:
            output_path = tempfile.mkdtemp(suffix="_combined")

        final_audio_filename = os.path.join(output_path, "stereo_combined.wav")
        # soundfile expects (samples, channels), hence the transpose.
        sf.write(final_audio_filename, combined.T, sr1, format="wav", subtype="PCM_16")

        return final_audio_filename

    except Exception as e:
        # Chain the original exception so the real failure is not lost.
        raise RuntimeError(f"Error combining tracks: {str(e)}") from e
113
+
114
+
115
def create_stereo_mix(
    left_track_path: str,
    right_track_path: str,
    output_path: Optional[str] = None,
    normalize: bool = True,
) -> str:
    """
    Create a stereo track with one track in the left channel and another in the right.

    Args:
        left_track_path: Path to audio file for the left channel
        right_track_path: Path to audio file for the right channel
        output_path: Optional output *directory*; the mix is written to
            '<output_path>/stereo_mix.wav'. Defaults to a fresh temporary
            directory. (Note: this is a directory, not a file path.)
        normalize: Whether to peak-normalize the final output

    Returns:
        Path to the stereo audio file.

    Raises:
        RuntimeError: If loading, mixing, or writing fails; the original
            exception is preserved as the cause.
    """
    try:
        # Each side is loaded mono; together they form the two channels.
        # NOTE(review): no sr=None here, so librosa resamples to its default
        # target rate — confirm this is intended.
        y_left, sr_left = librosa.load(left_track_path, mono=True)
        y_right, sr_right = librosa.load(right_track_path, mono=True)

        # Resample the right channel to match the left channel's rate.
        if sr_left != sr_right:
            y_right = librosa.resample(y_right, orig_sr=sr_right, target_sr=sr_left)

        # Pad the shorter channel with trailing silence.
        max_length = max(len(y_left), len(y_right))
        if len(y_left) < max_length:
            y_left = np.pad(y_left, (0, max_length - len(y_left)), mode="constant")
        if len(y_right) < max_length:
            y_right = np.pad(y_right, (0, max_length - len(y_right)), mode="constant")

        # Stack as (channels, samples): row 0 = left, row 1 = right.
        stereo = np.array([y_left, y_right])

        # Peak-normalize to 95% full scale to leave headroom.
        if normalize:
            max_val = np.max(np.abs(stereo))
            if max_val > 0:
                stereo = stereo / max_val * 0.95

        # output_path is used as a directory (created if missing).
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_combined")
        else:
            os.makedirs(output_path, exist_ok=True)

        final_audio_filename = os.path.join(output_path, "stereo_mix.wav")
        # soundfile expects (samples, channels), hence the transpose.
        sf.write(
            final_audio_filename, stereo.T, sr_left, format="wav", subtype="PCM_16"
        )

        return final_audio_filename

    except Exception as e:
        # Chain the original exception so the real failure is not lost.
        raise RuntimeError(f"Error creating stereo mix: {str(e)}") from e
175
+
176
+
177
def create_medley(
    vocals_path: str,
    instrumental_path: str,
    *,
    output_path: Optional[str] = None,
    vocals_gain: float = 1.2,
    instrumental_gain: float = 0.9,
    compressor: str = "threshold=-18dB:ratio=3:attack=50:release=200",
    audio_codec: str = "libmp3lame",
    audio_bitrate: str = "192k",
) -> str:
    """Mix a vocal stem with an instrumental stem via an ffmpeg filter graph.

    Args:
        vocals_path: Absolute path (or MCP-accessible URI) to the vocals stem.
        instrumental_path: Absolute path (or MCP-accessible URI) to the
            instrumental/no-vocals stem.
        output_path: Where to write the medley; defaults to a file inside a
            fresh temporary directory.
        vocals_gain: Linear gain applied to the vocals stem (1.0 = unity).
        instrumental_gain: Linear gain applied to the instrumental stem.
        compressor: ffmpeg ``acompressor`` parameters applied after mixing
            for peak control.
        audio_codec: Codec passed to ffmpeg's ``-c:a`` flag.
        audio_bitrate: Bitrate passed to ffmpeg's ``-b:a`` flag.

    Returns:
        Path to the rendered medley file.

    Raises:
        FileNotFoundError: If either stem does not exist.
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    vocals = Path(vocals_path).expanduser().resolve()
    instrumental = Path(instrumental_path).expanduser().resolve()

    # Validate both stems up front so ffmpeg never sees a missing input.
    for label, stem in (("Vocals", vocals), ("Instrumental", instrumental)):
        if not stem.exists():
            raise FileNotFoundError(f"{label} stem not found: {stem}")

    if output_path is None:
        work_dir = tempfile.mkdtemp(prefix="mcp-medley-")
        output = Path(work_dir) / "unidos_hyper_medley.mp3"
    else:
        output = Path(output_path).expanduser().resolve()
        output.parent.mkdir(parents=True, exist_ok=True)

    # Filter graph: gain each stem, mix to one stream, then compress peaks.
    graph = ";".join(
        [
            f"[0:a]volume={vocals_gain}[v]",
            f"[1:a]volume={instrumental_gain}[i]",
            f"[v][i]amix=inputs=2:duration=longest:dropout_transition=2,"
            f"acompressor={compressor}",
        ]
    )

    command = [
        "ffmpeg",
        "-y",
        "-i",
        str(vocals),
        "-i",
        str(instrumental),
        "-filter_complex",
        graph,
        "-c:a",
        audio_codec,
        "-b:a",
        audio_bitrate,
        str(output),
    ]

    # shell=False (list form): arguments are never shell-interpreted.
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(
            f"ffmpeg failed ({result.returncode}):\n"
            f"STDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
        )

    return str(output)
260
+
261
+
262
+ if __name__ == "__main__":
263
+ import argparse
264
+
265
+ parser = argparse.ArgumentParser(description="Combine audio tracks")
266
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
267
+
268
+ # Combine tracks with weights
269
+ combine_parser = subparsers.add_parser(
270
+ "combine", help="Combine two tracks with weights"
271
+ )
272
+ combine_parser.add_argument("track1", help="Path to first audio file")
273
+ combine_parser.add_argument("track2", help="Path to second audio file")
274
+ combine_parser.add_argument(
275
+ "--weight1", type=float, default=0.5, help="Weight for first track (0.0-1.0)"
276
+ )
277
+ combine_parser.add_argument(
278
+ "--weight2", type=float, default=0.5, help="Weight for second track (0.0-1.0)"
279
+ )
280
+ combine_parser.add_argument(
281
+ "--fade-in", type=float, default=0.0, help="Fade in duration in seconds"
282
+ )
283
+ combine_parser.add_argument(
284
+ "--fade-out", type=float, default=0.0, help="Fade out duration in seconds"
285
+ )
286
+ combine_parser.add_argument(
287
+ "--no-normalize", action="store_true", help="Disable normalization"
288
+ )
289
+ combine_parser.add_argument(
290
+ "--output", type=str, default="output", help="Output file path"
291
+ )
292
+
293
+ # Create stereo mix
294
+ stereo_parser = subparsers.add_parser(
295
+ "stereo", help="Create stereo mix (left/right channels)"
296
+ )
297
+ stereo_parser.add_argument("left", help="Path to left channel audio file")
298
+ stereo_parser.add_argument("right", help="Path to right channel audio file")
299
+ stereo_parser.add_argument(
300
+ "--no-normalize", action="store_true", help="Disable normalization"
301
+ )
302
+ stereo_parser.add_argument(
303
+ "--output", type=str, default="stereo_output", help="Output file path"
304
+ )
305
+
306
+ # Create medley
307
+ medley_parser = subparsers.add_parser(
308
+ "medley", help="Create a vocal/instrumental medley using ffmpeg"
309
+ )
310
+ medley_parser.add_argument("vocals", help="Path to vocals stem audio file")
311
+ medley_parser.add_argument(
312
+ "instrumental", help="Path to instrumental stem audio file"
313
+ )
314
+ medley_parser.add_argument(
315
+ "--vocals-gain",
316
+ type=float,
317
+ default=1.2,
318
+ help="Linear gain for vocals (default: 1.2)",
319
+ )
320
+ medley_parser.add_argument(
321
+ "--instrumental-gain",
322
+ type=float,
323
+ default=0.9,
324
+ help="Linear gain for instrumental (default: 0.9)",
325
+ )
326
+ medley_parser.add_argument(
327
+ "--compressor",
328
+ type=str,
329
+ default="threshold=-18dB:ratio=3:attack=50:release=200",
330
+ help="FFmpeg acompressor parameters (default: threshold=-18dB:ratio=3:attack=50:release=200)",
331
+ )
332
+ medley_parser.add_argument(
333
+ "--audio-codec",
334
+ type=str,
335
+ default="libmp3lame",
336
+ help="Target audio codec (default: libmp3lame)",
337
+ )
338
+ medley_parser.add_argument(
339
+ "--audio-bitrate",
340
+ type=str,
341
+ default="192k",
342
+ help="Audio bitrate (default: 192k)",
343
+ )
344
+ medley_parser.add_argument(
345
+ "--output", type=str, help="Output file path (default: temporary file)"
346
+ )
347
+
348
+ args = parser.parse_args()
349
+
350
+ try:
351
+ if args.command == "combine":
352
+ output = combine_tracks(
353
+ args.track1,
354
+ args.track2,
355
+ weight1=args.weight1,
356
+ weight2=args.weight2,
357
+ normalize=not args.no_normalize,
358
+ fade_in=args.fade_in,
359
+ fade_out=args.fade_out,
360
+ output_path=args.output,
361
+ )
362
+ print(f"Combined audio saved to: {output}")
363
+ elif args.command == "stereo":
364
+ output = create_stereo_mix(
365
+ args.left,
366
+ args.right,
367
+ normalize=not args.no_normalize,
368
+ output_path=args.output,
369
+ )
370
+ print(f"Stereo mix saved to: {output}")
371
+ elif args.command == "medley":
372
+ output = create_medley(
373
+ args.vocals,
374
+ args.instrumental,
375
+ output_path=args.output,
376
+ vocals_gain=args.vocals_gain,
377
+ instrumental_gain=args.instrumental_gain,
378
+ compressor=args.compressor,
379
+ audio_codec=args.audio_codec,
380
+ audio_bitrate=args.audio_bitrate,
381
+ )
382
+ print(f"Medley saved to: {output}")
383
+ else:
384
+ parser.print_help()
385
+ except Exception as e:
386
+ print(f"Error: {e}")
387
+ exit(1)
tools/pitch_alignment.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Tuple
3
+
4
+ import librosa
5
+ import numpy as np
6
+ import soundfile as sf
7
+
8
+
9
def _load_audio(audio_path: str, mono: bool = False) -> Tuple[np.ndarray, float]:
    """Load an audio file at its native sample rate.

    Args:
        audio_path: Path to the audio file on disk.
        mono: If True, downmix to a single channel; otherwise keep the
            file's original channel layout (default: False).

    Returns:
        Tuple of (audio_data, sample_rate).
    """
    # sr=None preserves the native rate; soxr_vhq is librosa's
    # very-high-quality resampler setting.
    return librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
22
+
23
+
24
def estimate_key(audio_path: str) -> str:
    """
    Estimate the musical key of an audio file from its chroma content.

    The track is analyzed with constant-Q chroma features; the pitch class
    with the highest mean energy across the whole track is reported as the
    key (major/minor mode is not distinguished).

    Args:
        audio_path: Path to audio file (supports common formats: WAV, MP3, FLAC)

    Returns:
        Estimated key as a pitch-class name: one of
        'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'.

    Raises:
        RuntimeError: If the file cannot be loaded or analyzed.

    Note:
        Uses medium-quality resampling for speed. Most accurate for music
        with clear harmonic content; less reliable for atonal or highly
        percussive material.
    """
    try:
        # Medium-quality resampling keeps the analysis fast.
        samples, rate = librosa.load(audio_path, res_type="soxr_mq")

        # Average chroma energy per pitch class over the full track.
        chromagram = librosa.feature.chroma_cqt(y=samples, sr=rate)
        pitch_class_energy = np.mean(chromagram, axis=1)

        # The strongest pitch class is taken as the tonic.
        pitch_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
        return pitch_names[int(np.argmax(pitch_class_energy))]

    except Exception as e:
        raise RuntimeError(f"Error estimating key: {str(e)}")
67
+
68
+
69
def key_to_semitones(key: str, target_key: str = "C") -> int:
    """
    Calculate the smallest semitone shift from one key to another.

    Generalized to accept both sharp ("C#") and flat ("Db") spellings,
    case-insensitively; flats are normalized to their enharmonic sharp
    equivalents. All previously valid inputs behave exactly as before.

    Args:
        key: Source key (e.g. "C", "F#", "Bb").
        target_key: Target key to align to (default: "C").

    Returns:
        Number of semitones to shift, in the range [-5, 6], taking the
        shortest direction around the 12-key circle.

    Raises:
        ValueError: If either key name is not a valid pitch class.
    """
    keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
    # Enharmonic flat spellings mapped onto the sharp-based table above.
    flats = {"DB": "C#", "EB": "D#", "GB": "F#", "AB": "G#", "BB": "A#"}

    def _index(name: str) -> int:
        # Normalize case and whitespace, then fold flats into sharps.
        canonical = str(name).strip().upper()
        canonical = flats.get(canonical, canonical)
        if canonical not in keys:
            raise ValueError("Invalid key name")
        return keys.index(canonical)

    # Calculate semitone difference (wrapping around 12 semitones),
    # preferring the shorter of shifting up vs. shifting down.
    semitones = (_index(target_key) - _index(key)) % 12
    if semitones > 6:
        semitones -= 12

    return semitones
94
+
95
+
96
def align_songs_by_key(
    audio1_path: str,
    audio2_path: str,
    target_key: str = "C",
    output_path: str = "output",
) -> Tuple[str, str]:
    """
    Align two songs to the same musical key by pitch shifting.

    Each track's current key is estimated internally by shift_to_key(),
    which then pitch-shifts the track to the requested target key.

    Args:
        audio1_path: Path to first audio file
        audio2_path: Path to second audio file
        target_key: Target key to align both songs to (default: 'C')
        output_path: Directory to save the aligned audio files

    Returns:
        Tuple of (aligned_audio1_path, aligned_audio2_path) - paths to processed files

    Raises:
        RuntimeError: If key estimation or pitch shifting fails for either track.
    """
    try:
        # shift_to_key() estimates each track's key and loads the audio
        # itself, so no separate analysis or loading is done here.  (The
        # previous version loaded both full files into memory and then
        # discarded the data unused.)
        aligned1_path = shift_to_key(audio1_path, target_key, output_path)
        aligned2_path = shift_to_key(audio2_path, target_key, output_path)

        return aligned1_path, aligned2_path

    except Exception as e:
        raise RuntimeError(f"Error aligning audio keys: {str(e)}") from e
136
+
137
+
138
def shift_to_key(audio_path: str, target_key: str, output_path: str = "output") -> str:
    """
    Shift an audio file to a specific musical key.

    The file's current key is estimated automatically, then the audio is
    pitch-shifted by the semitone difference to the target key while
    preserving duration.

    Args:
        audio_path: Path to audio file
        target_key: Target key to shift to (e.g. 'C', 'F#')
        output_path: Directory to save the shifted audio file

    Returns:
        Path to the pitch-shifted audio file, named
        "<basename>_shifted_to_<target_key>.wav" inside output_path.

    Raises:
        RuntimeError: If key estimation, pitch shifting, or writing fails.
    """
    try:
        # Estimate current key and the shortest shift to the target.
        current_key = estimate_key(audio_path)
        semitones = key_to_semitones(current_key, target_key)

        # Load at native sample rate and apply a duration-preserving shift.
        y, sr = _load_audio(audio_path)
        y_shifted = librosa.effects.pitch_shift(
            y, n_steps=semitones, scale=True, sr=sr, res_type="soxr_vhq"
        )

        # Build the output name from the basename without its extension
        # (splitext handles any extension, not just ".wav"), and keep the
        # audio_path parameter intact instead of shadowing it.
        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        os.makedirs(output_path, exist_ok=True)

        # soundfile expects (frames, channels); librosa yields (channels, frames).
        if y_shifted.ndim == 2:
            y_shifted = y_shifted.T

        final_audio_path = os.path.join(
            output_path, f"{base_name}_shifted_to_{target_key}.wav"
        )
        sf.write(final_audio_path, y_shifted, sr, format="wav", subtype="PCM_16")

        return final_audio_path

    except Exception as e:
        raise RuntimeError(f"Error shifting key: {str(e)}") from e
179
+
180
+
181
if __name__ == "__main__":
    import argparse

    # Small CLI wrapper over the key-estimation / alignment helpers above.
    parser = argparse.ArgumentParser(
        description="Pitch alignment tools for audio files"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Estimate key of a single file
    estimate_parser = subparsers.add_parser(
        "estimate", help="Estimate the key of an audio file"
    )
    estimate_parser.add_argument("audio", help="Path to audio file")

    # Align two songs by key
    align_parser = subparsers.add_parser("align", help="Align two songs to same key")
    align_parser.add_argument("audio1", help="Path to first audio file")
    align_parser.add_argument("audio2", help="Path to second audio file")
    align_parser.add_argument(
        "--target-key", default="C", help="Target key to align to (default: C)"
    )

    # Shift single file to key
    shift_parser = subparsers.add_parser("shift", help="Shift audio to specific key")
    shift_parser.add_argument("audio", help="Path to audio file")
    shift_parser.add_argument("target_key", help="Target key to shift to")

    args = parser.parse_args()

    try:
        if args.command == "estimate":
            key = estimate_key(args.audio)
            print(f"Estimated key: {key}")
        elif args.command == "align":
            aligned1, aligned2 = align_songs_by_key(
                args.audio1, args.audio2, args.target_key
            )
            print(f"Aligned audio 1: {aligned1}")
            print(f"Aligned audio 2: {aligned2}")
        elif args.command == "shift":
            output = shift_to_key(args.audio, args.target_key)
            print(f"Shifted audio saved to: {output}")
        else:
            parser.print_help()
    except Exception as e:
        # Print and exit nonzero like the sibling CLI tools.  Previously a
        # bare `raise e` here dumped a traceback and made exit(1) unreachable.
        print(f"Error: {e}")
        exit(1)
tools/stems_separation.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import subprocess
4
+ from pathlib import Path
5
+ from typing import Tuple, List, Dict, Optional
6
+
7
+
8
class Error(Exception):
    """Base exception for stem-separation failures raised by this module."""

    pass
10
+
11
+
12
def separate_audio(
    audio_path: str, output_path: Optional[str] = None
) -> Tuple[str, str, str, str]:
    """
    Separate audio into vocals, drums, bass, and other stems using Demucs.

    Runs the Demucs neural network model (htdemucs) in a subprocess to split
    a mixed audio file into individual instrument stems.

    Args:
        audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
        output_path: Directory to save the separated stems (default: 'output' directory)

    Returns:
        tuple[str, str, str, str]: Paths to the separated audio files in order:
            - vocals: Isolated vocal track
            - drums: Isolated drum/percussion track
            - bass: Isolated bass track
            - other: Remaining instruments (guitars, keyboards, etc.)

    Raises:
        Error: If the Demucs subprocess fails or an expected stem file is
            missing afterwards.

    Note:
        Processing time depends on audio length and system performance.
        Output files are saved in WAV format for maximum quality.
    """
    try:
        # Prepare the output directory
        if not output_path:
            output_path = "output"

        output_dir = os.path.join(output_path, "separated")
        os.makedirs(output_dir, exist_ok=True)

        # Run Demucs separation in a subprocess (list form, no shell).
        cmd = [
            "python",
            "-m",
            "demucs.separate",
            "--out",
            output_dir,
            "--name",
            "htdemucs",
            audio_path,
        ]

        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            raise Error(f"Demucs separation failed: {result.stderr}")

        # Demucs writes stems to <out>/htdemucs/<track_name>/<stem>.wav
        track_name = Path(audio_path).stem
        htdemucs_dir = os.path.join(output_dir, "htdemucs", track_name)

        vocals_path = os.path.join(htdemucs_dir, "vocals.wav")
        drums_path = os.path.join(htdemucs_dir, "drums.wav")
        bass_path = os.path.join(htdemucs_dir, "bass.wav")
        other_path = os.path.join(htdemucs_dir, "other.wav")

        # Verify all files exist
        for file_path in [vocals_path, drums_path, bass_path, other_path]:
            if not os.path.exists(file_path):
                raise Error(f"Separated file not found: {file_path}")

        return vocals_path, drums_path, bass_path, other_path

    except Error:
        # Our own errors are already descriptive; re-raise them as-is
        # instead of nesting them inside "Error processing audio: ...".
        raise
    except Exception as e:
        raise Error(f"Error processing audio: {str(e)}") from e
87
+
88
+
89
def extract_selected_stems(
    audio_path: str, stems_to_extract: List[str], output_path: Optional[str] = None
) -> Dict[str, str]:
    """
    Extract only specific stems from an audio file.

    Runs the full Demucs separation once, then returns just the requested
    subset of stems, so callers don't have to handle files they don't need.

    Args:
        audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
        stems_to_extract: Stems to keep; valid names are
            'vocals', 'drums', 'bass', 'other'.
        output_path: Directory to save the selected stems (default: 'output' directory)

    Returns:
        dict[str, str]: Mapping from stem name to its file path, one entry
        per requested (valid) stem.

    Raises:
        ValueError: If no valid stem name was requested.

    Note:
        Invalid stem names are ignored with a printed warning.
    """
    valid_stems = ["vocals", "drums", "bass", "other"]

    # Split the request into recognised and unrecognised names.
    invalid_stems = [stem for stem in stems_to_extract if stem not in valid_stems]
    if invalid_stems:
        print(f"Warning: Invalid stem names will be ignored: {invalid_stems}")

    valid_stems_to_extract = [stem for stem in stems_to_extract if stem in valid_stems]
    if not valid_stems_to_extract:
        raise ValueError("No valid stems specified for extraction")

    # Demucs always produces all four stems; map them by name, then keep
    # only the ones the caller asked for (in the order requested).
    vocals_path, drums_path, bass_path, other_path = separate_audio(
        audio_path, output_path
    )
    stem_mapping = {
        "vocals": vocals_path,
        "drums": drums_path,
        "bass": bass_path,
        "other": other_path,
    }
    return {stem: stem_mapping[stem] for stem in valid_stems_to_extract}
147
+
148
+
149
def extract_vocal_non_vocal(
    audio_path: str, output_path: Optional[str] = None
) -> Tuple[str, str]:
    """
    Extract vocals and non-vocals (instrumental) stems from an audio file.

    This function provides a simple interface to separate audio into vocal and
    non-vocal components, which is useful for karaoke creation, vocal isolation,
    or instrumental extraction.

    Args:
        audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
        output_path: Directory to save the separated stems (default: 'output' directory)

    Returns:
        tuple[str, str]: Paths to (vocals_file, non_vocals_file)
            - vocals_file: Path to the isolated vocal track
            - non_vocals_file: Path to the combined instrumental track (drums + bass + other)

    Raises:
        RuntimeError: If mixing/writing the instrumental track fails.

    Note:
        The non-vocals track combines drums, bass, and other stems into a single instrumental.
        Uses the same high-quality Demucs model as separate_audio.
        Non-vocals track is automatically mixed and peak-normalized to 95% full scale.
    """
    # Extract all stems via the Demucs pipeline.
    all_stems = separate_audio(audio_path, output_path)
    vocals_path, drums_path, bass_path, other_path = all_stems

    # Create non-vocals by combining drums, bass, and other
    try:
        # Local imports keep the heavy audio stack out of module import time.
        import librosa
        import numpy as np
        import soundfile as sf

        # Load each non-vocal stem at its native rate, preserving channels.
        y_drums, sr_drums = librosa.load(drums_path, sr=None, mono=False)
        y_bass, sr_bass = librosa.load(bass_path, sr=None, mono=False)
        y_other, sr_other = librosa.load(other_path, sr=None, mono=False)

        # Ensure same sample rate: resample everything up to the highest
        # rate among the three stems before summing.
        target_sr = max(sr_drums, sr_bass, sr_other)

        if sr_drums != target_sr:
            y_drums = librosa.resample(y_drums, orig_sr=sr_drums, target_sr=target_sr)
        if sr_bass != target_sr:
            y_bass = librosa.resample(y_bass, orig_sr=sr_bass, target_sr=target_sr)
        if sr_other != target_sr:
            y_other = librosa.resample(y_other, orig_sr=sr_other, target_sr=target_sr)

        # Ensure same length: zero-pad the shorter stems so the arrays can
        # be summed elementwise.
        max_length = max(y_drums.shape[-1], y_bass.shape[-1], y_other.shape[-1])

        def pad_to_length(y, target_length):
            # Pad the last (time) axis with trailing zeros; handles both
            # mono (1-D) and multi-channel (2-D, channels-first) arrays.
            if y.shape[-1] < target_length:
                if y.ndim == 1:
                    return np.pad(y, (0, target_length - y.shape[-1]), mode="constant")
                else:
                    return np.pad(
                        y, ((0, 0), (0, target_length - y.shape[-1])), mode="constant"
                    )
            return y

        y_drums = pad_to_length(y_drums, max_length)
        y_bass = pad_to_length(y_bass, max_length)
        y_other = pad_to_length(y_other, max_length)

        # Combine non-vocal stems by plain summation.
        non_vocals = y_drums + y_bass + y_other

        # Normalize to prevent clipping: scale peak to 0.95 full scale
        # (skipped for all-silent input to avoid division by zero).
        max_val = np.max(np.abs(non_vocals))
        if max_val > 0:
            non_vocals = non_vocals / max_val * 0.95

        # Save non-vocals file: into output_path if given, otherwise next to
        # the separated stems themselves.
        if output_path:
            os.makedirs(output_path, exist_ok=True)
            non_vocals_filename = os.path.join(output_path, "non_vocals.wav")
        else:
            non_vocals_filename = os.path.join(
                os.path.dirname(drums_path), "non_vocals.wav"
            )

        # soundfile expects (frames, channels); librosa yields (channels, frames).
        if non_vocals.ndim == 2:
            non_vocals = non_vocals.T

        sf.write(
            non_vocals_filename, non_vocals, target_sr, format="wav", subtype="PCM_16"
        )

        return vocals_path, non_vocals_filename

    except Exception as e:
        raise RuntimeError(f"Error creating non-vocals track: {str(e)}")
247
+
248
+
249
def create_karaoke_track(audio_path: str, output_path: Optional[str] = None) -> str:
    """
    Create a karaoke (instrumental) track by removing vocals from an audio file.

    Convenience wrapper around extract_vocal_non_vocal() that keeps only the
    instrumental (drums + bass + other) output.

    Args:
        audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
        output_path: Directory to save the karaoke track (default: 'output' directory)

    Returns:
        Path to the karaoke (instrumental) audio file.

    Note:
        Uses the same high-quality Demucs model as separate_audio; the
        instrumental mix is automatically normalized for consistent volume.
    """
    # Only the instrumental half is needed; the vocals path is discarded.
    _, karaoke_path = extract_vocal_non_vocal(audio_path, output_path)
    return karaoke_path
274
+
275
+
276
if __name__ == "__main__":
    # CLI entry point: four subcommands wrapping the separation helpers above.
    parser = argparse.ArgumentParser(
        description="Separate audio into stems using Demucs"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Original separate command: all four stems.
    separate_parser = subparsers.add_parser(
        "separate", help="Separate into all four stems"
    )
    separate_parser.add_argument("audio_path", help="Path to the input audio file")
    separate_parser.add_argument(
        "--output-dir", help="Directory to save separated stems (default: output)"
    )

    # New selective stems command: only the stems named on the command line.
    select_parser = subparsers.add_parser("select", help="Extract specific stems only")
    select_parser.add_argument("audio_path", help="Path to the input audio file")
    select_parser.add_argument(
        "stems",
        nargs="+",
        choices=["vocals", "drums", "bass", "other"],
        help="Stems to extract (choose from: vocals, drums, bass, other)",
    )
    select_parser.add_argument(
        "--output-dir", help="Directory to save separated stems (default: output)"
    )

    # New vocal/non-vocal command: vocals plus a combined instrumental.
    vocal_parser = subparsers.add_parser(
        "vocal-nonvocal", help="Extract vocals and instrumental only"
    )
    vocal_parser.add_argument("audio_path", help="Path to the input audio file")
    vocal_parser.add_argument(
        "--output-dir", help="Directory to save separated stems (default: output)"
    )

    # New karaoke command: instrumental only.
    karaoke_parser = subparsers.add_parser(
        "karaoke", help="Create karaoke (instrumental) track"
    )
    karaoke_parser.add_argument("audio_path", help="Path to the input audio file")
    karaoke_parser.add_argument(
        "--output-dir", help="Directory to save karaoke track (default: output)"
    )

    args = parser.parse_args()

    # No subcommand given: show usage and exit nonzero.
    if not args.command:
        parser.print_help()
        exit(1)

    try:
        if args.command == "separate":
            vocals, drums, bass, other = separate_audio(
                args.audio_path, args.output_dir
            )
            print(f"Vocals: {vocals}")
            print(f"Drums: {drums}")
            print(f"Bass: {bass}")
            print(f"Other: {other}")

        elif args.command == "select":
            selected_stems = extract_selected_stems(
                args.audio_path, args.stems, args.output_dir
            )
            for stem, path in selected_stems.items():
                print(f"{stem.capitalize()}: {path}")

        elif args.command == "vocal-nonvocal":
            vocals_path, non_vocals_path = extract_vocal_non_vocal(
                args.audio_path, args.output_dir
            )
            print(f"Vocals: {vocals_path}")
            print(f"Non-vocals (Instrumental): {non_vocals_path}")

        elif args.command == "karaoke":
            karaoke_path = create_karaoke_track(args.audio_path, args.output_dir)
            print(f"Karaoke track: {karaoke_path}")

    except Exception as e:
        # Report any failure and exit nonzero, like the sibling CLI tools.
        print(f"Error: {e}")
        exit(1)
tools/time_strech.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional, Tuple
3
+
4
+ import librosa
5
+ import soundfile as sf
6
+
7
+
8
def align_songs_by_bpm(
    audio1_path: str, audio2_path: str, output_path: Optional[str] = None
) -> Tuple[str, str]:
    """
    Align two songs to the same BPM by stretching both to the slower tempo.

    The tempo of each track is estimated with librosa's beat tracker, and
    both tracks are then time-stretched (pitch-preserving) to the slower of
    the two BPMs, making them suitable for mixing or mashups.

    Args:
        audio1_path: Path to first audio file (supports common formats: WAV, MP3, FLAC)
        audio2_path: Path to second audio file (supports common formats: WAV, MP3, FLAC)
        output_path: Optional output directory (default: None, stretch_to_bpm
            falls back to an 'output' directory)

    Returns:
        Tuple of (aligned_audio1_path, aligned_audio2_path): Paths to the processed audio files
        Both files will have the same BPM (the slower of the two original tempos)

    Examples:
        - Song A: 140 BPM, Song B: 128 BPM → Both become 128 BPM
        - Song A: 120 BPM, Song B: 130 BPM → Both become 120 BPM

    Raises:
        RuntimeError: If tempo estimation or stretching fails.
    """
    try:
        # Estimate the tempo of each track.
        y1, sr1 = librosa.load(audio1_path)
        y2, sr2 = librosa.load(audio2_path)
        tempo1, _ = librosa.beat.beat_track(y=y1, sr=sr1)
        tempo2, _ = librosa.beat.beat_track(y=y2, sr=sr2)

        # Align to the slower tempo.  min() replaces the previous duplicated
        # if/else whose two branches were identical except for which BPM was
        # passed through.
        target_bpm = min(float(tempo1), float(tempo2))

        # Both tracks go through stretch_to_bpm so both outputs land in the
        # same directory with consistent "_stretched_to_<bpm>_bpm" names.
        aligned1_path = stretch_to_bpm(audio1_path, target_bpm, output_path)
        aligned2_path = stretch_to_bpm(audio2_path, target_bpm, output_path)

        return aligned1_path, aligned2_path

    except Exception as e:
        raise RuntimeError(f"Error aligning audio files: {str(e)}") from e
61
+
62
+
63
def stretch_to_bpm(
    audio_path: str, target_bpm: float, output_path: Optional[str] = None
) -> str:
    """
    Stretch an audio file to a specific BPM without changing its pitch.

    Args:
        audio_path: Path to audio file
        target_bpm: Target BPM to stretch to (must be > 0)
        output_path: Directory for the output file (default: 'output')

    Returns:
        Path to the stretched audio file, named
        "<basename>_stretched_to_<bpm>_bpm.wav" inside the output directory.

    Raises:
        RuntimeError: If tempo estimation fails, no tempo can be detected,
            or the stretched audio cannot be written.
    """
    try:
        # Native-rate, multi-channel load used for the actual stretching.
        y, sr = librosa.load(audio_path, sr=None, mono=False)

        # Tempo is estimated on librosa's default mono/22.05 kHz load, which
        # the beat tracker expects.
        y_hat, sr_hat = librosa.load(audio_path)
        tempo, _ = librosa.beat.beat_track(y=y_hat, sr=sr_hat)
        current_bpm = float(tempo)

        # Guard against a zero tempo estimate, which would otherwise
        # surface as a confusing ZeroDivisionError.
        if current_bpm <= 0:
            raise ValueError(f"Could not detect tempo of {audio_path}")

        # rate > 1 speeds up, rate < 1 slows down.
        stretch_factor = target_bpm / current_bpm
        y_stretched = librosa.effects.time_stretch(y, rate=stretch_factor)

        if not output_path:
            output_path = "output"
        os.makedirs(output_path, exist_ok=True)

        # Strip the extension for the output name (splitext handles any
        # extension, not just ".wav" as the old replace() did).
        original_audio_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_file_path = os.path.join(
            output_path,
            f"{original_audio_filename}_stretched_to_{int(target_bpm)}_bpm.wav",
        )

        # soundfile expects (frames, channels); librosa yields (channels, frames).
        if y_stretched.ndim == 2:
            y_stretched = y_stretched.T

        sf.write(output_file_path, y_stretched, sr)

        return output_file_path

    except Exception as e:
        raise RuntimeError(f"Error stretching audio: {str(e)}") from e
111
+
112
+
113
if __name__ == "__main__":
    import argparse

    # CLI entry point: two subcommands wrapping the tempo helpers above.
    parser = argparse.ArgumentParser(description="Time stretch audio files")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Align two songs by BPM (both stretched to the slower tempo).
    align_parser = subparsers.add_parser("align", help="Align two songs to same BPM")
    align_parser.add_argument("audio1", help="Path to first audio file")
    align_parser.add_argument("audio2", help="Path to second audio file")

    # Stretch a single file to a specific BPM.
    stretch_parser = subparsers.add_parser(
        "stretch", help="Stretch audio to specific BPM"
    )
    stretch_parser.add_argument("audio", help="Path to audio file")
    stretch_parser.add_argument("target_bpm", type=float, help="Target BPM")

    args = parser.parse_args()

    try:
        if args.command == "align":
            aligned1, aligned2 = align_songs_by_bpm(args.audio1, args.audio2)
            print(f"Aligned audio 1: {aligned1}")
            print(f"Aligned audio 2: {aligned2}")
        elif args.command == "stretch":
            output = stretch_to_bpm(args.audio, args.target_bpm)
            print(f"Stretched audio saved to: {output}")
        else:
            # No/unknown subcommand: show usage (exit code stays 0 here).
            parser.print_help()
    except Exception as e:
        # Report any failure and exit nonzero.
        print(f"Error: {e}")
        exit(1)
tools/youtube_extract.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import yt_dlp
4
+
5
+
6
def extract_audio_from_youtube(
    youtube_url: str,
    audio_format: str = "wav",
    quality: str = "best",
    output_path: str = "output",
) -> str:
    """
    Extract high-quality audio from a YouTube video URL using yt-dlp.

    Downloads the best available audio stream and converts it to the
    requested format via yt-dlp's FFmpeg post-processor.

    Args:
        youtube_url: YouTube video URL (e.g. https://www.youtube.com/watch?v=...)
        audio_format: Output audio format (default: 'wav').
            Supported: 'wav' (uncompressed), 'mp3' (compressed), 'flac' (lossless)
        quality: Audio quality selection (default: 'best').
            'best' requests 192 kbps from the extractor; anything else 128 kbps.
        output_path: Directory to save the extracted audio (default: 'output').
            Created if it does not exist.

    Returns:
        Path to the extracted audio file in the specified format.

    Raises:
        RuntimeError: If the download fails or no output file can be located.

    Note:
        Requires an internet connection and FFmpeg on PATH; processing time
        depends on video length and connection speed.
    """
    try:
        # Fall back to "output" if the caller passed an empty string/None.
        output_path = output_path or "output"
        os.makedirs(output_path, exist_ok=True)

        # yt-dlp configuration: grab the best audio stream and transcode it
        # to the requested format with FFmpeg.
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": os.path.join(output_path, "%(title)s.%(ext)s"),
            "postprocessors": [
                {
                    "key": "FFmpegExtractAudio",
                    "preferredcodec": audio_format,
                    "preferredquality": "192" if quality == "best" else "128",
                }
            ],
            "quiet": True,
            "no_warnings": True,
        }

        # Download and extract the audio.
        with yt_dlp.YoutubeDL(params=ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)
            video_title = info.get("title", "audio")

            ydl.download([youtube_url])

        # Expected location based on the raw title.
        expected_filename = f"{video_title}.{audio_format}"
        audio_path = os.path.join(output_path, expected_filename)

        if not os.path.exists(audio_path):
            # yt-dlp sanitizes titles containing characters that are invalid
            # in filenames, so "<title>.<ext>" may not exist verbatim. Fall
            # back to the most recently written matching file — the one this
            # call just produced — rather than an arbitrary directory entry.
            candidates = [
                os.path.join(output_path, f)
                for f in os.listdir(output_path)
                if f.endswith(f".{audio_format}")
            ]
            if not candidates:
                raise RuntimeError("Audio file not found after download")
            audio_path = max(candidates, key=os.path.getmtime)

        return audio_path

    except Exception as e:
        # Chain the original exception so the full traceback is preserved.
        raise RuntimeError(f"Error extracting audio from YouTube: {str(e)}") from e
86
+
87
+
88
def get_video_info(youtube_url: str) -> dict:
    """
    Get information about a YouTube video without downloading it.

    Args:
        youtube_url: YouTube video URL.

    Returns:
        Dictionary with video metadata: title, duration (seconds), uploader,
        upload_date, view_count, description, and thumbnail URL. Missing
        fields are None.

    Raises:
        RuntimeError: If the metadata lookup fails (bad URL, network error).
    """
    try:
        # Metadata-only configuration: never download the media stream.
        ydl_opts = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
        }

        with yt_dlp.YoutubeDL(params=ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=False)

        return {
            "title": info.get("title"),
            "duration": info.get("duration"),
            "uploader": info.get("uploader"),
            "upload_date": info.get("upload_date"),
            "view_count": info.get("view_count"),
            "description": info.get("description"),
            "thumbnail": info.get("thumbnail"),
        }

    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"Error getting video info: {str(e)}") from e
120
+
121
+
122
if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Extract audio from YouTube videos")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Sub-command: download and extract the audio track.
    extract_parser = subparsers.add_parser(
        "extract", help="Extract audio from YouTube URL"
    )
    extract_parser.add_argument("url", help="YouTube video URL")
    extract_parser.add_argument(
        "--format",
        default="wav",
        choices=["wav", "mp3", "flac", "m4a"],
        help="Output audio format (default: wav)",
    )
    extract_parser.add_argument(
        "--quality",
        default="best",
        choices=["best", "worst"],
        help="Audio quality (default: best)",
    )

    # Sub-command: print metadata without downloading.
    info_parser = subparsers.add_parser("info", help="Get video information")
    info_parser.add_argument("url", help="YouTube video URL")

    args = parser.parse_args()

    try:
        if args.command == "extract":
            audio_path = extract_audio_from_youtube(args.url, args.format, args.quality)
            print(f"Audio extracted to: {audio_path}")
        elif args.command == "info":
            info = get_video_info(args.url)
            print(f"Title: {info['title']}")
            print(f"Duration: {info['duration']} seconds")
            print(f"Uploader: {info['uploader']}")
            print(f"Upload date: {info['upload_date']}")
            print(f"Views: {info['view_count']}")
        else:
            # No sub-command given: show usage instead of failing silently.
            parser.print_help()
    except Exception as e:
        # Report failures on stderr and use sys.exit (not the site builtin
        # `exit`, which is absent under `python -S` and frozen interpreters)
        # so the non-zero exit code is reliable for shell pipelines.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)