# music-mcp / mcp_server.py
import gradio as gr
from typing import Dict, Tuple
import json
from tools.audio_info import get_audio_info
from tools.combine_tracks import combine_tracks, create_medley
from tools.stems_separation import (
separate_audio,
extract_selected_stems,
extract_vocal_non_vocal,
create_karaoke_track,
)
from tools.time_strech import align_songs_by_bpm, stretch_to_bpm
from tools.youtube_extract import extract_audio_from_youtube
from tools.audio_cutting import (
cut_audio,
mute_time_windows,
extract_segments,
trim_audio,
)
from tools.music_understanding import (
understand_music,
analyze_music_structure,
suggest_cutting_points,
analyze_genre_and_style,
)
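# Each function below is exposed as a tab in the Gradio UI; when the app is launched
# with mcp_server=True, Gradio also serves these functions as MCP tools, deriving each
# tool's description from its type hints and docstring.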
def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:
"""
Shift the pitch of an audio file by a specified number of semitones.
This function uses librosa's pitch shifting algorithm to change the musical pitch
of an audio file while maintaining its tempo and duration.
Args:
audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC)
semitones: Number of semitones to shift (positive = higher pitch, negative = lower pitch)
Range: -12 to +12 semitones (1 octave up/down)
Returns:
Path to the pitch-shifted audio file in WAV format
Examples:
- semitones=2: Shift up by 2 semitones (1 whole tone)
- semitones=-5: Shift down by 5 semitones (1 perfect fourth)
- semitones=0: No change (returns original file)
Note:
The function creates a temporary WAV file that should be cleaned up by the caller
"""
if semitones == 0:
return audio_path
    # Load audio at its native sample rate (sr=None), preserving the channel layout
import librosa
y, sr = librosa.load(audio_path, sr=None, mono=False)
# Apply pitch shift
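    # A shift of n semitones corresponds to a frequency ratio of 2 ** (n / 12);
    # pitch_shift changes the pitch while keeping the duration unchanged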
y_shifted = librosa.effects.pitch_shift(y, n_steps=semitones, sr=sr)
# Save to temporary file
import tempfile
import soundfile as sf
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
if y_shifted.ndim == 2:
y_shifted = y_shifted.T
sf.write(tmp.name, y_shifted, sr, format="wav", subtype="PCM_16")
return tmp.name
def stretch_audio_to_bpm_wrapper(audio_path: str, target_bpm: float) -> str:
"""
Stretch or compress audio to match a specific BPM (beats per minute) while maintaining pitch.
This function uses time-stretching algorithms to change the tempo of an audio file
without affecting its musical pitch, making it useful for beat-matching and tempo alignment.
Args:
audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC)
target_bpm: Target beats per minute (BPM) value
Typical range: 60-200 BPM
Common values: 90 (slow), 120 (medium), 140 (fast), 128 (electronic)
Returns:
Path to the time-stretched audio file in WAV format
Examples:
- target_bpm=128: Stretch to typical electronic dance music tempo
- target_bpm=120: Stretch to standard pop/rock tempo
- target_bpm=140: Stretch to fast electronic or rock tempo
Note:
The function automatically detects the original BPM and calculates the stretch factor
Creates a new WAV file with the modified tempo
"""
return stretch_to_bpm(audio_path, target_bpm)
def extract_selected_stems_wrapper(
audio_path: str, vocals: bool, drums: bool, bass: bool, other: bool
) -> Dict[str, str]:
"""
Extract selected stems from an audio file based on user choices.
This function allows selective extraction of specific stems rather than all four stems,
which can save processing time and storage space when only certain elements are needed.
Args:
audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
vocals: Whether to extract the vocals stem
drums: Whether to extract the drums stem
bass: Whether to extract the bass stem
other: Whether to extract the other stem
Returns:
dict[str, str]: Dictionary mapping stem names to their file paths
Examples:
- vocals=True, drums=True, bass=False, other=False: Extract only vocals and drums
- vocals=True, drums=False, bass=False, other=False: Extract only vocals for karaoke
- vocals=False, drums=True, bass=True, other=False: Extract rhythm section (drums + bass)
Note:
At least one stem must be selected for extraction
Uses the same high-quality Demucs model as separate_audio
Processing time is the same as full separation since Demucs extracts all stems internally
"""
stems_to_extract = []
if vocals:
stems_to_extract.append("vocals")
if drums:
stems_to_extract.append("drums")
if bass:
stems_to_extract.append("bass")
if other:
stems_to_extract.append("other")
if not stems_to_extract:
raise ValueError("At least one stem must be selected for extraction")
return extract_selected_stems(audio_path, stems_to_extract)
def extract_vocal_non_vocal_wrapper(audio_path: str) -> Tuple[str, str]:
"""
Extract vocals and non-vocals (instrumental) stems from an audio file.
This function provides a simple interface to separate audio into vocal and
non-vocal components, which is useful for karaoke creation, vocal isolation,
or instrumental extraction.
Args:
audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
Returns:
tuple[str, str]: Paths to (vocals_file, instrumental_file)
- vocals_file: Path to the isolated vocal track
- instrumental_file: Path to the combined instrumental track (drums + bass + other)
Examples:
- extract_vocal_non_vocal_wrapper('song.mp3'): Separate into vocals and instrumental
- extract_vocal_non_vocal_wrapper('song.wav'): Create vocal and backing track versions
Note:
The instrumental track combines drums, bass, and other stems into a single track
Uses the same high-quality Demucs model as separate_audio
Instrumental track is automatically mixed and normalized for consistent volume
"""
return extract_vocal_non_vocal(audio_path)
def create_karaoke_track_wrapper(audio_path: str) -> str:
"""
Create a karaoke (instrumental) track by removing vocals from an audio file.
This is a convenience function that extracts the instrumental (non-vocal) portion
of a song, creating a karaoke-ready backing track.
Args:
audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
Returns:
Path to the karaoke (instrumental) audio file
Examples:
- create_karaoke_track_wrapper('song.mp3'): Create karaoke version
- create_karaoke_track_wrapper('song.wav'): Create instrumental backing track
Note:
Uses the same high-quality Demucs model as separate_audio
Combines drums, bass, and other stems into a single instrumental track
Automatically normalized for consistent volume and quality
Perfect for karaoke applications or backing track creation
"""
return create_karaoke_track(audio_path)
def create_interface():
"""Create the Gradio interface with all tools."""
# Tab 1: Stem Separation
stem_interface = gr.Interface(
fn=separate_audio,
inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
outputs=[
gr.Audio(label="Vocals", type="filepath"),
gr.Audio(label="Drums", type="filepath"),
gr.Audio(label="Bass", type="filepath"),
gr.Audio(label="Other", type="filepath"),
],
title="Audio Stem Separation",
description="Upload an audio file to separate it into vocals, drums, bass, and other stems.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 2: Track Combination
combine_interface = gr.Interface(
fn=combine_tracks,
inputs=[
gr.Audio(type="filepath", label="First Audio Track", sources=["upload"]),
gr.Audio(type="filepath", label="Second Audio Track", sources=["upload"]),
gr.Slider(
minimum=0.0, maximum=1.0, value=0.5, label="Weight for First Track"
),
gr.Slider(
minimum=0.0, maximum=1.0, value=0.5, label="Weight for Second Track"
),
gr.Checkbox(value=True, label="Normalize Output"),
gr.Number(value=0.0, label="Fade In Duration (seconds)"),
gr.Number(value=0.0, label="Fade Out Duration (seconds)"),
],
outputs=gr.Audio(label="Combined Track", type="filepath"),
title="Combine Audio Tracks",
description="Combine two audio tracks with adjustable weights and optional fade effects.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 3: Pitch Alignment
pitch_interface = gr.Interface(
fn=pitch_shift_with_semitones,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Number(value=0, label="Semitones to Shift"),
],
outputs=gr.Audio(label="Pitch Shifted Audio", type="filepath"),
title="Pitch Shift Audio",
description="Shift the pitch of an audio file by specified semitones.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 4: Time Stretching
stretch_interface = gr.Interface(
fn=stretch_audio_to_bpm_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Number(value=120, label="Target BPM"),
],
outputs=gr.Audio(label="Stretched Audio", type="filepath"),
title="Stretch Audio to BPM",
description="Stretch audio to match a specific BPM.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 5: BPM Alignment
bpm_interface = gr.Interface(
fn=align_songs_by_bpm,
inputs=[
gr.Audio(type="filepath", label="First Audio Track", sources=["upload"]),
gr.Audio(type="filepath", label="Second Audio Track", sources=["upload"]),
],
outputs=[
gr.Audio(label="Aligned First Track", type="filepath"),
gr.Audio(label="Aligned Second Track", type="filepath"),
],
title="Align Songs by BPM",
description="Align two songs to the same BPM by stretching the faster one to match the slower one.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 6: Selective Stem Extraction
selective_interface = gr.Interface(
fn=extract_selected_stems_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Checkbox(value=True, label="Extract Vocals"),
gr.Checkbox(value=True, label="Extract Drums"),
gr.Checkbox(value=True, label="Extract Bass"),
gr.Checkbox(value=True, label="Extract Other"),
],
outputs=gr.JSON(label="Extracted Stems"),
title="Selective Stem Extraction",
description="Extract only specific stems from an audio file to save processing time and storage.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 7: Vocal/Non-Vocal Separation
vocal_nonvocal_interface = gr.Interface(
fn=extract_vocal_non_vocal_wrapper,
inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
outputs=[
gr.Audio(label="Vocals Track", type="filepath"),
gr.Audio(label="Instrumental Track", type="filepath"),
],
title="Vocal/Instrumental Separation",
description="Separate audio into vocal and instrumental components for karaoke or vocal isolation.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 8: Karaoke Track Creation
karaoke_interface = gr.Interface(
fn=create_karaoke_track_wrapper,
inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
outputs=gr.Audio(label="Karaoke Track", type="filepath"),
title="Create Karaoke Track",
description="Create a karaoke-ready instrumental track by removing vocals from any song.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 9: Medley Creation
medley_interface = gr.Interface(
fn=create_medley,
inputs=[
gr.Audio(type="filepath", label="Vocals Stem", sources=["upload"]),
gr.Audio(type="filepath", label="Instrumental Stem", sources=["upload"]),
gr.Number(
value=1.2, label="Vocals Gain", minimum=0.1, maximum=3.0, step=0.1
),
gr.Number(
value=0.9, label="Instrumental Gain", minimum=0.1, maximum=3.0, step=0.1
),
gr.Textbox(
value="threshold=-18dB:ratio=3:attack=50:release=200",
label="Compressor Settings",
placeholder="threshold=-18dB:ratio=3:attack=50:release=200",
),
gr.Dropdown(
choices=["libmp3lame", "aac", "flac", "pcm_s16le"],
value="libmp3lame",
label="Audio Codec",
),
gr.Textbox(value="192k", label="Audio Bitrate", placeholder="192k"),
],
outputs=gr.Audio(label="Medley Audio", type="filepath"),
title="Create Vocal/Instrumental Medley",
description="Mix vocals and instrumental stems into a polished medley with compression and gain control.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 10: Audio Information
audio_info_interface = gr.Interface(
fn=get_audio_info,
inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
outputs=gr.JSON(label="Audio Information"),
title="Get Audio Information",
description="Get detailed information about an audio file including duration, sample rate, channels, and file size.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 11: YouTube Extraction
youtube_interface = gr.Interface(
fn=extract_audio_from_youtube,
inputs=[
gr.Textbox(
label="YouTube URL", placeholder="https://www.youtube.com/watch?v=..."
),
gr.Dropdown(
choices=["wav", "mp3", "flac"], value="wav", label="Output Format"
),
gr.Dropdown(choices=["best", "worst"], value="best", label="Audio Quality"),
],
outputs=gr.Audio(label="Extracted Audio", type="filepath"),
title="Extract Audio from YouTube",
description="Extract audio from a YouTube video URL.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 12: Audio Cutting
cut_interface = gr.Interface(
fn=cut_audio,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Number(value=0.0, label="Start Time (seconds)"),
gr.Number(value=10.0, label="End Time (seconds)"),
gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
],
outputs=gr.Audio(label="Cut Audio", type="filepath"),
title="Cut Audio Segment",
description="Extract a segment from an audio file between specified start and end times.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 13: Mute Time Windows
def mute_time_windows_wrapper(audio_path, windows_str, format_val):
try:
            # Parse the JSON-formatted list of [start, end] pairs (avoid eval on user input)
            windows = json.loads(windows_str) if windows_str else []
return mute_time_windows(
audio_path=audio_path, mute_windows=windows, output_format=format_val
)
except Exception:
return None
mute_interface = gr.Interface(
fn=mute_time_windows_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Textbox(
value="[[1.0, 2.0], [3.0, 4.0]]",
label="Mute Windows (JSON format)",
placeholder="[[start1, end1], [start2, end2]]",
),
gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
],
outputs=gr.Audio(label="Muted Audio", type="filepath"),
title="Mute Time Windows",
description="Mute specific time windows in an audio file with smooth fade transitions.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 14: Extract Segments
def extract_segments_wrapper(audio_path, segments_str, format_val, join):
try:
            # Parse the JSON-formatted list of [start, end] pairs (avoid eval on user input)
            segments = json.loads(segments_str) if segments_str else []
result = extract_segments(
audio_path=audio_path,
segments=segments,
output_format=format_val,
join_segments=join,
)
# If result is a list, return the first item for Gradio
if isinstance(result, list):
return result[0] if result else None
return result
except Exception:
return None
extract_interface = gr.Interface(
fn=extract_segments_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Textbox(
value="[[0.0, 1.0], [2.0, 3.0]]",
label="Segments (JSON format)",
placeholder="[[start1, end1], [start2, end2]]",
),
gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
gr.Checkbox(value=False, label="Join Segments"),
],
outputs=gr.Audio(label="Extracted Segments", type="filepath"),
title="Extract Segments",
description="Extract multiple segments from an audio file.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 15: Trim Audio
trim_interface = gr.Interface(
fn=trim_audio,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Number(value=None, label="Trim Start (seconds, leave empty to skip)"),
gr.Number(value=None, label="Trim End (seconds, leave empty to skip)"),
gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
],
outputs=gr.Audio(label="Trimmed Audio", type="filepath"),
title="Trim Audio",
description="Trim audio from the beginning and/or end.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 16: Music Understanding
def understand_music_wrapper(audio_path, prompt):
try:
result = understand_music(audio_path=audio_path, prompt_text=prompt)
if result["status"] == "success":
return result["analysis"]
else:
return f"Error: {result.get('error', 'Unknown error')}"
except Exception as e:
return f"Error: {str(e)}"
understand_interface = gr.Interface(
fn=understand_music_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Textbox(
value="Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
label="Analysis Prompt",
lines=3,
),
],
outputs=gr.Textbox(label="Music Analysis", lines=10),
title="Music Understanding (AI)",
description="Analyze music using NVIDIA's Music-Flamingo Audio Language Model.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 17: Song Structure Analysis
def analyze_music_structure_wrapper(audio_path):
try:
result = analyze_music_structure(audio_path=audio_path)
if result["status"] == "success":
return result["analysis"]
else:
return f"Error: {result.get('error', 'Unknown error')}"
except Exception as e:
return f"Error: {str(e)}"
structure_interface = gr.Interface(
fn=analyze_music_structure_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
],
outputs=gr.Textbox(label="Structure Analysis", lines=10),
title="Song Structure Analysis",
description="Analyze song structure and identify sections (verse, chorus, bridge, etc.).",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 18: Cutting Points Suggestions
def suggest_cutting_points_wrapper(audio_path, purpose):
try:
result = suggest_cutting_points(audio_path=audio_path, purpose=purpose)
if result["status"] == "success":
return result["analysis"]
else:
return f"Error: {result.get('error', 'Unknown error')}"
except Exception as e:
return f"Error: {str(e)}"
cutting_points_interface = gr.Interface(
fn=suggest_cutting_points_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Dropdown(
choices=["general", "dj_mix", "social_media", "ringtone"],
value="general",
label="Purpose",
),
],
outputs=gr.Textbox(label="Cutting Point Suggestions", lines=10),
title="AI Cutting Point Suggestions",
description="Get AI-suggested optimal cutting points for different purposes.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 19: Genre and Style Analysis
def analyze_genre_and_style_wrapper(audio_path):
try:
result = analyze_genre_and_style(audio_path=audio_path)
if result["status"] == "success":
return result["analysis"]
else:
return f"Error: {result.get('error', 'Unknown error')}"
except Exception as e:
return f"Error: {str(e)}"
genre_interface = gr.Interface(
fn=analyze_genre_and_style_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
],
outputs=gr.Textbox(label="Genre & Style Analysis", lines=10),
title="Genre & Style Analysis",
description="Detailed analysis of genre, production style, and instrumentation.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
return gr.TabbedInterface(
[
stem_interface,
combine_interface,
pitch_interface,
stretch_interface,
bpm_interface,
selective_interface,
vocal_nonvocal_interface,
karaoke_interface,
medley_interface,
audio_info_interface,
youtube_interface,
cut_interface,
mute_interface,
extract_interface,
trim_interface,
understand_interface,
structure_interface,
cutting_points_interface,
genre_interface,
],
[
"Stem Separation",
"Track Combination",
"Pitch Alignment",
"Time Stretching",
"BPM Alignment",
"Selective Stems",
"Vocal/Instrumental",
"Karaoke Creation",
"Medley Creation",
"Audio Information",
"YouTube Extraction",
"Audio Cutting",
"Mute Windows",
"Extract Segments",
"Trim Audio",
"Music Understanding",
"Song Structure",
"Cutting Points",
"Genre Analysis",
],
)
if __name__ == "__main__":
interface = create_interface()
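    # mcp_server=True exposes the tools over the Model Context Protocol alongside the web UI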
interface.launch(server_name="0.0.0.0", server_port=7860, mcp_server=True)