# music-mcp / mcp_server.py
import gradio as gr
from typing import Dict, Tuple
import json
from tools.audio_info import get_audio_info
from tools.combine_tracks import combine_tracks, create_medley
from tools.stems_separation import (
separate_audio,
extract_selected_stems,
extract_vocal_non_vocal,
create_karaoke_track,
)
from tools.time_strech import align_songs_by_bpm, stretch_to_bpm
from tools.youtube_extract import extract_audio_from_youtube
from tools.audio_cutting import (
cut_audio,
mute_time_windows,
extract_segments,
trim_audio,
)
from tools.music_understanding import (
understand_music,
analyze_music_structure,
suggest_cutting_points,
analyze_genre_and_style,
)
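# Each function below is exposed as a tab in the Gradio UI; when the app is launched
# with mcp_server=True, Gradio also serves these functions as MCP tools, deriving each
# tool's description from its type hints and docstring.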
def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:
"""
Shift the pitch of an audio file by a specified number of semitones.
This function uses librosa's pitch shifting algorithm to change the musical pitch
of an audio file while maintaining its tempo and duration.
Args:
audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC)
semitones: Number of semitones to shift (positive = higher pitch, negative = lower pitch)
Range: -12 to +12 semitones (1 octave up/down)
Returns:
Path to the pitch-shifted audio file in WAV format
Examples:
- semitones=2: Shift up by 2 semitones (1 whole tone)
- semitones=-5: Shift down by 5 semitones (1 perfect fourth)
- semitones=0: No change (returns original file)
Note:
The function creates a temporary WAV file that should be cleaned up by the caller
"""
if semitones == 0:
return audio_path
    # Load audio at its native sample rate (sr=None), preserving the channel layout
import librosa
y, sr = librosa.load(audio_path, sr=None, mono=False)
# Apply pitch shift
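    # A shift of n semitones corresponds to a frequency ratio of 2 ** (n / 12);
    # pitch_shift changes the pitch while keeping the duration unchanged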
y_shifted = librosa.effects.pitch_shift(y, n_steps=semitones, sr=sr)
# Save to temporary file
import tempfile
import soundfile as sf
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
if y_shifted.ndim == 2:
y_shifted = y_shifted.T
sf.write(tmp.name, y_shifted, sr, format="wav", subtype="PCM_16")
return tmp.name
def stretch_audio_to_bpm_wrapper(audio_path: str, target_bpm: float) -> str:
"""
Stretch or compress audio to match a specific BPM (beats per minute) while maintaining pitch.
This function uses time-stretching algorithms to change the tempo of an audio file
without affecting its musical pitch, making it useful for beat-matching and tempo alignment.
Args:
audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC)
target_bpm: Target beats per minute (BPM) value
Typical range: 60-200 BPM
Common values: 90 (slow), 120 (medium), 140 (fast), 128 (electronic)
Returns:
Path to the time-stretched audio file in WAV format
Examples:
- target_bpm=128: Stretch to typical electronic dance music tempo
- target_bpm=120: Stretch to standard pop/rock tempo
- target_bpm=140: Stretch to fast electronic or rock tempo
Note:
The function automatically detects the original BPM and calculates the stretch factor
Creates a new WAV file with the modified tempo
"""
return stretch_to_bpm(audio_path, target_bpm)
def extract_selected_stems_wrapper(
audio_path: str, vocals: bool, drums: bool, bass: bool, other: bool
) -> Dict[str, str]:
"""
Extract selected stems from an audio file based on user choices.
This function allows selective extraction of specific stems rather than all four stems,
which can save processing time and storage space when only certain elements are needed.
Args:
audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
vocals: Whether to extract the vocals stem
drums: Whether to extract the drums stem
bass: Whether to extract the bass stem
other: Whether to extract the other stem
Returns:
dict[str, str]: Dictionary mapping stem names to their file paths
Examples:
- vocals=True, drums=True, bass=False, other=False: Extract only vocals and drums
- vocals=True, drums=False, bass=False, other=False: Extract only vocals for karaoke
- vocals=False, drums=True, bass=True, other=False: Extract rhythm section (drums + bass)
Note:
At least one stem must be selected for extraction
Uses the same high-quality Demucs model as separate_audio
Processing time is the same as full separation since Demucs extracts all stems internally
"""
stems_to_extract = []
if vocals:
stems_to_extract.append("vocals")
if drums:
stems_to_extract.append("drums")
if bass:
stems_to_extract.append("bass")
if other:
stems_to_extract.append("other")
if not stems_to_extract:
raise ValueError("At least one stem must be selected for extraction")
return extract_selected_stems(audio_path, stems_to_extract)
def extract_vocal_non_vocal_wrapper(audio_path: str) -> Tuple[str, str]:
"""
Extract vocals and non-vocals (instrumental) stems from an audio file.
This function provides a simple interface to separate audio into vocal and
non-vocal components, which is useful for karaoke creation, vocal isolation,
or instrumental extraction.
Args:
audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
Returns:
tuple[str, str]: Paths to (vocals_file, instrumental_file)
- vocals_file: Path to the isolated vocal track
- instrumental_file: Path to the combined instrumental track (drums + bass + other)
Examples:
- extract_vocal_non_vocal_wrapper('song.mp3'): Separate into vocals and instrumental
- extract_vocal_non_vocal_wrapper('song.wav'): Create vocal and backing track versions
Note:
The instrumental track combines drums, bass, and other stems into a single track
Uses the same high-quality Demucs model as separate_audio
Instrumental track is automatically mixed and normalized for consistent volume
"""
return extract_vocal_non_vocal(audio_path)
def create_karaoke_track_wrapper(audio_path: str) -> str:
"""
Create a karaoke (instrumental) track by removing vocals from an audio file.
This is a convenience function that extracts the instrumental (non-vocal) portion
of a song, creating a karaoke-ready backing track.
Args:
audio_path: Path to the input audio file (supports common formats: WAV, MP3, FLAC, M4A)
Returns:
Path to the karaoke (instrumental) audio file
Examples:
- create_karaoke_track_wrapper('song.mp3'): Create karaoke version
- create_karaoke_track_wrapper('song.wav'): Create instrumental backing track
Note:
Uses the same high-quality Demucs model as separate_audio
Combines drums, bass, and other stems into a single instrumental track
Automatically normalized for consistent volume and quality
Perfect for karaoke applications or backing track creation
"""
return create_karaoke_track(audio_path)
def create_interface():
"""Create the Gradio interface with all tools."""
# Tab 1: Stem Separation
stem_interface = gr.Interface(
fn=separate_audio,
inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
outputs=[
gr.Audio(label="Vocals", type="filepath"),
gr.Audio(label="Drums", type="filepath"),
gr.Audio(label="Bass", type="filepath"),
gr.Audio(label="Other", type="filepath"),
],
title="Audio Stem Separation",
description="Upload an audio file to separate it into vocals, drums, bass, and other stems.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 2: Track Combination
combine_interface = gr.Interface(
fn=combine_tracks,
inputs=[
gr.Audio(type="filepath", label="First Audio Track", sources=["upload"]),
gr.Audio(type="filepath", label="Second Audio Track", sources=["upload"]),
gr.Slider(
minimum=0.0, maximum=1.0, value=0.5, label="Weight for First Track"
),
gr.Slider(
minimum=0.0, maximum=1.0, value=0.5, label="Weight for Second Track"
),
gr.Checkbox(value=True, label="Normalize Output"),
gr.Number(value=0.0, label="Fade In Duration (seconds)"),
gr.Number(value=0.0, label="Fade Out Duration (seconds)"),
],
outputs=gr.Audio(label="Combined Track", type="filepath"),
title="Combine Audio Tracks",
description="Combine two audio tracks with adjustable weights and optional fade effects.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 3: Pitch Alignment
pitch_interface = gr.Interface(
fn=pitch_shift_with_semitones,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Number(value=0, label="Semitones to Shift"),
],
outputs=gr.Audio(label="Pitch Shifted Audio", type="filepath"),
title="Pitch Shift Audio",
description="Shift the pitch of an audio file by specified semitones.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 4: Time Stretching
stretch_interface = gr.Interface(
fn=stretch_audio_to_bpm_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Number(value=120, label="Target BPM"),
],
outputs=gr.Audio(label="Stretched Audio", type="filepath"),
title="Stretch Audio to BPM",
description="Stretch audio to match a specific BPM.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 5: BPM Alignment
bpm_interface = gr.Interface(
fn=align_songs_by_bpm,
inputs=[
gr.Audio(type="filepath", label="First Audio Track", sources=["upload"]),
gr.Audio(type="filepath", label="Second Audio Track", sources=["upload"]),
],
outputs=[
gr.Audio(label="Aligned First Track", type="filepath"),
gr.Audio(label="Aligned Second Track", type="filepath"),
],
title="Align Songs by BPM",
description="Align two songs to the same BPM by stretching the faster one to match the slower one.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 6: Selective Stem Extraction
selective_interface = gr.Interface(
fn=extract_selected_stems_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Checkbox(value=True, label="Extract Vocals"),
gr.Checkbox(value=True, label="Extract Drums"),
gr.Checkbox(value=True, label="Extract Bass"),
gr.Checkbox(value=True, label="Extract Other"),
],
outputs=gr.JSON(label="Extracted Stems"),
title="Selective Stem Extraction",
description="Extract only specific stems from an audio file to save processing time and storage.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 7: Vocal/Non-Vocal Separation
vocal_nonvocal_interface = gr.Interface(
fn=extract_vocal_non_vocal_wrapper,
inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
outputs=[
gr.Audio(label="Vocals Track", type="filepath"),
gr.Audio(label="Instrumental Track", type="filepath"),
],
title="Vocal/Instrumental Separation",
description="Separate audio into vocal and instrumental components for karaoke or vocal isolation.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 8: Karaoke Track Creation
karaoke_interface = gr.Interface(
fn=create_karaoke_track_wrapper,
inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
outputs=gr.Audio(label="Karaoke Track", type="filepath"),
title="Create Karaoke Track",
description="Create a karaoke-ready instrumental track by removing vocals from any song.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 9: Medley Creation
medley_interface = gr.Interface(
fn=create_medley,
inputs=[
gr.Audio(type="filepath", label="Vocals Stem", sources=["upload"]),
gr.Audio(type="filepath", label="Instrumental Stem", sources=["upload"]),
gr.Number(
value=1.2, label="Vocals Gain", minimum=0.1, maximum=3.0, step=0.1
),
gr.Number(
value=0.9, label="Instrumental Gain", minimum=0.1, maximum=3.0, step=0.1
),
gr.Textbox(
value="threshold=-18dB:ratio=3:attack=50:release=200",
label="Compressor Settings",
placeholder="threshold=-18dB:ratio=3:attack=50:release=200",
),
gr.Dropdown(
choices=["libmp3lame", "aac", "flac", "pcm_s16le"],
value="libmp3lame",
label="Audio Codec",
),
gr.Textbox(value="192k", label="Audio Bitrate", placeholder="192k"),
],
outputs=gr.Audio(label="Medley Audio", type="filepath"),
title="Create Vocal/Instrumental Medley",
description="Mix vocals and instrumental stems into a polished medley with compression and gain control.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 10: Audio Information
audio_info_interface = gr.Interface(
fn=get_audio_info,
inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
outputs=gr.JSON(label="Audio Information"),
title="Get Audio Information",
description="Get detailed information about an audio file including duration, sample rate, channels, and file size.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 11: YouTube Extraction
youtube_interface = gr.Interface(
fn=extract_audio_from_youtube,
inputs=[
gr.Textbox(
label="YouTube URL", placeholder="https://www.youtube.com/watch?v=..."
),
gr.Dropdown(
choices=["wav", "mp3", "flac"], value="wav", label="Output Format"
),
gr.Dropdown(choices=["best", "worst"], value="best", label="Audio Quality"),
],
outputs=gr.Audio(label="Extracted Audio", type="filepath"),
title="Extract Audio from YouTube",
description="Extract audio from a YouTube video URL.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 12: Audio Cutting
cut_interface = gr.Interface(
fn=cut_audio,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Number(value=0.0, label="Start Time (seconds)"),
gr.Number(value=10.0, label="End Time (seconds)"),
gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
],
outputs=gr.Audio(label="Cut Audio", type="filepath"),
title="Cut Audio Segment",
description="Extract a segment from an audio file between specified start and end times.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 13: Mute Time Windows
def mute_time_windows_wrapper(audio_path, windows_str, format_val):
try:
            # Parse the JSON-formatted list of [start, end] pairs (avoid eval on user input)
            windows = json.loads(windows_str) if windows_str else []
return mute_time_windows(
audio_path=audio_path, mute_windows=windows, output_format=format_val
)
except Exception:
return None
mute_interface = gr.Interface(
fn=mute_time_windows_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Textbox(
value="[[1.0, 2.0], [3.0, 4.0]]",
label="Mute Windows (JSON format)",
placeholder="[[start1, end1], [start2, end2]]",
),
gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
],
outputs=gr.Audio(label="Muted Audio", type="filepath"),
title="Mute Time Windows",
description="Mute specific time windows in an audio file with smooth fade transitions.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 14: Extract Segments
def extract_segments_wrapper(audio_path, segments_str, format_val, join):
try:
            # Parse the JSON-formatted list of [start, end] pairs (avoid eval on user input)
            segments = json.loads(segments_str) if segments_str else []
result = extract_segments(
audio_path=audio_path,
segments=segments,
output_format=format_val,
join_segments=join,
)
# If result is a list, return the first item for Gradio
if isinstance(result, list):
return result[0] if result else None
return result
except Exception:
return None
extract_interface = gr.Interface(
fn=extract_segments_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Textbox(
value="[[0.0, 1.0], [2.0, 3.0]]",
label="Segments (JSON format)",
placeholder="[[start1, end1], [start2, end2]]",
),
gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
gr.Checkbox(value=False, label="Join Segments"),
],
outputs=gr.Audio(label="Extracted Segments", type="filepath"),
title="Extract Segments",
description="Extract multiple segments from an audio file.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 15: Trim Audio
trim_interface = gr.Interface(
fn=trim_audio,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Number(value=None, label="Trim Start (seconds, leave empty to skip)"),
gr.Number(value=None, label="Trim End (seconds, leave empty to skip)"),
gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
],
outputs=gr.Audio(label="Trimmed Audio", type="filepath"),
title="Trim Audio",
description="Trim audio from the beginning and/or end.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 16: Music Understanding
def understand_music_wrapper(audio_path, prompt):
try:
result = understand_music(audio_path=audio_path, prompt_text=prompt)
if result["status"] == "success":
return result["analysis"]
else:
return f"Error: {result.get('error', 'Unknown error')}"
except Exception as e:
return f"Error: {str(e)}"
understand_interface = gr.Interface(
fn=understand_music_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Textbox(
value="Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
label="Analysis Prompt",
lines=3,
),
],
outputs=gr.Textbox(label="Music Analysis", lines=10),
title="Music Understanding (AI)",
description="Analyze music using NVIDIA's Music-Flamingo Audio Language Model.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 17: Song Structure Analysis
def analyze_music_structure_wrapper(audio_path):
try:
result = analyze_music_structure(audio_path=audio_path)
if result["status"] == "success":
return result["analysis"]
else:
return f"Error: {result.get('error', 'Unknown error')}"
except Exception as e:
return f"Error: {str(e)}"
structure_interface = gr.Interface(
fn=analyze_music_structure_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
],
outputs=gr.Textbox(label="Structure Analysis", lines=10),
title="Song Structure Analysis",
description="Analyze song structure and identify sections (verse, chorus, bridge, etc.).",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 18: Cutting Points Suggestions
def suggest_cutting_points_wrapper(audio_path, purpose):
try:
result = suggest_cutting_points(audio_path=audio_path, purpose=purpose)
if result["status"] == "success":
return result["analysis"]
else:
return f"Error: {result.get('error', 'Unknown error')}"
except Exception as e:
return f"Error: {str(e)}"
cutting_points_interface = gr.Interface(
fn=suggest_cutting_points_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
gr.Dropdown(
choices=["general", "dj_mix", "social_media", "ringtone"],
value="general",
label="Purpose",
),
],
outputs=gr.Textbox(label="Cutting Point Suggestions", lines=10),
title="AI Cutting Point Suggestions",
description="Get AI-suggested optimal cutting points for different purposes.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
# Tab 19: Genre and Style Analysis
def analyze_genre_and_style_wrapper(audio_path):
try:
result = analyze_genre_and_style(audio_path=audio_path)
if result["status"] == "success":
return result["analysis"]
else:
return f"Error: {result.get('error', 'Unknown error')}"
except Exception as e:
return f"Error: {str(e)}"
genre_interface = gr.Interface(
fn=analyze_genre_and_style_wrapper,
inputs=[
gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
],
outputs=gr.Textbox(label="Genre & Style Analysis", lines=10),
title="Genre & Style Analysis",
description="Detailed analysis of genre, production style, and instrumentation.",
examples=None,
cache_examples=False,
flagging_mode="never",
)
return gr.TabbedInterface(
[
stem_interface,
combine_interface,
pitch_interface,
stretch_interface,
bpm_interface,
selective_interface,
vocal_nonvocal_interface,
karaoke_interface,
medley_interface,
audio_info_interface,
youtube_interface,
cut_interface,
mute_interface,
extract_interface,
trim_interface,
understand_interface,
structure_interface,
cutting_points_interface,
genre_interface,
],
[
"Stem Separation",
"Track Combination",
"Pitch Alignment",
"Time Stretching",
"BPM Alignment",
"Selective Stems",
"Vocal/Instrumental",
"Karaoke Creation",
"Medley Creation",
"Audio Information",
"YouTube Extraction",
"Audio Cutting",
"Mute Windows",
"Extract Segments",
"Trim Audio",
"Music Understanding",
"Song Structure",
"Cutting Points",
"Genre Analysis",
],
)
if __name__ == "__main__":
interface = create_interface()
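    # mcp_server=True exposes the tools over the Model Context Protocol alongside the web UI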
interface.launch(server_name="0.0.0.0", server_port=7860, mcp_server=True)