music-mcp / tools /music_understanding.py
frascuchon's picture
frascuchon HF Staff
audio_path documented
14e5437
import os
import tempfile
from typing import Any, Dict, Optional
from gradio_client import Client, handle_file
from .audio_info import validate_audio_path
def understand_music(
    audio_path: Optional[str] = None,
    audio_file: Optional[bytes] = None,
    filename: str = "audio",
    prompt_text: str = "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
    youtube_url: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Analyze music using NVIDIA's Music-Flamingo Audio Language Model.

    The audio source and the prompt are sent to the ``/infer`` endpoint of the
    ``nvidia/music-flamingo`` Gradio Space and the model's answer is returned.

    Only one source is used. Precedence when several are given:
    ``audio_path``, then ``audio_file``, then ``youtube_url``.

    Args:
        audio_path: Path to local audio file or URL (supports WAV, MP3, FLAC, M4A).
            Passed through ``validate_audio_path`` before use.
        audio_file: Raw audio bytes (alternative to audio_path). They are written
            to a temporary file that is removed before this function returns.
        filename: Original filename for reference (used with audio_file). Only its
            final path component is used to name the temporary file; ``.wav`` is
            appended when it has no recognised audio extension.
        prompt_text: Custom prompt for analysis (default: comprehensive music description)
        youtube_url: YouTube URL as alternative audio source

    Returns:
        Dictionary with analysis results:
        {
            "analysis": "Detailed music analysis text" (None on error),
            "audio_source": "path", "bytes", "youtube" or "unknown",
            "filename": "Original filename",
            "prompt": "Used prompt text",
            "status": "success" or "error",
            "error": "Error message" (present only when status is "error")
        }

    Note:
        This function does not raise for ordinary failures. A missing source,
        a validation failure or a failed API call is caught and reported
        through ``status == "error"`` and the ``error`` message.

    Examples:
        # Basic analysis with local file
        result = understand_music(audio_path="song.mp3")
        print(result["analysis"])

        # Custom prompt for finding cut points
        result = understand_music(
            audio_path="song.mp3",
            prompt_text="Identify the best cutting points for editing - suggest specific time stamps where verses, choruses, and bridges begin and end."
        )

        # Analysis with YouTube URL
        result = understand_music(
            youtube_url="https://youtube.com/watch?v=example",
            prompt_text="Analyze the structure and suggest optimal edit points."
        )
    """
    # Resolve the source once, up front, so the success and error results
    # always report the same "audio_source" / "filename" for the same inputs.
    if audio_path:
        source_type = "path"
        source_filename = os.path.basename(audio_path)
    elif audio_file:
        source_type = "bytes"
        source_filename = filename
    elif youtube_url:
        source_type = "youtube"
        source_filename = youtube_url
    else:
        source_type = "unknown"
        source_filename = "unknown"

    temp_dir = None  # tempfile.TemporaryDirectory, only created for raw bytes
    try:
        if source_type == "unknown":
            raise ValueError(
                "Either audio_path, audio_file, or youtube_url must be provided"
            )

        if source_type == "path":
            validated_path = validate_audio_path(audio_path)
            audio_source = handle_file(validated_path)
            # Report the name of the file that is actually uploaded.
            source_filename = os.path.basename(validated_path)
        elif source_type == "bytes":
            if not filename:
                raise ValueError("Filename must be provided when using audio_file")
            temp_dir = tempfile.TemporaryDirectory()
            # Keep only the last path component so a caller-supplied name such
            # as "../x.wav" cannot place the file outside the temp directory.
            temp_filename = os.path.basename(filename) or "audio"
            if not temp_filename.lower().endswith((".wav", ".mp3", ".flac", ".m4a")):
                temp_filename = f"{temp_filename}.wav"
            temp_file_path = os.path.join(temp_dir.name, temp_filename)
            with open(temp_file_path, "wb") as f:
                f.write(audio_file)
            audio_source = handle_file(temp_file_path)
        else:
            # NOTE(review): the URL is forwarded as both audio_path and
            # youtube_url, as before - confirm the endpoint accepts a plain
            # string for audio_path.
            audio_source = youtube_url

        # Initialize client and make prediction
        client = Client("nvidia/music-flamingo")
        result = client.predict(
            audio_path=audio_source,
            youtube_url=youtube_url if youtube_url else "",
            prompt_text=prompt_text,
            api_name="/infer",
        )
        return {
            "analysis": result,
            "audio_source": source_type,
            "filename": source_filename,
            "prompt": prompt_text,
            "status": "success",
        }
    except Exception as e:
        # Boundary handler: callers get a uniform result dict, not an exception.
        return {
            "analysis": None,
            "audio_source": source_type,
            "filename": source_filename,
            "prompt": prompt_text,
            "status": "error",
            "error": str(e),
        }
    finally:
        if temp_dir is not None:
            try:
                temp_dir.cleanup()
            except OSError:
                pass  # Best-effort cleanup; never mask the real result.
def analyze_music_structure(
    audio_path: Optional[str] = None,
    audio_file: Optional[bytes] = None,
    filename: str = "audio",
    youtube_url: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Ask the model for a timestamped breakdown of a track's sections.

    Thin wrapper around ``understand_music`` that supplies a fixed prompt
    requesting section boundaries (intro, verses, choruses, bridge, ...) in
    MM:SS format plus transitions relevant to editing.

    Args:
        audio_path: Path to local audio file or URL
        audio_file: Raw audio bytes
        filename: Original filename for reference
        youtube_url: YouTube URL as alternative audio source

    Returns:
        The result dictionary produced by ``understand_music``.
    """
    # One entry per sentence; joined with single spaces to form the prompt.
    prompt_sentences = [
        "Analyze the structure of this music track.",
        "Identify and timestamp the different sections: intro, verses, choruses, "
        "pre-chorus, bridge, instrumental breaks, solo sections, and outro/outro.",
        "Provide specific time stamps (in MM:SS format) for where each section begins and ends.",
        "Also note any transitions, buildups, or breakdowns that would be important for editing.",
    ]
    source_args = {
        "audio_path": audio_path,
        "audio_file": audio_file,
        "filename": filename,
        "youtube_url": youtube_url,
    }
    return understand_music(prompt_text=" ".join(prompt_sentences), **source_args)
def suggest_cutting_points(
    audio_path: Optional[str] = None,
    audio_file: Optional[bytes] = None,
    filename: str = "audio",
    youtube_url: Optional[str] = None,
    purpose: str = "general",
) -> Dict[str, Any]:
    """
    Ask the model where a track can best be cut for a given purpose.

    Selects a purpose-specific prompt and delegates to ``understand_music``.
    A ``purpose`` that is not one of the known keys falls back to the
    'general' prompt.

    Args:
        audio_path: Path to local audio file or URL
        audio_file: Raw audio bytes
        filename: Original filename for reference
        youtube_url: YouTube URL as alternative audio source
        purpose: Purpose of cutting ('general', 'dj_mix', 'social_media', 'ringtone')

    Returns:
        The result dictionary produced by ``understand_music``.
    """
    # Each prompt is written one sentence per literal.
    prompts_by_purpose = {
        "general": (
            "Suggest the best cutting points for this track. "
            "Identify natural edit points where the music flows well for cuts. "
            "Provide timestamps in MM:SS format and explain why each point is good "
            "for editing (e.g., clean transitions, beat drops, phrase endings)."
        ),
        "dj_mix": (
            "Analyze this track for DJ mixing purposes. "
            "Identify the best intro and outro sections for beatmatching, suggest cue "
            "points for mixing, and provide timestamps for clean transitions. "
            "Focus on drum patterns, BPM consistency, and mixable sections."
        ),
        "social_media": (
            "Suggest cutting points for social media content (15-60 seconds). "
            "Identify the most engaging parts of the track, catchy hooks, or impactful moments. "
            "Provide timestamps for creating short, attention-grabbing clips."
        ),
        "ringtone": (
            "Identify the best 15-30 second sections for ringtones. "
            "Look for memorable melodies, catchy choruses, or distinctive instrumental parts. "
            "Provide timestamps and explain why each section would work well as a ringtone."
        ),
    }
    # Unknown purposes use the general-purpose prompt.
    selected_key = purpose if purpose in prompts_by_purpose else "general"
    source_args = {
        "audio_path": audio_path,
        "audio_file": audio_file,
        "filename": filename,
        "youtube_url": youtube_url,
    }
    return understand_music(prompt_text=prompts_by_purpose[selected_key], **source_args)
def analyze_genre_and_style(
    audio_path: Optional[str] = None,
    audio_file: Optional[bytes] = None,
    filename: str = "audio",
    youtube_url: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Ask the model for a genre and production-style breakdown of a track.

    Thin wrapper around ``understand_music`` with a fixed prompt covering
    genre/subgenres, production and mixing, instrumentation, and influences.

    Args:
        audio_path: Path to local audio file or URL
        audio_file: Raw audio bytes
        filename: Original filename for reference
        youtube_url: YouTube URL as alternative audio source

    Returns:
        The result dictionary produced by ``understand_music``.
    """
    # One entry per sentence; joined with single spaces to form the prompt.
    prompt_sentences = [
        "Provide a detailed analysis of this track's genre and production style.",
        "Identify the primary genre and any subgenres or fusion elements.",
        "Describe the production techniques, mixing style, sound design choices, and arrangement.",
        "Analyze the instrumentation, including both traditional and electronic elements.",
        "Discuss the era or period the music seems to draw inspiration from, "
        "and compare it to similar artists or tracks if applicable.",
    ]
    source_args = {
        "audio_path": audio_path,
        "audio_file": audio_file,
        "filename": filename,
        "youtube_url": youtube_url,
    }
    return understand_music(prompt_text=" ".join(prompt_sentences), **source_args)
if __name__ == "__main__":
    # Command-line entry point: one sub-command per analysis function above.
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Music understanding and analysis tools"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # General understanding
    understand_parser = subparsers.add_parser(
        "understand", help="General music analysis"
    )
    understand_parser.add_argument("--audio", help="Path to audio file")
    understand_parser.add_argument("--prompt", help="Custom prompt text")
    understand_parser.add_argument("--youtube", help="YouTube URL")

    # Structure analysis
    structure_parser = subparsers.add_parser("structure", help="Analyze song structure")
    structure_parser.add_argument("--audio", help="Path to audio file")
    structure_parser.add_argument("--youtube", help="YouTube URL")

    # Cutting points
    cutting_parser = subparsers.add_parser("cutting", help="Suggest cutting points")
    cutting_parser.add_argument("--audio", help="Path to audio file")
    cutting_parser.add_argument(
        "--purpose",
        choices=["general", "dj_mix", "social_media", "ringtone"],
        default="general",
        help="Purpose of cutting",
    )
    cutting_parser.add_argument("--youtube", help="YouTube URL")

    # Genre analysis
    genre_parser = subparsers.add_parser("genre", help="Analyze genre and style")
    genre_parser.add_argument("--audio", help="Path to audio file")
    genre_parser.add_argument("--youtube", help="YouTube URL")

    args = parser.parse_args()

    try:
        if args.command == "understand":
            understand_kwargs = {
                "audio_path": args.audio,
                "youtube_url": args.youtube,
            }
            # Without --prompt, understand_music falls back to its own default
            # prompt, so the long default text lives in one place only.
            if args.prompt:
                understand_kwargs["prompt_text"] = args.prompt
            result = understand_music(**understand_kwargs)
        elif args.command == "structure":
            # This sub-command was registered but previously never dispatched.
            result = analyze_music_structure(
                audio_path=args.audio, youtube_url=args.youtube
            )
        elif args.command == "cutting":
            result = suggest_cutting_points(
                audio_path=args.audio, youtube_url=args.youtube, purpose=args.purpose
            )
        elif args.command == "genre":
            result = analyze_genre_and_style(
                audio_path=args.audio, youtube_url=args.youtube
            )
        else:
            # No sub-command given.
            parser.print_help()
            sys.exit(1)

        # Output results
        if result["status"] == "success":
            print(f"Analysis for: {result['filename']}")
            print(f"Source: {result['audio_source']}")
            print(f"Prompt: {result['prompt']}")
            print("\n" + "=" * 50)
            print(result["analysis"])
        else:
            print(f"Error: {result['error']}")
            sys.exit(1)
    except Exception as e:
        # SystemExit is not an Exception subclass, so sys.exit() above is not
        # intercepted here; this only catches unexpected failures.
        print(f"Error: {e}")
        sys.exit(1)