# Spaces: frascuchon / music-mcp
# Running on CPU Upgrade
# File size: 13,169 Bytes

import os
import tempfile
from typing import Any, Dict, Optional

from gradio_client import Client, handle_file

from .audio_info import validate_audio_path


# Extensions that are kept as-is when naming the temporary upload file.
_SUPPORTED_AUDIO_EXTENSIONS = (".wav", ".mp3", ".flac", ".m4a")


def _classify_source(
    audio_path: Optional[str],
    audio_file: Optional[bytes],
    filename: str,
    youtube_url: Optional[str],
) -> tuple:
    """Return ``(source_type, display_name)`` for the input that will be used.

    The priority (path, then bytes, then YouTube URL) mirrors the dispatch in
    ``understand_music`` so success and error payloads label the source alike.
    """
    if audio_path:
        return "path", os.path.basename(audio_path)
    if audio_file:
        return "bytes", filename
    if youtube_url:
        return "youtube", youtube_url
    return "unknown", None


def _call_music_flamingo(
    audio_source: Any, youtube_url: Optional[str], prompt_text: str
) -> Any:
    """Send one ``/infer`` request to the ``nvidia/music-flamingo`` Space."""
    client = Client("nvidia/music-flamingo")
    return client.predict(
        audio_path=audio_source,
        youtube_url=youtube_url if youtube_url else "",
        prompt_text=prompt_text,
        api_name="/infer",
    )


def understand_music(
    audio_path: Optional[str] = None,
    audio_file: Optional[bytes] = None,
    filename: str = "audio",
    prompt_text: str = "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
    youtube_url: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Analyze music using NVIDIA's Music-Flamingo Audio Language Model.

    Exactly one audio source is used, chosen in this order: ``audio_path``,
    then ``audio_file``, then ``youtube_url``. The prompt and the source are
    sent to the ``/infer`` endpoint of the ``nvidia/music-flamingo`` Space.

    Args:
        audio_path: Path to local audio file or URL (supports WAV, MP3, FLAC, M4A)
        audio_file: Raw audio bytes (alternative to audio_path)
        filename: Original filename for reference (used with audio_file). Only
            its final path component is used to name the temporary upload file.
        prompt_text: Custom prompt for analysis (default: comprehensive music description)
        youtube_url: YouTube URL as alternative audio source

    Returns:
        Dictionary with analysis results. On success:
        {
            "analysis": <value returned by the Space>,
            "audio_source": "path" or "bytes" or "youtube",
            "filename": "Original filename",
            "prompt": "Used prompt text",
            "status": "success",
        }
        On failure "status" is "error", "analysis" is None, "error" holds the
        exception message and "audio_source" is "path", "bytes", "youtube" or
        "unknown" (when no source was supplied).

    Note:
        This function does not raise. Missing inputs, validation failures and
        API/network errors are all caught and reported through the error
        dictionary described above.

    Examples:
        # Basic analysis with local file
        result = understand_music(audio_path="song.mp3")
        print(result["analysis"])

        # Custom prompt for finding cut points
        result = understand_music(
            audio_path="song.mp3",
            prompt_text="Identify the best cutting points for editing - suggest specific time stamps where verses, choruses, and bridges begin and end."
        )

        # Analysis with YouTube URL
        result = understand_music(
            youtube_url="https://youtube.com/watch?v=example",
            prompt_text="Analyze the structure and suggest optimal edit points."
        )
    """
    # Computed before anything can fail so the error payload can always say
    # which kind of source was requested.
    source_type, fallback_name = _classify_source(
        audio_path, audio_file, filename, youtube_url
    )

    try:
        if source_type == "unknown":
            raise ValueError(
                "Either audio_path, audio_file, or youtube_url must be provided"
            )

        if source_type == "path":
            validated_path = validate_audio_path(audio_path)
            source_filename = os.path.basename(validated_path)
            analysis = _call_music_flamingo(
                handle_file(validated_path), youtube_url, prompt_text
            )

        elif source_type == "bytes":
            if not filename:
                raise ValueError("Filename must be provided when using audio_file")

            # Keep only the last path component: the name comes from the
            # caller and must not be able to escape the temporary directory.
            temp_filename = os.path.basename(filename) or "audio"
            if not temp_filename.lower().endswith(_SUPPORTED_AUDIO_EXTENSIONS):
                temp_filename = f"{temp_filename}.wav"

            # The context manager removes the directory and its contents even
            # when writing the file or the API call fails.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_file_path = os.path.join(temp_dir, temp_filename)
                with open(temp_file_path, "wb") as f:
                    f.write(audio_file)
                analysis = _call_music_flamingo(
                    handle_file(temp_file_path), youtube_url, prompt_text
                )
            source_filename = filename

        else:
            # YouTube: the URL is passed both as the audio source and as the
            # dedicated youtube_url field, as the original implementation did.
            source_filename = youtube_url
            analysis = _call_music_flamingo(youtube_url, youtube_url, prompt_text)

        return {
            "analysis": analysis,
            "audio_source": source_type,
            "filename": source_filename,
            "prompt": prompt_text,
            "status": "success",
        }

    except Exception as e:
        return {
            "analysis": None,
            "audio_source": source_type,
            "filename": fallback_name,
            "prompt": prompt_text,
            "status": "error",
            "error": str(e),
        }


def analyze_music_structure(
    audio_path: Optional[str] = None,
    audio_file: Optional[bytes] = None,
    filename: str = "audio",
    youtube_url: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Request a timestamped breakdown of a track's sections.

    Thin wrapper around ``understand_music`` that supplies a fixed prompt
    asking for section boundaries (intro, verses, choruses, bridge, ...) and
    for transitions relevant to editing.

    Args:
        audio_path: Path to local audio file or URL
        audio_file: Raw audio bytes
        filename: Original filename for reference
        youtube_url: YouTube URL as alternative audio source

    Returns:
        The dictionary returned by ``understand_music`` for the structure prompt
    """
    # One entry per sentence group; joined with single spaces into the prompt.
    prompt_parts = [
        "Analyze the structure of this music track. Identify and timestamp the different sections:",
        "intro, verses, choruses, pre-chorus, bridge, instrumental breaks, solo sections, and outro/outro.",
        "Provide specific time stamps (in MM:SS format) for where each section begins and ends.",
        "Also note any transitions, buildups, or breakdowns that would be important for editing.",
    ]

    request = {
        "audio_path": audio_path,
        "audio_file": audio_file,
        "filename": filename,
        "prompt_text": " ".join(prompt_parts),
        "youtube_url": youtube_url,
    }
    return understand_music(**request)


def suggest_cutting_points(
    audio_path: Optional[str] = None,
    audio_file: Optional[bytes] = None,
    filename: str = "audio",
    youtube_url: Optional[str] = None,
    purpose: str = "general",
) -> Dict[str, Any]:
    """
    Ask the model where a track can best be cut for a given use case.

    The ``purpose`` selects one of four canned prompts; any value that is not
    a known purpose falls back to the "general" prompt.

    Args:
        audio_path: Path to local audio file or URL
        audio_file: Raw audio bytes
        filename: Original filename for reference
        youtube_url: YouTube URL as alternative audio source
        purpose: Purpose of cutting ('general', 'dj_mix', 'social_media', 'ringtone')

    Returns:
        The dictionary returned by ``understand_music`` for the selected prompt
    """
    general_prompt = (
        "Suggest the best cutting points for this track. "
        "Identify natural edit points where the music flows well for cuts. "
        "Provide timestamps in MM:SS format and explain why each point is good for editing "
        "(e.g., clean transitions, beat drops, phrase endings)."
    )
    prompts_by_purpose = {
        "general": general_prompt,
        "dj_mix": (
            "Analyze this track for DJ mixing purposes. "
            "Identify the best intro and outro sections for beatmatching, "
            "suggest cue points for mixing, and provide timestamps for clean transitions. "
            "Focus on drum patterns, BPM consistency, and mixable sections."
        ),
        "social_media": (
            "Suggest cutting points for social media content (15-60 seconds). "
            "Identify the most engaging parts of the track, catchy hooks, or impactful moments. "
            "Provide timestamps for creating short, attention-grabbing clips."
        ),
        "ringtone": (
            "Identify the best 15-30 second sections for ringtones. "
            "Look for memorable melodies, catchy choruses, or distinctive instrumental parts. "
            "Provide timestamps and explain why each section would work well as a ringtone."
        ),
    }

    # Unknown purposes silently use the general-purpose prompt.
    try:
        selected_prompt = prompts_by_purpose[purpose]
    except KeyError:
        selected_prompt = general_prompt

    return understand_music(
        audio_path=audio_path,
        audio_file=audio_file,
        filename=filename,
        prompt_text=selected_prompt,
        youtube_url=youtube_url,
    )


def analyze_genre_and_style(
    audio_path: Optional[str] = None,
    audio_file: Optional[bytes] = None,
    filename: str = "audio",
    youtube_url: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Request an in-depth description of a track's genre and production style.

    Thin wrapper around ``understand_music`` with a fixed prompt covering
    genre/subgenres, production and mixing, instrumentation, and influences.

    Args:
        audio_path: Path to local audio file or URL
        audio_file: Raw audio bytes
        filename: Original filename for reference
        youtube_url: YouTube URL as alternative audio source

    Returns:
        The dictionary returned by ``understand_music`` for the genre prompt
    """
    # One entry per sentence; joined with single spaces into the final prompt.
    prompt_sentences = (
        "Provide a detailed analysis of this track's genre and production style.",
        "Identify the primary genre and any subgenres or fusion elements.",
        "Describe the production techniques, mixing style, sound design choices, and arrangement.",
        "Analyze the instrumentation, including both traditional and electronic elements.",
        "Discuss the era or period the music seems to draw inspiration from, "
        "and compare it to similar artists or tracks if applicable.",
    )
    style_prompt = " ".join(prompt_sentences)

    return understand_music(
        audio_path=audio_path,
        audio_file=audio_file,
        filename=filename,
        prompt_text=style_prompt,
        youtube_url=youtube_url,
    )


def _build_cli_parser():
    """Build the command-line parser with one sub-command per analysis helper."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Music understanding and analysis tools"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # General understanding
    understand_parser = subparsers.add_parser(
        "understand", help="General music analysis"
    )
    understand_parser.add_argument("--audio", help="Path to audio file")
    understand_parser.add_argument("--prompt", help="Custom prompt text")
    understand_parser.add_argument("--youtube", help="YouTube URL")

    # Structure analysis
    structure_parser = subparsers.add_parser("structure", help="Analyze song structure")
    structure_parser.add_argument("--audio", help="Path to audio file")
    structure_parser.add_argument("--youtube", help="YouTube URL")

    # Cutting points
    cutting_parser = subparsers.add_parser("cutting", help="Suggest cutting points")
    cutting_parser.add_argument("--audio", help="Path to audio file")
    cutting_parser.add_argument(
        "--purpose",
        choices=["general", "dj_mix", "social_media", "ringtone"],
        default="general",
        help="Purpose of cutting",
    )
    cutting_parser.add_argument("--youtube", help="YouTube URL")

    # Genre analysis
    genre_parser = subparsers.add_parser("genre", help="Analyze genre and style")
    genre_parser.add_argument("--audio", help="Path to audio file")
    genre_parser.add_argument("--youtube", help="YouTube URL")

    return parser


def main(argv=None):
    """Run the command-line interface and return a process exit code.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.

    Returns:
        0 when the analysis succeeded, 1 when no command was given, the
        analysis reported an error, or an unexpected exception occurred.
        Invalid arguments still exit through argparse itself.
    """
    parser = _build_cli_parser()
    args = parser.parse_args(argv)

    try:
        if args.command == "understand":
            # Pass a prompt only when one was given, so the default prompt is
            # defined in a single place (the understand_music signature).
            prompt_kwargs = {"prompt_text": args.prompt} if args.prompt else {}
            result = understand_music(
                audio_path=args.audio, youtube_url=args.youtube, **prompt_kwargs
            )

        elif args.command == "structure":
            # This sub-command was registered but previously never dispatched.
            result = analyze_music_structure(
                audio_path=args.audio, youtube_url=args.youtube
            )

        elif args.command == "cutting":
            result = suggest_cutting_points(
                audio_path=args.audio, youtube_url=args.youtube, purpose=args.purpose
            )

        elif args.command == "genre":
            result = analyze_genre_and_style(
                audio_path=args.audio, youtube_url=args.youtube
            )

        else:
            parser.print_help()
            return 1

        # Output results
        if result["status"] != "success":
            print(f"Error: {result['error']}")
            return 1

        print(f"Analysis for: {result['filename']}")
        print(f"Source: {result['audio_source']}")
        print(f"Prompt: {result['prompt']}")
        print("\n" + "=" * 50)
        print(result["analysis"])
        return 0

    except Exception as e:
        print(f"Error: {e}")
        return 1


if __name__ == "__main__":
    # SystemExit is raised directly instead of relying on the site-provided exit().
    raise SystemExit(main())