# Spaces: frascuchon / music-mcp
# Running on CPU Upgrade
# File size: 22,476 Bytes

import os
import tempfile
from typing import Optional

import librosa
import numpy as np
import soundfile as sf


def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
    """Load an audio file at its native sample rate.

    Args:
        audio_path: Path handed straight to ``librosa.load``.
        mono: When True, ask librosa to downmix to a single channel.

    Returns:
        Tuple ``(samples, sample_rate)``. Single-channel audio is a 1-D array;
        multi-channel audio is returned as ``(samples, channels)``.
    """
    # sr=None keeps the file's own sample rate instead of librosa's default.
    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
    # librosa hands back multi-channel audio channel-first. The rest of this
    # module uses len(audio) as the sample count and shape[1] as the channel
    # count, so flip every multi-channel array, not just stereo ones.
    if y.ndim > 1:
        y = y.T
    return y, int(sr)


def detect_crossfade_point(
    insert_position: float, audio_duration: float, crossfade_duration: float = 0.1
) -> tuple[float, float]:
    """
    Return the crossfade window centred on an insertion point.

    The window spans half of ``crossfade_duration`` on each side of
    ``insert_position`` and is clamped so it never leaves the audio.

    Args:
        insert_position: Centre of the window (in seconds)
        audio_duration: Total duration of the target audio (in seconds)
        crossfade_duration: Full width of the window (in seconds)

    Returns:
        Tuple of (start_time, end_time), with start_time >= 0 and
        end_time <= audio_duration
    """
    half_window = crossfade_duration / 2
    window_start = insert_position - half_window
    window_end = insert_position + half_window

    # Clamp each edge independently to the bounds of the audio.
    return max(0, window_start), min(audio_duration, window_end)


def apply_crossfade(
    section: np.ndarray, target: np.ndarray, crossfade_duration: float, sample_rate: int
) -> np.ndarray:
    """
    Fade the edges of a section and splice it in front of the target audio.

    The section is copied, its first ``crossfade_duration`` seconds are ramped
    up linearly from silence, its last ``crossfade_duration`` seconds are
    ramped down to silence, and the result is placed immediately before
    ``target``. A caller that passes "everything after the insertion point"
    as ``target`` therefore gets "section, then the rest of the track".

    Args:
        section: Audio to insert, shaped ``(samples,)`` or
            ``(samples, channels)``. It is never modified in place.
        target: Audio that follows the inserted section, same layout.
        crossfade_duration: Length of each edge fade in seconds. Shortened to
            the section length when the section is shorter than the fade.
        sample_rate: Sample rate shared by both arrays, in Hz.

    Returns:
        New array holding the faded section followed by ``target``.

    Raises:
        ValueError: If ``crossfade_duration`` is negative, or if both arrays
            are multi-channel with different channel counts (raised by
            ``np.concatenate``).
    """
    if crossfade_duration < 0:
        raise ValueError("Crossfade duration must be non-negative")

    target = np.asarray(target)

    # Work on a private copy: slicing a NumPy array yields a view, so fading
    # "in place" would silently alter the caller's section.
    faded = np.array(section, copy=True)
    if not np.issubdtype(faded.dtype, np.floating):
        # Fractional gains cannot be stored in an integer array.
        faded = faded.astype(np.float64)

    # Bring a mono/multi-channel mismatch onto the target's layout so the
    # final concatenation is well defined.
    if faded.ndim == 1 and target.ndim == 2:
        faded = np.repeat(faded[:, np.newaxis], target.shape[1], axis=1)
    elif faded.ndim == 2 and target.ndim == 1:
        faded = faded.mean(axis=1)

    # A fade can never be longer than the audio it is applied to.
    fade_samples = min(int(crossfade_duration * sample_rate), len(faded))

    if fade_samples > 0:
        fade_in = np.linspace(0.0, 1.0, fade_samples, dtype=faded.dtype)
        if faded.ndim > 1:
            # Column vector so the same gain applies to every channel.
            fade_in = fade_in[:, np.newaxis]
        faded[:fade_samples] *= fade_in
        faded[-fade_samples:] *= fade_in[::-1]

    return np.concatenate([faded, target], axis=0)


def insert_section(
    audio_path: str,
    section_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert a section from one audio track into another at a given time position.

    The main track is split around ``insert_time``, the section (resampled to
    the main track's sample rate when the rates differ) is spliced in through
    ``apply_crossfade``, and the result is written to disk.

    Args:
        audio_path: Path to the main audio file
        section_path: Path to the audio section to insert
        insert_time: Position to insert the section (in seconds from start of main audio)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_path: Optional output directory (default: None, creates a new temp directory)
        output_format: File extension used for the output file (default: 'wav')

    Returns:
        Path to the written file, named ``<main name>_with_insertion.<output_format>``

    Raises:
        RuntimeError: On any failure (invalid arguments, unreadable input,
            write error). The original exception is attached as ``__cause__``.

    Examples:
        >>> insert_section("main_track.wav", "intro.wav", 5.0, 0.2, "output", "wav")
        # Returns 'output/main_track_with_insertion.wav'

        >>> insert_section("podcast.mp3", "advertisement.mp3", 180.0, 0.5, "output", "mp3")
        # Returns 'output/podcast_with_insertion.mp3'

    Note:
        - Insert position is measured from the start of the main audio
        - The output keeps the main audio's sample rate
        - The split points come from detect_crossfade_point; main audio lying
          between those two points is not carried into the output
    """
    try:
        # Cheap argument checks first, so bad input fails before any decoding.
        if insert_time < 0:
            raise ValueError("Insert time must be non-negative")
        if crossfade_duration < 0:
            raise ValueError("Crossfade duration must be non-negative")

        main_audio, main_sr = _load_audio(audio_path, mono=False)
        main_duration = len(main_audio) / main_sr
        if insert_time > main_duration:
            raise ValueError(
                f"Insert time ({insert_time}s) exceeds main audio duration ({main_duration}s)"
            )

        section_audio, section_sr = _load_audio(section_path, mono=False)

        # Bring the section onto the main track's sample rate.
        if main_sr != section_sr:
            if section_audio.ndim > 1:
                # Resample channel by channel, then restore (samples, channels).
                section_audio = np.stack(
                    [
                        librosa.resample(
                            section_audio[:, ch], orig_sr=section_sr, target_sr=main_sr
                        )
                        for ch in range(section_audio.shape[1])
                    ],
                    axis=1,
                )
            else:
                section_audio = librosa.resample(
                    section_audio, orig_sr=section_sr, target_sr=main_sr
                )

        # Split the main track around the insertion point.
        fade_start, fade_end = detect_crossfade_point(
            insert_time, main_duration, crossfade_duration
        )
        main_before = main_audio[: int(fade_start * main_sr)]
        main_after = main_audio[int(fade_end * main_sr) :]

        # Splice the section onto the trailing part, then prepend the head.
        result = apply_crossfade(section_audio, main_after, crossfade_duration, main_sr)
        final_audio = np.concatenate([main_before, result])

        # Resolve the output directory.
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_inserted")
        else:
            os.makedirs(output_path, exist_ok=True)

        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_with_insertion.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, final_audio, main_sr)

        return output_file

    except Exception as e:
        # Keep the single RuntimeError contract but preserve the root cause.
        raise RuntimeError(f"Error inserting audio section: {str(e)}") from e


def insert_multiple_sections(
    audio_path: str,
    sections: list[tuple[str, float, float]],
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert multiple sections into an audio track at specified positions.

    Sections are sorted by insert time and spliced in one after another
    through ``apply_crossfade``; each one is resampled to the main track's
    sample rate when the rates differ.

    Args:
        audio_path: Path to the main audio file
        sections: List of (section_path, insert_time) tuples
                  section_path: Path to audio section to insert
                  insert_time: Position to insert section (in seconds)
                  A third tuple element is accepted for backward
                  compatibility and ignored.
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_path: Optional output directory (default: None, creates a new temp directory)
        output_format: File extension used for the output file (default: 'wav')

    Returns:
        Path to the written file, named
        ``<main name>_with_multiple_insertions.<output_format>``

    Raises:
        RuntimeError: On any failure (invalid arguments, unreadable input,
            write error). The original exception is attached as ``__cause__``.

    Examples:
        >>> insert_multiple_sections("track.wav", [("intro.wav", 0), ("ad1.wav", 30), ("ad2.wav", 180)], 0.2)
        # Returns '<temp dir>/track_with_multiple_insertions.wav'

        >>> insert_multiple_sections("podcast.mp3", [("sponsor.wav", 60)], 0.3, "output", "mp3")
        # Returns 'output/podcast_with_multiple_insertions.mp3'

    Note:
        - Sections are inserted in chronological order
        - Each insert_time is applied to the audio as it stands after the
          earlier insertions, not to the original timeline
        - An insert_time past the end of the current audio appends the section
    """
    try:
        if crossfade_duration < 0:
            raise ValueError("Crossfade duration must be non-negative")

        # Normalise every entry to (path, time) before touching any file, so
        # the documented 2-tuples work and bad input fails early.
        schedule = []
        for entry in sections:
            if len(entry) < 2:
                raise ValueError(
                    "Each section must be a (section_path, insert_time) tuple"
                )
            section_path, insert_time = entry[0], entry[1]
            if insert_time < 0:
                # A negative time would turn into a negative slice index and
                # silently cut from the end of the audio.
                raise ValueError("Insert time must be non-negative")
            schedule.append((section_path, insert_time))
        schedule.sort(key=lambda item: item[1])

        main_audio, main_sr = _load_audio(audio_path, mono=False)
        main_duration = len(main_audio) / main_sr
        current_audio = main_audio

        for section_path, insert_time in schedule:
            section_audio, section_sr = _load_audio(section_path, mono=False)

            # Bring the section onto the main track's sample rate.
            if section_sr != main_sr:
                if section_audio.ndim > 1:
                    # Resample channel by channel, then restore (samples, channels).
                    section_audio = np.stack(
                        [
                            librosa.resample(
                                section_audio[:, ch],
                                orig_sr=section_sr,
                                target_sr=main_sr,
                            )
                            for ch in range(section_audio.shape[1])
                        ],
                        axis=1,
                    )
                else:
                    section_audio = librosa.resample(
                        section_audio, orig_sr=section_sr, target_sr=main_sr
                    )

            # Split the audio built so far around the insertion point.
            fade_start, fade_end = detect_crossfade_point(
                insert_time, main_duration, crossfade_duration
            )
            current_before = current_audio[: int(fade_start * main_sr)]
            current_after = current_audio[int(fade_end * main_sr) :]

            section_with_fade = apply_crossfade(
                section_audio, current_after, crossfade_duration, main_sr
            )
            current_audio = np.concatenate([current_before, section_with_fade])

            # The audio just grew; the next window is clamped to the new length.
            main_duration = len(current_audio) / main_sr

        # Resolve the output directory.
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_multi_inserted")
        else:
            os.makedirs(output_path, exist_ok=True)

        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_with_multiple_insertions.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, current_audio, main_sr)

        return output_file

    except Exception as e:
        # Keep the single RuntimeError contract but preserve the root cause.
        raise RuntimeError(f"Error inserting multiple sections: {str(e)}") from e


def replace_section(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Replace a section of an audio track with another audio segment.

    The main audio between ``start_time`` and ``end_time`` is dropped and the
    replacement (resampled to the main track's sample rate when the rates
    differ) is put in its place, with a linear fade-in and fade-out on the
    replacement's edges.

    Args:
        audio_path: Path to the main audio file
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment
        crossfade_duration: Length of each edge fade in seconds (default: 0.1)
        output_path: Optional output directory (default: None, creates a new temp directory)
        output_format: File extension used for the output file (default: 'wav')

    Returns:
        Path to the written file, named ``<main name>_replaced.<output_format>``

    Raises:
        RuntimeError: On any failure (invalid arguments, unreadable input,
            write error). The original exception is attached as ``__cause__``.

    Examples:
        >>> replace_section("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "output", "wav")
        # Returns 'output/song_replaced.wav' with 60-90s section replaced

        >>> replace_section("podcast.mp3", 120.0, 150.0, "correction.wav", 0.3, "output", "mp3")
        # Returns 'output/podcast_replaced.mp3' with 120-150s section replaced

    Note:
        - Start time must be non-negative and less than end time
        - Replacement is trimmed if longer than end_time - start_time; a
          shorter replacement is used as is, so the output gets shorter
        - The output keeps the main audio's sample rate
    """
    try:
        # Cheap argument checks first, so bad input fails before any decoding.
        if start_time >= end_time:
            raise ValueError("Start time must be less than end time")
        if start_time < 0:
            # A negative start would become a negative slice index and cut
            # from the end of the audio instead of the beginning.
            raise ValueError("Start time must be non-negative")
        if crossfade_duration < 0:
            raise ValueError("Crossfade duration must be non-negative")

        main_audio, main_sr = _load_audio(audio_path, mono=False)
        replacement_audio, replacement_sr = _load_audio(replacement_path, mono=False)

        # Split the main track around the region being replaced.
        start_sample = int(start_time * main_sr)
        end_sample = int(end_time * main_sr)
        main_before = main_audio[:start_sample]
        main_after = main_audio[end_sample:]

        # Bring the replacement onto the main track's sample rate.
        if replacement_sr != main_sr:
            if replacement_audio.ndim > 1:
                # Resample channel by channel, then restore (samples, channels).
                replacement_audio = np.stack(
                    [
                        librosa.resample(
                            replacement_audio[:, ch],
                            orig_sr=replacement_sr,
                            target_sr=main_sr,
                        )
                        for ch in range(replacement_audio.shape[1])
                    ],
                    axis=1,
                )
            else:
                replacement_audio = librosa.resample(
                    replacement_audio, orig_sr=replacement_sr, target_sr=main_sr
                )

        # Trim to the length of the replaced region. Slicing past the end is a
        # no-op, so a shorter replacement passes through unchanged. Copy so the
        # fades below never write through a view.
        replacement_samples = int((end_time - start_time) * main_sr)
        trimmed_replacement = replacement_audio[:replacement_samples].copy()

        # A fade can never be longer than the audio it is applied to, and a
        # zero-length fade must be skipped: x[-0:] selects the whole array.
        fade_samples = min(int(crossfade_duration * main_sr), len(trimmed_replacement))
        if fade_samples > 0:
            fade_in = np.linspace(0, 1, fade_samples)
            fade_out = np.linspace(1, 0, fade_samples)
            if trimmed_replacement.ndim > 1:
                # Column vectors so the same gain applies to every channel.
                fade_in = fade_in[:, np.newaxis]
                fade_out = fade_out[:, np.newaxis]
            trimmed_replacement[:fade_samples] *= fade_in
            trimmed_replacement[-fade_samples:] *= fade_out

        final_audio = np.concatenate([main_before, trimmed_replacement, main_after])

        # Resolve the output directory.
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_replaced")
        else:
            os.makedirs(output_path, exist_ok=True)

        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_replaced.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        sf.write(output_file, final_audio, main_sr)

        return output_file

    except Exception as e:
        # Keep the single RuntimeError contract but preserve the root cause.
        raise RuntimeError(f"Error replacing audio section: {str(e)}") from e


def insert_section_wrapper(
    audio_path: str,
    insert_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Call insert_section and report failures as text instead of raising.

    The output always goes to a fresh temp directory (``output_path=None``).

    Args:
        audio_path: Path to the main audio file
        insert_path: Path to the audio section to insert
        insert_time: Time to insert the section (in seconds)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the output file on success, otherwise a string starting
        with "Error: " followed by the exception text
    """
    call_kwargs = {
        "audio_path": audio_path,
        "section_path": insert_path,
        "insert_time": insert_time,
        "crossfade_duration": crossfade_duration,
        "output_path": None,
        "output_format": output_format,
    }
    try:
        output_file = insert_section(**call_kwargs)
    except Exception as exc:
        # Any failure is flattened to a message the caller can display.
        return f"Error: {str(exc)}"
    return output_file


def replace_section_wrapper(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Call replace_section and report failures as text instead of raising.

    The output always goes to a fresh temp directory (``output_path=None``).

    Args:
        audio_path: Path to the main audio file
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the output file on success, otherwise a string starting
        with "Error: " followed by the exception text
    """
    call_kwargs = {
        "audio_path": audio_path,
        "start_time": start_time,
        "end_time": end_time,
        "replacement_path": replacement_path,
        "crossfade_duration": crossfade_duration,
        "output_path": None,
        "output_format": output_format,
    }
    try:
        output_file = replace_section(**call_kwargs)
    except Exception as exc:
        # Any failure is flattened to a message the caller can display.
        return f"Error: {str(exc)}"
    return output_file


if __name__ == "__main__":
    # Command-line entry point for running audio insertion/replacement locally.
    #
    # Usage:
    #     python tools/audio_insertion.py insert main.wav insert.wav 30.0
    #     python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Insert or replace audio sections",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Insert section at 30 seconds
  python tools/audio_insertion.py insert main.wav insert.wav 30.0
  
  # Replace section from 10s to 20s
  python tools/audio_insertion.py replace main.wav 10.0 20.0 replacement.wav
  
  # With custom crossfade
  python tools/audio_insertion.py insert main.wav insert.wav 30.0 --crossfade 0.2
        """,
    )

    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # Insert command
    insert_parser = subparsers.add_parser("insert", help="Insert audio section")
    insert_parser.add_argument("main", help="Main audio file")
    insert_parser.add_argument("insert", help="Audio section to insert")
    insert_parser.add_argument("time", type=float, help="Insert time in seconds")
    insert_parser.add_argument(
        "--crossfade",
        type=float,
        default=0.1,
        help="Crossfade duration in seconds (default: 0.1)",
    )
    insert_parser.add_argument(
        "--format",
        choices=["wav", "mp3"],
        default="wav",
        help="Output format (default: wav)",
    )

    # Replace command
    replace_parser = subparsers.add_parser("replace", help="Replace audio section")
    replace_parser.add_argument("main", help="Main audio file")
    replace_parser.add_argument("start", type=float, help="Start time in seconds")
    replace_parser.add_argument("end", type=float, help="End time in seconds")
    replace_parser.add_argument("replacement", help="Replacement audio section")
    replace_parser.add_argument(
        "--crossfade",
        type=float,
        default=0.1,
        help="Crossfade duration in seconds (default: 0.1)",
    )
    replace_parser.add_argument(
        "--format",
        choices=["wav", "mp3"],
        default="wav",
        help="Output format (default: wav)",
    )

    args = parser.parse_args()

    # Subcommands are optional to argparse by default; show help when absent.
    if not args.command:
        parser.print_help()
        sys.exit(1)

    print("Audio Insertion Tool")
    print("=" * 25)

    try:
        result = None

        if args.command == "insert":
            print(f"Main audio: {args.main}")
            print(f"Insert section: {args.insert}")
            print(f"Insert time: {args.time}s")
            print(f"Crossfade: {args.crossfade}s")
            print()

            result = insert_section_wrapper(
                audio_path=args.main,
                insert_path=args.insert,
                insert_time=args.time,
                crossfade_duration=args.crossfade,
                output_format=args.format,
            )

        elif args.command == "replace":
            print(f"Main audio: {args.main}")
            print(f"Replace section: {args.start}s - {args.end}s")
            print(f"Replacement: {args.replacement}")
            print(f"Crossfade: {args.crossfade}s")
            print()

            result = replace_section_wrapper(
                audio_path=args.main,
                start_time=args.start,
                end_time=args.end,
                replacement_path=args.replacement,
                crossfade_duration=args.crossfade,
                output_format=args.format,
            )

        # The wrappers never raise: they return either a path or "Error: ...".
        if result is None:
            print("❌ No command executed")
            sys.exit(1)
        elif result.startswith("Error:"):
            print(f"❌ {result}")
            sys.exit(1)
        else:
            # Appending "ion" to the command only spells a word for "insert";
            # "replace" needs its own noun.
            action_nouns = {"insert": "insertion", "replace": "replacement"}
            action_noun = action_nouns.get(args.command, args.command)
            print(f"✅ Audio {action_noun} completed!")
            print(f"Output saved to: {result}")

    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)