# Spaces:
#
# frascuchon
# /
#
# music-mcp
#
# Running on CPU Upgrade
#
# File size: 16,567 Bytes

import argparse
import os
import ssl
import sys
import tempfile
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Optional

from gradio_client import Client, handle_file
from gradio_client.client import DEFAULT_TEMP_DIR

# Handle imports for both module and script usage
try:
    from tools.audio_info import validate_audio_path
except ImportError:
    from audio_info import validate_audio_path


def resolve_audio_path(audio_path: str) -> str:
    """
    Turn an audio reference into a path on the local filesystem.

    Args:
        audio_path: Local file path, or an http:// / https:// URL

    Returns:
        For URLs, the path of the downloaded copy; otherwise whatever
        validate_audio_path returns for the given local path

    Raises:
        ValueError: If audio_path is empty
        RuntimeError: If URL download fails
    """
    if audio_path:
        is_remote = audio_path.startswith(("http://", "https://"))
        # Remote references are fetched to disk; anything else is treated as
        # a local path and handed to the validator.
        handler = download_audio_from_url if is_remote else validate_audio_path
        return handler(audio_path)

    raise ValueError("Audio path cannot be empty")


def download_audio_from_url(url: str, output_path: Optional[str] = None) -> str:
    """
    Download audio from URL to temporary file or specified output path.

    Up to three download strategies are tried in order; the first one that
    leaves a non-empty file on disk wins. Whatever a failed attempt wrote is
    removed before the next attempt (and before the final error is raised).

    Args:
        url: URL to audio file
        output_path: Optional custom output path (if None, a uniquely named
            ``voice_replacement_<timestamp>_*.wav`` file is created in the
            system temp directory)

    Returns:
        Path to downloaded file

    Raises:
        RuntimeError: If every download strategy fails; the last underlying
            error is attached as ``__cause__``
    """
    if output_path:
        temp_path = output_path
    else:
        # A timestamp alone is not unique: two downloads started within the
        # same second (e.g. source and target both given as URLs) would share
        # one path and overwrite each other. mkstemp reserves a unique name.
        # The ".wav" suffix is kept regardless of the remote file's format.
        fd, temp_path = tempfile.mkstemp(
            prefix=f"voice_replacement_{datetime.now().strftime('%Y%m%d_%H%M%S')}_",
            suffix=".wav",
            dir=tempfile.gettempdir(),
        )
        os.close(fd)

    # NOTE(security): attempt 2 disables certificate validation, so a failed
    # verification silently degrades to an unauthenticated download. It is
    # kept for compatibility with hosts that use self-signed certificates.
    download_methods = [
        # Attempt 1: server certificate must validate (ssl.CERT_REQUIRED)
        lambda: _download_with_ssl_context(
            url, temp_path, ssl.VerifyMode.CERT_REQUIRED
        ),
        # Attempt 2: certificate validation disabled (ssl.CERT_NONE)
        lambda: _download_with_ssl_context(url, temp_path, ssl.VerifyMode.CERT_NONE),
        # Attempt 3: urllib's default handling, no explicit SSL context
        lambda: _download_no_ssl(url, temp_path),
    ]

    last_error = None
    for download_method in download_methods:
        try:
            download_method()
            if not os.path.exists(temp_path) or os.path.getsize(temp_path) == 0:
                raise RuntimeError(f"Downloaded file is empty or missing: {temp_path}")
            return temp_path
        except Exception as e:
            last_error = e
            # Remove partial/empty output after every failed attempt,
            # including the last one, so no stray file is left behind.
            try:
                if os.path.exists(temp_path):
                    os.remove(temp_path)
            except OSError:
                pass

    raise RuntimeError(
        f"Failed to download audio from URL {url}. Last error: {str(last_error)}"
    ) from last_error


def _download_with_ssl_context(
    url: str, temp_path: str, verify_mode: ssl.VerifyMode, timeout: float = 60.0
) -> None:
    """
    Download ``url`` to ``temp_path`` using an SSL context with ``verify_mode``.

    Args:
        url: URL to fetch
        temp_path: Destination file path (overwritten if it exists)
        verify_mode: Certificate verification mode for the SSL context.
            Hostname checking stays enabled unless this is ``ssl.CERT_NONE``.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``

    Raises:
        Whatever ``urllib.request.urlopen`` or the file write raises
        (e.g. ``urllib.error.URLError``, ``OSError``).
    """
    ssl_context = ssl.create_default_context()
    if verify_mode == ssl.CERT_NONE:
        # Hostname checking has to be turned off before verification can be
        # disabled; the ssl module rejects CERT_NONE while it is still on.
        ssl_context.check_hostname = False
    # For verifying modes the default context's hostname check is left on, so
    # a valid certificate issued for a different host is not accepted.
    ssl_context.verify_mode = verify_mode

    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; Voice-Replacement-Tool/1.0)")

    with urllib.request.urlopen(req, timeout=timeout, context=ssl_context) as response:
        with open(temp_path, "wb") as f:
            # Stream in chunks so large audio files are never held fully in memory.
            while chunk := response.read(64 * 1024):
                f.write(chunk)


def _download_no_ssl(url: str, temp_path: str, timeout: float = 60.0) -> None:
    """
    Download ``url`` to ``temp_path`` without passing an explicit SSL context.

    Despite the function name, this does not disable certificate checks:
    ``urllib.request.urlopen`` is called with no ``context`` argument, so
    urllib applies its own default HTTPS handling.

    Args:
        url: URL to fetch
        temp_path: Destination file path (overwritten if it exists)
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``

    Raises:
        Whatever ``urllib.request.urlopen`` or the file write raises
        (e.g. ``urllib.error.URLError``, ``OSError``).
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (compatible; Voice-Replacement-Tool/1.0)")

    # No context argument: urllib's default opener is used
    with urllib.request.urlopen(req, timeout=timeout) as response:
        with open(temp_path, "wb") as f:
            # Stream in chunks so large audio files are never held fully in memory.
            while chunk := response.read(64 * 1024):
                f.write(chunk)


def cleanup_temp_file(file_path: str) -> None:
    """
    Best-effort removal of a file located inside the system temp directory.

    Files outside ``tempfile.gettempdir()`` are never touched. Containment is
    decided on path components, not on a raw string prefix, so a sibling
    directory such as ``/tmp_other`` does not count as being inside ``/tmp``.

    Args:
        file_path: Path to temporary file (empty/None is ignored)
    """
    if not file_path:
        return

    try:
        temp_root = os.path.normcase(os.path.realpath(tempfile.gettempdir()))
        absolute = os.path.abspath(file_path)
        # Resolve symlinks in the parent directory only, so that a symlink
        # stored in the temp directory is judged by where the link lives.
        candidate = os.path.normcase(
            os.path.join(
                os.path.realpath(os.path.dirname(absolute)),
                os.path.basename(absolute),
            )
        )
        inside_temp = (
            candidate != temp_root
            and os.path.commonpath([temp_root, candidate]) == temp_root
        )
        if inside_temp:
            os.remove(candidate)
    except (OSError, ValueError):
        # Cleanup is deliberately best-effort: a missing file, a permission
        # problem, or paths on different drives (ValueError from commonpath)
        # must not mask the caller's real result or error.
        pass


def _validate_conversion_params(
    diffusion_steps: int,
    length_adjust: float,
    inference_cfg_rate: float,
    pitch_shift: int,
) -> None:
    """Raise ValueError if any numeric conversion setting is out of range."""
    if diffusion_steps < 1 or diffusion_steps > 50:
        raise ValueError("diffusion_steps must be between 1 and 50")
    if length_adjust <= 0:
        raise ValueError("length_adjust must be positive")
    if not 0 <= inference_cfg_rate <= 1:
        raise ValueError("inference_cfg_rate must be between 0 and 1")
    if pitch_shift < -12 or pitch_shift > 12:
        raise ValueError("pitch_shift must be between -12 and 12 semitones")


def _store_result_item(item, output_path: Path) -> bool:
    """Save one prediction output to output_path; return False if its shape is unrecognized."""
    import shutil

    # A URL may be carried either as a dict entry or as an attribute.
    if isinstance(item, dict):
        url = item.get("url")
    else:
        url = getattr(item, "url", None)
    if url:
        download_audio_from_url(url, str(output_path))
        return True

    if isinstance(item, str) and os.path.exists(item):
        # The item is a file already present on this machine.
        shutil.copy2(item, output_path)
        return True

    return False


def _describe_seed_vc_failure(error_msg: str) -> str:
    """Map a raw error message to the user-facing troubleshooting text."""
    if "403" in error_msg or "Forbidden" in error_msg:
        return (
            "Seed-VC access denied. This may indicate:\n"
            "1. Files are in unsupported format\n"
            "2. Files are too large\n"
            "3. Temporary space restrictions\n"
            "4. Authentication required\n\n"
            "TROUBLESHOOTING:\n"
            "• Try different audio files (WAV, MP3, FLAC, M4A)\n"
            "• Use smaller files (< 30MB recommended)\n"
            "• Check if files are corrupted\n"
            "• Try again later if rate limited\n"
            "• Consider using a different voice source/target"
        )
    if "404" in error_msg or "Not Found" in error_msg:
        return (
            "Seed-VC cannot find one or both files. "
            "Check if:\n"
            "• Files exist and are accessible\n"
            "• File paths are correct\n"
            "• Files are in supported format (WAV, MP3, FLAC, M4A)\n"
            "• Manual download was successful"
        )
    if "timeout" in error_msg.lower():
        return (
            "Seed-VC connection timeout. "
            "Try:\n"
            "• Using fewer diffusion steps (5-10)\n"
            "• Smaller audio files\n"
            "• Processing again later\n"
            "• Checking internet connection"
        )
    return f"Voice replacement failed: {error_msg}"


def replace_voice(
    source_audio_path: str,
    target_audio_path: str,
    diffusion_steps: int = 10,
    length_adjust: float = 1.0,
    inference_cfg_rate: float = 0.7,
    f0_condition: bool = False,
    auto_f0_adjust: bool = True,
    pitch_shift: int = 0,
) -> str:
    """
    Replace voice in source audio with voice from target audio using Seed-VC.

    The numeric settings are validated first, then both inputs are resolved
    to local files (URLs are downloaded), sent to the "frascuchon/Seed-VC"
    Gradio space through its "/predict_1" endpoint, and the returned audio is
    stored under gradio_client's DEFAULT_TEMP_DIR. Files downloaded for URL
    inputs are removed again before the function returns or raises.

    Examples:
        >>> replace_voice("source.wav", "target.wav")
        # Returns 'path/to/source_voice_replaced_by_target_20251126_143022.wav'

        >>> replace_voice("https://example.com/source.wav", "target.wav", diffusion_steps=15)
        # Downloads source audio and replaces voice with target voice

        >>> replace_voice("source.wav", "https://example.com/voice.mp3", pitch_shift=2)
        # Downloads target voice and applies to source with pitch shift

    Args:
        source_audio_path: Path to source audio file or URL (voice to be replaced)
                         Supports local files and HTTP/HTTPS URLs
        target_audio_path: Path to target audio file or URL (voice to use)
                         Supports local files and HTTP/HTTPS URLs
        diffusion_steps: Number of diffusion steps for inference, 1-50 (default: 10)
        length_adjust: Length adjustment factor, must be > 0 (default: 1.0)
        inference_cfg_rate: Classifier-free guidance rate, 0-1 (default: 0.7)
        f0_condition: Whether to use F0 conditioning (default: False)
        auto_f0_adjust: Whether to auto-adjust F0 (default: True)
        pitch_shift: Pitch shift in semitones, -12 to 12 (default: 0)

    Returns:
        Path to generated voice-replaced audio file

    Raises:
        RuntimeError: On any failure. Invalid parameters, unreadable inputs,
            download problems and Seed-VC errors are all re-raised as
            RuntimeError with the original exception attached as
            ``__cause__``.
    """
    remote_prefixes = ("http://", "https://")
    source_temp_file = None
    target_temp_file = None

    try:
        # Validate before any network or disk work so bad settings fail fast.
        _validate_conversion_params(
            diffusion_steps, length_adjust, inference_cfg_rate, pitch_shift
        )

        # Resolve input paths (handle both URLs and local files). Each
        # download is registered for cleanup right away, so a failure while
        # resolving the target does not leak the already-downloaded source.
        source_abs_path = resolve_audio_path(source_audio_path)
        if source_audio_path.startswith(remote_prefixes):
            source_temp_file = source_abs_path
        target_abs_path = resolve_audio_path(target_audio_path)
        if target_audio_path.startswith(remote_prefixes):
            target_temp_file = target_abs_path

        # Initialize Seed-VC client with manual file handling
        client = Client("frascuchon/Seed-VC", download_files=False)

        # Perform voice replacement
        result = client.predict(
            source_audio_path=handle_file(source_abs_path),
            target_audio_path=handle_file(target_abs_path),
            diffusion_steps=diffusion_steps,
            length_adjust=length_adjust,
            inference_cfg_rate=inference_cfg_rate,
            f0_condition=f0_condition,
            auto_f0_adjust=auto_f0_adjust,
            pitch_shift=pitch_shift,
            api_name="/predict_1",
        )

        # Create output directory
        output_dir = Path(DEFAULT_TEMP_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Generate output filename with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        source_name = Path(source_abs_path).stem
        target_name = Path(target_abs_path).stem
        output_path = output_dir / (
            f"{source_name}_voice_replaced_by_{target_name}_{timestamp}.wav"
        )

        if isinstance(result, (tuple, list)):
            if not result:
                raise RuntimeError("Seed-VC returned no outputs")
            # With several outputs the second one is used, otherwise the only one.
            item = result[1] if len(result) > 1 else result[0]
            if not _store_result_item(item, output_path):
                raise RuntimeError(f"Unexpected result format in tuple: {item}")
        elif not _store_result_item(result, output_path):
            # Neither a URL holder nor a local path: treat the result as
            # audio data and write it directly at 22050 Hz.
            import soundfile as sf

            sf.write(str(output_path), result, 22050)

        return str(output_path)

    except Exception as e:
        # Translate the failure into user-facing guidance, keeping the cause.
        raise RuntimeError(_describe_seed_vc_failure(str(e))) from e

    finally:
        # Always clean up temporary files
        if source_temp_file:
            cleanup_temp_file(source_temp_file)
        if target_temp_file:
            cleanup_temp_file(target_temp_file)


def replace_voice_wrapper(
    source_audio_path: str,
    target_audio_path: str,
    diffusion_steps: int = 10,
    length_adjust: float = 1.0,
    inference_cfg_rate: float = 0.7,
    f0_condition: bool = False,
    auto_f0_adjust: bool = True,
    pitch_shift: int = 0,
) -> str:
    """
    Non-raising front end to replace_voice, intended for MCP integration.

    All arguments are forwarded unchanged to replace_voice. Any exception it
    raises is converted into a string of the form "Error: <message>" instead
    of propagating.

    Args:
        source_audio_path: Path to input audio file or URL
        target_audio_path: Path to target audio file or URL
        diffusion_steps: Number of diffusion steps (default: 10)
        length_adjust: Length adjustment factor (default: 1.0)
        inference_cfg_rate: CFG rate (default: 0.7)
        f0_condition: Use F0 conditioning (default: False)
        auto_f0_adjust: Auto-adjust F0 (default: True)
        pitch_shift: Pitch shift in semitones (default: 0)

    Returns:
        Path to generated audio file or error message

    Note for URL usage:
    Some URLs may be blocked by Seed-VC space restrictions.
    If URL processing fails with access errors, try:
    1. Download the file manually using your browser
    2. Save it locally and use the local file path
    3. Use a different audio source or target
    """
    forwarded = {
        "source_audio_path": source_audio_path,
        "target_audio_path": target_audio_path,
        "diffusion_steps": diffusion_steps,
        "length_adjust": length_adjust,
        "inference_cfg_rate": inference_cfg_rate,
        "f0_condition": f0_condition,
        "auto_f0_adjust": auto_f0_adjust,
        "pitch_shift": pitch_shift,
    }
    try:
        output_file = replace_voice(**forwarded)
    except Exception as failure:
        # Report the problem as text rather than raising.
        return f"Error: {str(failure)}"
    return output_file


if __name__ == "__main__":
    # Script section for running voice replacement locally.
    #
    # Usage:
    #     python tools/voice_replacement.py source.wav target.wav
    #     python tools/voice_replacement.py source.wav target.wav --steps 15 --pitch 2
    #     python tools/voice_replacement.py https://example.com/source.wav target.wav
    #     python tools/voice_replacement.py source.wav https://example.com/target.mp3 --pitch 2

    # Add parent directory to path for imports
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.insert(0, project_root)

    # Shown verbatim after the option list in --help output.
    usage_examples = """
Examples:
  python tools/voice_replacement.py source.wav target.wav
  python tools/voice_replacement.py source.wav target.wav --steps 15 --pitch 2
  python tools/voice_replacement.py source.wav target.wav --f0-condition --no-auto-f0
  python tools/voice_replacement.py https://example.com/source.wav target.wav
  python tools/voice_replacement.py source.wav https://example.com/target.mp3 --pitch 2
        """

    cli = argparse.ArgumentParser(
        description="Voice replacement using Seed-VC",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Positional inputs
    cli.add_argument("source", help="Source audio path or URL (voice to be replaced)")
    cli.add_argument("target", help="Target audio path or URL (voice to use)")

    # Numeric tuning options
    cli.add_argument(
        "--steps", type=int, default=10, help="Diffusion steps (1-50, default: 10)"
    )
    cli.add_argument(
        "--length",
        type=float,
        default=1.0,
        help="Length adjustment (0.1-3.0, default: 1.0)",
    )
    cli.add_argument(
        "--cfg",
        type=float,
        default=0.7,
        help="Inference CFG rate (0.0-1.0, default: 0.7)",
    )

    # F0 switches
    cli.add_argument(
        "--f0-condition", action="store_true", help="Enable F0 conditioning"
    )
    cli.add_argument(
        "--no-auto-f0", action="store_true", help="Disable auto F0 adjustment"
    )

    cli.add_argument(
        "--pitch",
        type=int,
        default=0,
        help="Pitch shift semitones (-12 to 12, default: 0)",
    )

    opts = cli.parse_args()
    # The flag is negative on the command line; the API takes the positive form.
    auto_f0 = not opts.no_auto_f0

    summary_lines = (
        "Voice Replacement Tool",
        "=" * 30,
        f"Source: {opts.source}",
        f"Target: {opts.target}",
        f"Parameters: steps={opts.steps}, length={opts.length}, cfg={opts.cfg}",
        f"F0 condition={opts.f0_condition}, auto F0={auto_f0}, pitch={opts.pitch}",
        "",
    )
    for summary_line in summary_lines:
        print(summary_line)

    try:
        output_file = replace_voice(
            source_audio_path=opts.source,
            target_audio_path=opts.target,
            diffusion_steps=opts.steps,
            length_adjust=opts.length,
            inference_cfg_rate=opts.cfg,
            f0_condition=opts.f0_condition,
            auto_f0_adjust=auto_f0,
            pitch_shift=opts.pitch,
        )

        print("✅ Voice replacement completed!")
        print(f"Output saved to: {output_file}")

    except Exception as failure:
        # Any failure ends the script with a non-zero exit status.
        print(f"❌ Error: {failure}")
        sys.exit(1)