"""Voice replacement tool built on the Seed-VC Gradio space.

Provides helpers to resolve/download input audio, run the remote voice
conversion and store the converted audio locally, plus a small CLI.
"""

import argparse
import logging
import os
import re
import shutil
import ssl
import sys
import tempfile
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Optional

from gradio_client import Client, handle_file
from gradio_client.client import DEFAULT_TEMP_DIR

# Handle imports for both module and script usage
try:
    from tools.audio_info import validate_audio_path
except ImportError:
    from audio_info import validate_audio_path

logger = logging.getLogger(__name__)

# Prefixes that mark an input as a remote URL rather than a local path.
_URL_PREFIXES = ("http://", "https://")
_USER_AGENT = "Mozilla/5.0 (compatible; Voice-Replacement-Tool/1.0)"


def resolve_audio_path(audio_path: str) -> str:
    """
    Resolve audio path - handle both local files and URLs.

    Args:
        audio_path: Path to local audio file or URL

    Returns:
        Path to local audio file (downloads if URL)

    Raises:
        ValueError: If path is invalid
        RuntimeError: If URL download fails
    """
    if not audio_path:
        raise ValueError("Audio path cannot be empty")

    if audio_path.startswith(_URL_PREFIXES):
        return download_audio_from_url(audio_path)

    # Local file: delegate validation to the shared helper
    return validate_audio_path(audio_path)


def download_audio_from_url(url: str, output_path: Optional[str] = None) -> str:
    """
    Download audio from URL to temporary file or specified output path.

    Three download strategies are attempted in order; the first one that
    produces a non-empty file wins. Any partial file is removed after every
    failed attempt, including the last one.

    Args:
        url: URL to audio file
        output_path: Optional custom output path (if None, a unique file is
            created in the system temp directory)

    Returns:
        Path to downloaded file

    Raises:
        RuntimeError: If download fails
    """
    if output_path:
        temp_path = output_path
    else:
        # mkstemp guarantees a unique name; a timestamp alone collides when
        # two downloads start within the same second.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        fd, temp_path = tempfile.mkstemp(
            prefix=f"voice_replacement_{timestamp}_",
            suffix=".wav",
            dir=tempfile.gettempdir(),
        )
        os.close(fd)

    # Try multiple download methods
    download_methods = [
        # Method 1: fully verified TLS
        lambda: _download_with_ssl_context(
            url, temp_path, ssl.VerifyMode.CERT_REQUIRED
        ),
        # Method 2: Relaxed SSL (ignore cert errors)
        # NOTE(security): this accepts any certificate; it is kept for
        # compatibility with hosts that have broken TLS setups.
        lambda: _download_with_ssl_context(url, temp_path, ssl.VerifyMode.CERT_NONE),
        # Method 3: urllib defaults (no explicit SSL context)
        lambda: _download_no_ssl(url, temp_path),
    ]

    last_error = None
    for download_method in download_methods:
        try:
            download_method()
            if not os.path.exists(temp_path) or os.path.getsize(temp_path) == 0:
                raise RuntimeError(f"Downloaded file is empty or missing: {temp_path}")
            return temp_path
        except Exception as e:
            last_error = e
            # Clean up partial download before trying the next method (or
            # before giving up).
            _remove_if_exists(temp_path)

    raise RuntimeError(
        f"Failed to download audio from URL {url}. Last error: {str(last_error)}"
    ) from last_error


def _remove_if_exists(path: str) -> None:
    """Remove *path*, ignoring a missing file or other OS-level failures."""
    try:
        os.remove(path)
    except OSError:
        pass


def _fetch_to_file(
    url: str, temp_path: str, ssl_context: Optional[ssl.SSLContext] = None
) -> None:
    """Stream the response body of *url* into *temp_path*."""
    req = urllib.request.Request(url)
    req.add_header("User-Agent", _USER_AGENT)

    with urllib.request.urlopen(req, context=ssl_context) as response:
        with open(temp_path, "wb") as f:
            # Stream in chunks instead of loading the whole file into memory
            shutil.copyfileobj(response, f)


def _download_with_ssl_context(
    url: str, temp_path: str, verify_mode: ssl.VerifyMode
) -> None:
    """Download with specific SSL certificate mode."""
    ssl_context = ssl.create_default_context()
    # check_hostname must be off before verify_mode can be lowered to CERT_NONE
    ssl_context.check_hostname = False
    ssl_context.verify_mode = verify_mode

    if verify_mode == ssl.CERT_REQUIRED:
        # Full verification also validates the host name
        ssl_context.check_hostname = True
    elif verify_mode == ssl.CERT_NONE:
        logger.warning(
            "Retrying download without TLS certificate verification: %s", url
        )

    _fetch_to_file(url, temp_path, ssl_context)


def _download_no_ssl(url: str, temp_path: str) -> None:
    """Download without passing an explicit SSL context.

    Despite the name, urllib's default HTTPS handling (which verifies
    certificates) is used here; the name is kept for compatibility.
    """
    _fetch_to_file(url, temp_path)


def cleanup_temp_file(file_path: str) -> None:
    """
    Clean up temporary file if it exists.

    Only files located inside the system temp directory are removed; cleanup
    is best-effort and never raises for filesystem errors.

    Args:
        file_path: Path to temporary file
    """
    if not file_path:
        return

    try:
        temp_root = os.path.abspath(tempfile.gettempdir())
        candidate = os.path.abspath(file_path)
        # Compare whole path components so "/tmpfoo/x" is not treated as
        # living under "/tmp".
        if os.path.commonpath([temp_root, candidate]) != temp_root:
            return
        if os.path.exists(candidate):
            os.remove(candidate)
    except (OSError, ValueError):
        # Ignore cleanup errors (ValueError: paths on different drives)
        pass


def _validate_conversion_params(
    diffusion_steps: int,
    length_adjust: float,
    inference_cfg_rate: float,
    pitch_shift: int,
) -> None:
    """Raise ValueError if any numeric conversion parameter is out of range."""
    if diffusion_steps < 1 or diffusion_steps > 50:
        raise ValueError("diffusion_steps must be between 1 and 50")
    if length_adjust <= 0:
        raise ValueError("length_adjust must be positive")
    if not 0 <= inference_cfg_rate <= 1:
        raise ValueError("inference_cfg_rate must be between 0 and 1")
    if pitch_shift < -12 or pitch_shift > 12:
        raise ValueError("pitch_shift must be between -12 and 12 semitones")


def _save_file_reference(item: object, output_path: Path) -> bool:
    """Store *item* at *output_path* if it references a file.

    Handles objects exposing a ``url`` attribute, dicts carrying ``url`` or
    ``path`` keys, and plain local path strings.

    Returns:
        True if the file was stored, False if *item* is not a usable reference
    """
    if isinstance(item, dict):
        url = item.get("url")
        local_path = item.get("path")
    else:
        url = getattr(item, "url", None)
        local_path = item

    if url:
        download_audio_from_url(url, str(output_path))
        return True
    if isinstance(local_path, str) and os.path.exists(local_path):
        shutil.copy2(local_path, output_path)
        return True
    return False


def _save_result(result: object, output_path: Path) -> None:
    """Persist the value returned by the Seed-VC prediction to *output_path*."""
    if _save_file_reference(result, output_path):
        return

    if isinstance(result, (tuple, list)):
        if not result:
            raise RuntimeError("Seed-VC returned an empty result")
        # Only use the second item if multiple outputs are returned
        item = result[1] if len(result) > 1 else result[0]
        if not _save_file_reference(item, output_path):
            raise RuntimeError(f"Unexpected result format in tuple: {item}")
        return

    # Result is audio data - save it directly
    import soundfile as sf

    sf.write(str(output_path), result, 22050)


def _describe_failure(error_msg: str) -> str:
    """Map a raw error message to the user-facing failure description."""
    # Word boundaries keep digits inside timestamps/paths from being read as
    # HTTP status codes.
    if re.search(r"\b403\b", error_msg) or "Forbidden" in error_msg:
        return (
            "Seed-VC access denied. This may indicate:\n"
            "1. Files are in unsupported format\n"
            "2. Files are too large\n"
            "3. Temporary space restrictions\n"
            "4. Authentication required\n\n"
            "TROUBLESHOOTING:\n"
            "• Try different audio files (WAV, MP3, FLAC, M4A)\n"
            "• Use smaller files (< 30MB recommended)\n"
            "• Check if files are corrupted\n"
            "• Try again later if rate limited\n"
            "• Consider using a different voice source/target"
        )
    if re.search(r"\b404\b", error_msg) or "Not Found" in error_msg:
        return (
            "Seed-VC cannot find one or both files. "
            "Check if:\n"
            "• Files exist and are accessible\n"
            "• File paths are correct\n"
            "• Files are in supported format (WAV, MP3, FLAC, M4A)\n"
            "• Manual download was successful"
        )
    if "timeout" in error_msg.lower():
        return (
            "Seed-VC connection timeout. "
            "Try:\n"
            "• Using fewer diffusion steps (5-10)\n"
            "• Smaller audio files\n"
            "• Processing again later\n"
            "• Checking internet connection"
        )
    return f"Voice replacement failed: {error_msg}"


def replace_voice(
    source_audio_path: str,
    target_audio_path: str,
    diffusion_steps: int = 10,
    length_adjust: float = 1.0,
    inference_cfg_rate: float = 0.7,
    f0_condition: bool = False,
    auto_f0_adjust: bool = True,
    pitch_shift: int = 0,
) -> str:
    """
    Replace voice in source audio with voice from target audio using Seed-VC.

    This function uses Seed-VC Gradio space to perform voice conversion,
    replacing voice characteristics in source audio with those from target
    audio while preserving linguistic content and timing.

    Examples:
        >>> replace_voice("source.wav", "target.wav")
        # Returns 'path/to/source_voice_replaced_by_target_20251126_143022.wav'

        >>> replace_voice("https://example.com/source.wav", "target.wav", diffusion_steps=15)
        # Downloads source audio and replaces voice with target voice

        >>> replace_voice("source.wav", "https://example.com/voice.mp3", pitch_shift=2)
        # Downloads target voice and applies to source with pitch shift

    Args:
        source_audio_path: Path to source audio file or URL (voice to be replaced)
            Supports local files and HTTP/HTTPS URLs
        target_audio_path: Path to target audio file or URL (voice to use)
            Supports local files and HTTP/HTTPS URLs
        diffusion_steps: Number of diffusion steps for inference (default: 10)
        length_adjust: Length adjustment factor (default: 1.0)
        inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
        f0_condition: Whether to use F0 conditioning (default: False)
        auto_f0_adjust: Whether to auto-adjust F0 (default: True)
        pitch_shift: Pitch shift in semitones (default: 0)

    Returns:
        Path to generated voice-replaced audio file

    Raises:
        RuntimeError: For every failure. Invalid parameters, missing input
            files, download errors and Seed-VC errors are all re-raised as
            RuntimeError with the original exception chained as the cause.
    """
    source_temp_file = None
    target_temp_file = None

    try:
        # Validate parameters first so bad input fails before any download
        _validate_conversion_params(
            diffusion_steps, length_adjust, inference_cfg_rate, pitch_shift
        )

        # Resolve input paths (handle both URLs and local files). Each
        # download is tracked immediately so it is cleaned up even if a
        # later step raises.
        source_abs_path = resolve_audio_path(source_audio_path)
        if source_audio_path.startswith(_URL_PREFIXES):
            source_temp_file = source_abs_path

        target_abs_path = resolve_audio_path(target_audio_path)
        if target_audio_path.startswith(_URL_PREFIXES):
            target_temp_file = target_abs_path

        # Initialize Seed-VC client with manual file handling
        client = Client("frascuchon/Seed-VC", download_files=False)

        # Perform voice replacement
        result = client.predict(
            source_audio_path=handle_file(source_abs_path),
            target_audio_path=handle_file(target_abs_path),
            diffusion_steps=diffusion_steps,
            length_adjust=length_adjust,
            inference_cfg_rate=inference_cfg_rate,
            f0_condition=f0_condition,
            auto_f0_adjust=auto_f0_adjust,
            pitch_shift=pitch_shift,
            api_name="/predict_1",
        )

        # Create output directory
        output_dir = Path(DEFAULT_TEMP_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Generate output filename with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        source_name = Path(source_abs_path).stem
        target_name = Path(target_abs_path).stem
        output_path = output_dir / (
            f"{source_name}_voice_replaced_by_{target_name}_{timestamp}.wav"
        )

        _save_result(result, output_path)
        return str(output_path)

    except Exception as e:
        # Translate known Seed-VC failure modes into actionable messages
        raise RuntimeError(_describe_failure(str(e))) from e

    finally:
        # Always clean up temporary files
        if source_temp_file:
            cleanup_temp_file(source_temp_file)
        if target_temp_file:
            cleanup_temp_file(target_temp_file)


def replace_voice_wrapper(
    source_audio_path: str,
    target_audio_path: str,
    diffusion_steps: int = 10,
    length_adjust: float = 1.0,
    inference_cfg_rate: float = 0.7,
    f0_condition: bool = False,
    auto_f0_adjust: bool = True,
    pitch_shift: int = 0,
) -> str:
    """
    Wrapper function for voice replacement with error handling for MCP integration.

    Args:
        source_audio_path: Path to input audio file or URL
        target_audio_path: Path to target audio file or URL
        diffusion_steps: Number of diffusion steps (default: 10)
        length_adjust: Length adjustment factor (default: 1.0)
        inference_cfg_rate: CFG rate (default: 0.7)
        f0_condition: Use F0 conditioning (default: False)
        auto_f0_adjust: Auto-adjust F0 (default: True)
        pitch_shift: Pitch shift in semitones (default: 0)

    Returns:
        Path to generated audio file or error message

    Note for URL usage:
        Some URLs may be blocked by Seed-VC space restrictions. If URL
        processing fails with access errors, try:
        1. Download the file manually using your browser
        2. Save it locally and use the local file path
        3. Use a different audio source or target
    """
    try:
        return replace_voice(
            source_audio_path=source_audio_path,
            target_audio_path=target_audio_path,
            diffusion_steps=diffusion_steps,
            length_adjust=length_adjust,
            inference_cfg_rate=inference_cfg_rate,
            f0_condition=f0_condition,
            auto_f0_adjust=auto_f0_adjust,
            pitch_shift=pitch_shift,
        )
    except Exception as e:
        # MCP callers expect a string result rather than an exception
        return f"Error: {str(e)}"


def main(argv: Optional[list] = None) -> int:
    """Run the voice replacement CLI.

    Usage:
        python tools/voice_replacement.py source.wav target.wav
        python tools/voice_replacement.py source.wav target.wav --steps 15 --pitch 2
        python tools/voice_replacement.py https://example.com/source.wav target.wav
        python tools/voice_replacement.py source.wav https://example.com/target.mp3 --pitch 2

    Args:
        argv: Argument list to parse (defaults to ``sys.argv[1:]``)

    Returns:
        Process exit code: 0 on success, 1 on failure
    """
    parser = argparse.ArgumentParser(
        description="Voice replacement using Seed-VC",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python tools/voice_replacement.py source.wav target.wav
  python tools/voice_replacement.py source.wav target.wav --steps 15 --pitch 2
  python tools/voice_replacement.py source.wav target.wav --f0-condition --no-auto-f0
  python tools/voice_replacement.py https://example.com/source.wav target.wav
  python tools/voice_replacement.py source.wav https://example.com/target.mp3 --pitch 2
        """,
    )

    parser.add_argument(
        "source", help="Source audio path or URL (voice to be replaced)"
    )
    parser.add_argument("target", help="Target audio path or URL (voice to use)")
    parser.add_argument(
        "--steps", type=int, default=10, help="Diffusion steps (1-50, default: 10)"
    )
    parser.add_argument(
        "--length",
        type=float,
        default=1.0,
        help="Length adjustment (0.1-3.0, default: 1.0)",
    )
    parser.add_argument(
        "--cfg",
        type=float,
        default=0.7,
        help="Inference CFG rate (0.0-1.0, default: 0.7)",
    )
    parser.add_argument(
        "--f0-condition", action="store_true", help="Enable F0 conditioning"
    )
    parser.add_argument(
        "--no-auto-f0", action="store_true", help="Disable auto F0 adjustment"
    )
    parser.add_argument(
        "--pitch",
        type=int,
        default=0,
        help="Pitch shift semitones (-12 to 12, default: 0)",
    )

    args = parser.parse_args(argv)

    print("Voice Replacement Tool")
    print("=" * 30)
    print(f"Source: {args.source}")
    print(f"Target: {args.target}")
    print(f"Parameters: steps={args.steps}, length={args.length}, cfg={args.cfg}")
    print(
        f"F0 condition={args.f0_condition}, auto F0={not args.no_auto_f0}, pitch={args.pitch}"
    )
    print()

    try:
        result = replace_voice(
            source_audio_path=args.source,
            target_audio_path=args.target,
            diffusion_steps=args.steps,
            length_adjust=args.length,
            inference_cfg_rate=args.cfg,
            f0_condition=args.f0_condition,
            auto_f0_adjust=not args.no_auto_f0,
            pitch_shift=args.pitch,
        )
    except Exception as e:
        print(f"❌ Error: {e}")
        return 1

    print("✅ Voice replacement completed!")
    print(f"Output saved to: {result}")
    return 0


if __name__ == "__main__":
    # Add parent directory to path for imports
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    sys.exit(main())