Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import os | |
| import subprocess | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Optional | |
| import librosa | |
| import numpy as np | |
| import soundfile as sf | |
def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
    """Read an audio file at its native sample rate.

    Args:
        audio_path: Path of the file handed to ``librosa.load``.
        mono: Forwarded to ``librosa.load``; when False the channels are kept.

    Returns:
        ``(samples, sample_rate)``. A two-row multi-dimensional result is
        transposed before being returned, so two-channel audio comes back as
        ``(n_samples, 2)``; every other array is returned exactly as librosa
        produced it.
    """
    samples, rate = librosa.load(
        audio_path, sr=None, mono=mono, res_type="soxr_vhq"
    )
    # Only a two-row, multi-dimensional array is flipped to (samples, channels).
    has_two_rows = samples.ndim > 1 and samples.shape[0] == 2
    if has_two_rows and not mono:
        samples = samples.T
    return samples, int(rate)
def _as_stereo_rows(samples: np.ndarray) -> np.ndarray:
    """Return *samples* laid out as a ``(2, n_samples)`` stereo array.

    Accepts a 1-D mono signal (duplicated onto both channels), the
    ``(n_samples, 2)`` layout that ``_load_audio`` returns for stereo files,
    or an array that is already ``(2, n_samples)``.

    Raises:
        ValueError: If the array is not mono or two-channel audio.
    """
    if samples.ndim == 1:
        return np.stack([samples, samples])
    if samples.ndim != 2:
        raise ValueError(f"Unsupported audio array shape: {samples.shape}")
    if samples.shape[1] == 2:
        # Stereo delivered samples-first; flip so that axis 1 is time.
        samples = samples.T
    if samples.shape[0] == 1:
        return np.stack([samples[0], samples[0]])
    if samples.shape[0] != 2:
        raise ValueError(
            f"Only mono or stereo input is supported, got shape {samples.shape}"
        )
    return samples


def combine_tracks(
    track1_path: str,
    track2_path: str,
    weight1: float = 0.5,
    weight2: float = 0.5,
    output_path: Optional[str] = None,
    normalize: bool = True,
    fade_in: float = 0.0,
    fade_out: float = 0.0,
    output_format: str = "wav",
) -> str:
    """
    Combine two audio tracks into a new single stereo audio track with adjustable mixing weights.

    This function mixes two audio files together with customizable balance, normalization,
    and fade effects. Useful for creating mashups, adding background music to vocals,
    or layering multiple audio sources.

    Args:
        track1_path: Path to first audio file
        track2_path: Path to second audio file
        weight1: Multiplier applied to the first track before summing (default: 0.5)
            1.0 = full volume, 0.5 = half volume, 0.0 = silent
        weight2: Multiplier applied to the second track before summing (default: 0.5)
            1.0 = full volume, 0.5 = half volume, 0.0 = silent
        output_path: Optional output *directory*; it is created if missing and the
            result is written inside it as ``stereo_combined.<format>``
            (default: a freshly created temporary directory)
        normalize: Whether to scale the mix so its peak is 0.95 (default: True)
        fade_in: Fade in duration in seconds (default: 0.0)
        fade_out: Fade out duration in seconds (default: 0.0)
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the combined audio file

    Raises:
        RuntimeError: If loading, mixing, or writing fails; the original
            exception is attached as ``__cause__``.

    Examples:
        - weight1=0.8, weight2=0.2: First track dominates the mix
        - weight1=0.5, weight2=0.5: Equal balance between tracks
        - weight1=1.0, weight2=0.3: First track at full volume, second track quiet
        - fade_in=2.0, fade_out=3.0: Gradual volume increase and decrease

    Note:
        The second track is resampled to the first track's sample rate
        Tracks of different lengths are padded with silence to match the longer one
        Fades longer than the mix are clamped to the length of the mix
        Audio is written as 16-bit PCM; 'mp3' is produced by converting an
        intermediate WAV with ffmpeg (libmp3lame, 192k). Any other
        output_format value is written as WAV data under that extension.
    """
    try:
        # Load both audio files
        y1, sr1 = _load_audio(track1_path, mono=False)
        y2, sr2 = _load_audio(track2_path, mono=False)

        # Bring both tracks to the same (2, n_samples) layout. Without this,
        # stereo input (samples-first) and mono input (stacked channels-first)
        # would be processed along different axes.
        y1 = _as_stereo_rows(y1)
        y2 = _as_stereo_rows(y2)

        # Ensure same sample rate (resample runs along the last axis = time)
        if sr1 != sr2:
            y2 = librosa.resample(y2, orig_sr=sr2, target_sr=sr1)

        # Ensure same length
        max_length = max(y1.shape[1], y2.shape[1])
        if y1.shape[1] < max_length:
            y1 = np.pad(y1, ((0, 0), (0, max_length - y1.shape[1])), mode="constant")
        if y2.shape[1] < max_length:
            y2 = np.pad(y2, ((0, 0), (0, max_length - y2.shape[1])), mode="constant")

        # Apply weights and combine
        combined = weight1 * y1 + weight2 * y2

        # Apply fade in/out if specified; clamp to the mix length so the
        # fade curve always matches the slice it multiplies.
        if fade_in > 0:
            fade_samples = min(int(fade_in * sr1), max_length)
            if fade_samples > 0:
                combined[:, :fade_samples] *= np.linspace(0, 1, fade_samples)
        if fade_out > 0:
            fade_samples = min(int(fade_out * sr1), max_length)
            if fade_samples > 0:
                combined[:, -fade_samples:] *= np.linspace(1, 0, fade_samples)

        # Normalize if requested (skip empty audio: np.max rejects it)
        if normalize and combined.size:
            max_val = np.max(np.abs(combined))
            if max_val > 0:
                combined = combined / max_val * 0.95

        # Resolve the output directory
        if output_path:
            os.makedirs(output_path, exist_ok=True)
            output_dir = output_path
        else:
            output_dir = tempfile.mkdtemp(suffix="_combined")

        extension = output_format.lower()
        final_audio_filename = os.path.join(
            output_dir, f"stereo_combined.{extension}"
        )

        if extension == "mp3":
            # For MP3, save as WAV first then convert. The handle is closed
            # before writing so the path can be reopened on every platform.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
                temp_wav_path = handle.name
            try:
                sf.write(
                    temp_wav_path, combined.T, sr1, format="wav", subtype="PCM_16"
                )
                cmd = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    temp_wav_path,
                    "-c:a",
                    "libmp3lame",
                    "-b:a",
                    "192k",
                    final_audio_filename,
                ]
                subprocess.run(cmd, capture_output=True, check=True)
            finally:
                # Remove the intermediate file even when ffmpeg fails
                os.unlink(temp_wav_path)
        else:
            sf.write(
                final_audio_filename, combined.T, sr1, format="wav", subtype="PCM_16"
            )
        return final_audio_filename
    except Exception as e:
        raise RuntimeError(f"Error combining tracks: {str(e)}") from e
def create_stereo_mix(
    left_track_path: str,
    right_track_path: str,
    output_path: Optional[str] = None,
    normalize: bool = True,
    output_format: str = "wav",
) -> str:
    """
    Create a stereo track with one track in left channel and another in right channel.

    Each input is loaded as mono at its native sample rate; the right track is
    resampled to the left track's rate when they differ, and the shorter track
    is padded with silence.

    Args:
        left_track_path: Path to audio file for left channel
        right_track_path: Path to audio file for right channel
        output_path: Optional output *directory*; it is created if missing and the
            result is written inside it as ``stereo_mix.<format>``
            (default: a freshly created temporary directory)
        normalize: Whether to scale the result so its peak is 0.95
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the stereo audio file

    Raises:
        RuntimeError: If loading, mixing, or writing fails; the original
            exception is attached as ``__cause__``.
    """
    try:
        # Load both files as mono through the shared loader (sr=None), so the
        # native sample rate is kept and the resample step below is meaningful.
        y_left, sr_left = _load_audio(left_track_path, mono=True)
        y_right, sr_right = _load_audio(right_track_path, mono=True)

        # Ensure same sample rate
        if sr_left != sr_right:
            y_right = librosa.resample(y_right, orig_sr=sr_right, target_sr=sr_left)

        # Ensure same length
        max_length = max(len(y_left), len(y_right))
        if len(y_left) < max_length:
            y_left = np.pad(y_left, (0, max_length - len(y_left)), mode="constant")
        if len(y_right) < max_length:
            y_right = np.pad(y_right, (0, max_length - len(y_right)), mode="constant")

        # Create stereo array with shape (2, n_samples)
        stereo = np.array([y_left, y_right])

        # Normalize if requested (skip empty audio: np.max rejects it)
        if normalize and stereo.size:
            max_val = np.max(np.abs(stereo))
            if max_val > 0:
                stereo = stereo / max_val * 0.95

        # Resolve the output directory
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_combined")
        else:
            os.makedirs(output_path, exist_ok=True)

        extension = output_format.lower()
        final_audio_filename = os.path.join(output_path, f"stereo_mix.{extension}")

        if extension == "mp3":
            # For MP3, save as WAV first then convert. The handle is closed
            # before writing so the path can be reopened on every platform.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
                temp_wav_path = handle.name
            try:
                sf.write(
                    temp_wav_path, stereo.T, sr_left, format="wav", subtype="PCM_16"
                )
                cmd = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    temp_wav_path,
                    "-c:a",
                    "libmp3lame",
                    "-b:a",
                    "192k",
                    final_audio_filename,
                ]
                subprocess.run(cmd, capture_output=True, check=True)
            finally:
                # Remove the intermediate file even when ffmpeg fails
                os.unlink(temp_wav_path)
        else:
            sf.write(
                final_audio_filename, stereo.T, sr_left, format="wav", subtype="PCM_16"
            )
        return final_audio_filename
    except Exception as e:
        raise RuntimeError(f"Error creating stereo mix: {str(e)}") from e
def create_medley(
    vocals_path: str,
    instrumental_path: str,
    vocals_gain: float = 1.2,
    instrumental_gain: float = 0.9,
    compressor: str = "threshold=-18dB:ratio=3:attack=50:release=200",
    audio_codec: str = "libmp3lame",
    audio_bitrate: str = "192k",
    output_path: Optional[str] = None,
) -> str:
    """
    Mix a vocal stem with an instrumental stem using ffmpeg.

    The vocals are gain-adjusted and placed on both channels (taken from their
    first channel), the instrumental is gain-adjusted, the two are summed with
    ffmpeg's ``amix`` and the result is passed through ``acompressor``.

    Args:
        vocals_path: Path to the vocals stem file
        instrumental_path: Path to the instrumental stem file
        vocals_gain: Linear gain applied to vocals stem (default: 1.2)
            1.0 = unity gain, 1.2 = 20% boost, 0.8 = 20% reduction
        instrumental_gain: Linear gain applied to instrumental stem (default: 0.9)
            1.0 = unity gain, 0.9 = 10% reduction, 1.1 = 10% boost
        compressor: Parameter string passed verbatim to ffmpeg's ``acompressor``
            filter (default: "threshold=-18dB:ratio=3:attack=50:release=200")
            Format: "threshold=X:ratio=Y:attack=Z:release=W"
        audio_codec: Target audio codec (default: "libmp3lame")
            Options: "libmp3lame" (MP3), "aac" (AAC), "flac" (FLAC),
            "pcm_s16le" or "wav" (WAV)
        audio_bitrate: Audio bitrate, only passed to ffmpeg for "libmp3lame"
            and "aac" (default: "192k")
        output_path: Optional output file path. When omitted, the file is
            created inside a new temporary directory with an extension chosen
            from audio_codec (mp3, m4a, flac, otherwise wav).

    Returns:
        Path to the rendered medley file

    Raises:
        FileNotFoundError: If either stem does not exist.
        RuntimeError: If ffmpeg exits with a non-zero status.

    Examples:
        >>> create_medley("vocals.wav", "instrumental.wav")
        # Returns path to medley with default settings
        >>> create_medley("vocals.wav", "instrumental.wav", vocals_gain=1.5, instrumental_gain=0.8)
        # Returns path to medley with boosted vocals and reduced instrumental
        >>> create_medley("vocals.wav", "instrumental.wav",
        ...               compressor="threshold=-12dB:ratio=4:attack=20:release=150")
        # Returns path to medley with aggressive compression
        >>> create_medley("vocals.wav", "instrumental.wav", audio_codec="flac")
        # Returns path to a FLAC medley

    Note:
        Output lasts as long as the longer input (amix duration=longest)
        Output is forced to 48 kHz stereo
        The temporary directory created when output_path is omitted is NOT
        removed by this function; the caller owns the returned file
    """
    vocals = Path(vocals_path).expanduser().resolve()
    instrumental = Path(instrumental_path).expanduser().resolve()
    if not vocals.exists():
        raise FileNotFoundError(f"Vocals stem not found: {vocals}")
    if not instrumental.exists():
        raise FileNotFoundError(f"Instrumental stem not found: {instrumental}")

    # Pick a container that can actually hold the requested codec; unknown
    # codecs keep the previous "wav" fallback.
    extension_by_codec = {
        "libmp3lame": "mp3",
        "aac": "m4a",
        "flac": "flac",
        "pcm_s16le": "wav",
        "wav": "wav",
    }
    medley_extension = extension_by_codec.get(audio_codec, "wav")

    if output_path is None:
        tmp_dir = tempfile.mkdtemp(prefix="mcp-medley-")
        output = (
            Path(tmp_dir)
            / f"{vocals.name}_{instrumental.name}_medley.{medley_extension}"
        )
    else:
        output = Path(output_path).expanduser().resolve()
        output.parent.mkdir(parents=True, exist_ok=True)

    # Vocals: apply gain, then copy input channel 0 to both output channels.
    # Instrumental: apply gain, keep its own channel layout.
    # NOTE: the amix weights are fixed at '1.2 0.8' regardless of the gain
    # arguments, so they act in addition to vocals_gain/instrumental_gain.
    filter_complex = (
        f"[0:a]volume={vocals_gain},"
        f"pan=stereo|c0=c0|c1=c0[v0];"
        f"[1:a]volume={instrumental_gain}[v1];"
        f"[v0][v1]amix=inputs=2:duration=longest:dropout_transition=2:weights='1.2 0.8',"
        f"acompressor={compressor}"
    )
    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        str(vocals),
        "-i",
        str(instrumental),
        "-filter_complex",
        filter_complex,
        "-c:a",
        "pcm_s16le" if audio_codec == "wav" else audio_codec,
        "-ar",
        "48000",  # Ensure sample rate consistency
        "-ac",
        "2",  # Ensure stereo output
    ]
    if audio_bitrate and audio_codec in ["libmp3lame", "aac"]:
        cmd += ["-b:a", audio_bitrate]
    cmd.append(str(output))

    completed = subprocess.run(cmd, capture_output=True, text=True)
    if completed.returncode != 0:
        raise RuntimeError(
            f"ffmpeg failed ({completed.returncode}):\n"
            f"STDOUT:\n{completed.stdout}\nSTDERR:\n{completed.stderr}"
        )

    # Report the channel layout of the rendered file (best-effort, never raises)
    verify_stereo_output(str(output))
    return str(output)
def verify_stereo_output(output_path: str) -> None:
    """Print the channel count, duration and stereo flag of an audio file.

    This is a best-effort diagnostic: any failure while reading the file is
    reported with a warning line instead of being raised.

    Args:
        output_path: Path of the audio file to inspect with ``soundfile``.
    """
    try:
        y, sr = sf.read(output_path)
        # soundfile returns a 1-D array for single-channel files, so the
        # channel count cannot be read from shape[1] unconditionally.
        channels = y.shape[1] if y.ndim > 1 else 1
        is_stereo = channels == 2
        duration = len(y) / sr
        print(
            f"✅ Output verification: {channels} channels, {duration:.2f}s, stereo: {is_stereo}"
        )
    except Exception as e:
        print(f"⚠️ Could not verify output: {e}")
def _build_cli_parser():
    """Build the argparse parser with the combine/stereo/medley sub-commands."""
    import argparse

    parser = argparse.ArgumentParser(description="Combine audio tracks")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Combine tracks with weights
    combine_parser = subparsers.add_parser(
        "combine", help="Combine two tracks with weights"
    )
    combine_parser.add_argument("track1", help="Path to first audio file")
    combine_parser.add_argument("track2", help="Path to second audio file")
    combine_parser.add_argument(
        "--weight1", type=float, default=0.5, help="Weight for first track (0.0-1.0)"
    )
    combine_parser.add_argument(
        "--weight2", type=float, default=0.5, help="Weight for second track (0.0-1.0)"
    )
    combine_parser.add_argument(
        "--fade-in", type=float, default=0.0, help="Fade in duration in seconds"
    )
    combine_parser.add_argument(
        "--fade-out", type=float, default=0.0, help="Fade out duration in seconds"
    )
    combine_parser.add_argument(
        "--no-normalize", action="store_true", help="Disable normalization"
    )
    # combine_tracks treats output_path as a directory, so say so in the help.
    combine_parser.add_argument(
        "--output",
        type=str,
        default="output",
        help="Output directory, created if missing (default: output)",
    )
    combine_parser.add_argument(
        "--format",
        default="wav",
        choices=["wav", "mp3"],
        help="Output format (default: wav)",
    )

    # Create stereo mix
    stereo_parser = subparsers.add_parser(
        "stereo", help="Create stereo mix (left/right channels)"
    )
    stereo_parser.add_argument("left", help="Path to left channel audio file")
    stereo_parser.add_argument("right", help="Path to right channel audio file")
    stereo_parser.add_argument(
        "--no-normalize", action="store_true", help="Disable normalization"
    )
    # create_stereo_mix treats output_path as a directory as well.
    stereo_parser.add_argument(
        "--output",
        type=str,
        default="stereo_output",
        help="Output directory, created if missing (default: stereo_output)",
    )
    stereo_parser.add_argument(
        "--format",
        default="wav",
        choices=["wav", "mp3"],
        help="Output format (default: wav)",
    )

    # Create medley
    medley_parser = subparsers.add_parser(
        "medley", help="Create a vocal/instrumental medley using ffmpeg"
    )
    medley_parser.add_argument("vocals", help="Path to vocals stem audio file")
    medley_parser.add_argument(
        "instrumental", help="Path to instrumental stem audio file"
    )
    # NOTE(review): these CLI gain defaults (0.6 / 1.2) differ from the
    # create_medley() function defaults (1.2 / 0.9) — confirm this is intended.
    medley_parser.add_argument(
        "--vocals-gain",
        type=float,
        default=0.6,
        help="Linear gain for vocals (default: 0.6)",
    )
    medley_parser.add_argument(
        "--instrumental-gain",
        type=float,
        default=1.2,
        help="Linear gain for instrumental (default: 1.2)",
    )
    medley_parser.add_argument(
        "--compressor",
        type=str,
        default="threshold=-18dB:ratio=3:attack=50:release=200",
        help="FFmpeg acompressor parameters (default: threshold=-18dB:ratio=3:attack=50:release=200)",
    )
    medley_parser.add_argument(
        "--audio-codec",
        type=str,
        default="libmp3lame",
        help="Target audio codec (default: libmp3lame)",
    )
    medley_parser.add_argument(
        "--audio-bitrate",
        type=str,
        default="192k",
        help="Audio bitrate (default: 192k)",
    )
    medley_parser.add_argument(
        "--output", type=str, help="Output file path (default: temporary file)"
    )
    return parser


def main(argv: Optional[list[str]] = None) -> int:
    """Run the command-line interface.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit status: 0 on success (or when only the help text was
        printed because no sub-command was given), 1 when the selected
        command raised an exception.
    """
    import sys

    parser = _build_cli_parser()
    args = parser.parse_args(argv)
    try:
        if args.command == "combine":
            output = combine_tracks(
                args.track1,
                args.track2,
                weight1=args.weight1,
                weight2=args.weight2,
                normalize=not args.no_normalize,
                fade_in=args.fade_in,
                fade_out=args.fade_out,
                output_path=args.output,
                output_format=args.format,
            )
            print(f"Combined audio saved to: {output}")
        elif args.command == "stereo":
            output = create_stereo_mix(
                args.left,
                args.right,
                normalize=not args.no_normalize,
                output_path=args.output,
                output_format=args.format,
            )
            print(f"Stereo mix saved to: {output}")
        elif args.command == "medley":
            output = create_medley(
                args.vocals,
                args.instrumental,
                output_path=args.output,
                vocals_gain=args.vocals_gain,
                instrumental_gain=args.instrumental_gain,
                compressor=args.compressor,
                audio_codec=args.audio_codec,
                audio_bitrate=args.audio_bitrate,
            )
            print(f"Medley saved to: {output}")
        else:
            parser.print_help()
    except Exception as e:
        # Top-level boundary: report on stderr and signal failure to the shell.
        print(f"Error: {e}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    import sys

    # sys.exit is used instead of the site-provided exit() helper, which is
    # not guaranteed to exist in every interpreter configuration.
    sys.exit(main())