Spaces:

frascuchon
/

music-mcp

Running on CPU Upgrade

App Files Files Community

frascuchon HF Staff committed 12 days ago

Commit

f62bfdb

1 Parent(s): 2c6a090

fixing tools

Browse files

Files changed (5) hide show

tools/audio_cleaning.py +125 -25
tools/audio_cutting.py +4 -1
tools/audio_insertion.py +66 -14
tools/stems_separation.py +164 -142
tools/voice_replacement.py +4 -4

tools/audio_cleaning.py CHANGED Viewed

@@ -11,6 +11,9 @@ from scipy.signal import butter, lfilter, filtfilt
 def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
     """Load audio file with standard settings."""
     y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
     return y, int(sr)
@@ -25,6 +28,20 @@ def detect_noise_profile(audio: np.ndarray, sample_rate: int) -> dict:
     Returns:
         Dictionary with noise profile information
     """
     # Compute spectral features for noise detection
     stft = librosa.stft(audio, n_fft=2048, hop_length=512)
     magnitude = np.abs(stft)
@@ -35,11 +52,11 @@ def detect_noise_profile(audio: np.ndarray, sample_rate: int) -> dict:
     # Detect steady noise (consistent low-frequency content)
     freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=2048)
     low_freq_mask = freqs < 200  # Below 200 Hz
-    steady_noise = np.mean(magnitude[:, low_freq_mask], axis=1)
     # Detect hiss (high frequency noise)
     high_freq_mask = freqs > 4000  # Above 4 kHz
-    hiss_level = np.mean(magnitude[:, high_freq_mask], axis=1)
     # Compute overall noise characteristics
     signal_power = np.mean(magnitude**2, axis=1)
@@ -48,11 +65,12 @@ def detect_noise_profile(audio: np.ndarray, sample_rate: int) -> dict:
     return {
         "noise_floor": float(noise_floor),
-        "steady_noise": float(steady_noise),
-        "hiss_level": float(hiss_level),
-        "snr_estimate": float(snr_estimate),
         "has_significant_noise": bool(
-            steady_noise > noise_floor * 2 or hiss_level > noise_floor * 1.5
         ),
     }
@@ -71,6 +89,28 @@ def spectral_subtraction(
     Returns:
         Cleaned audio data
     """
     # Compute STFT of audio
     stft = librosa.stft(audio, n_fft=2048, hop_length=512)
     magnitude = np.abs(stft)
@@ -85,7 +125,7 @@ def spectral_subtraction(
     # Reconstruct audio
     cleaned_stft = cleaned_magnitude * np.exp(1j * phase)
-    cleaned_audio = librosa.istft(cleaned_stft, hop_length=512)
     return cleaned_audio
@@ -104,6 +144,24 @@ def adaptive_filter(
     Returns:
         Filtered audio data
     """
     if noise_type == "hiss":
         # High-pass filter to reduce hiss (above 4kHz)
         cutoff = 4000
@@ -197,7 +255,12 @@ def remove_noise(
             # High-pass filter for hiss removal
             cutoff = 4000 - sensitivity * 2000  # 2000-4000 Hz range
             b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
-            filtered_audio = filtfilt(b, a, audio)
         elif noise_type == "hum":
             # Multiple notch filters for harmonics
@@ -217,13 +280,24 @@ def remove_noise(
                                 btype="bandstop",
                                 output="ba",
                             )
-                            filtered_audio = filtfilt(b, a, filtered_audio)
         elif noise_type == "rumble":
             # High-pass filter for rumble removal
             cutoff = 20 + sensitivity * 80  # 20-100 Hz range
             b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
-            filtered_audio = filtfilt(b, a, audio)
         else:  # background or general
             # General noise reduction
@@ -233,9 +307,10 @@ def remove_noise(
             strength = 0.2 + sensitivity * 0.6
             filtered_audio = (1 - strength) * filtered_audio + strength * audio
-        # Normalize output
         max_val = np.max(np.abs(filtered_audio))
-        if max_val > 0:
             filtered_audio = filtered_audio / max_val * 0.95
         # Save output
@@ -244,13 +319,38 @@ def remove_noise(
         else:
             os.makedirs(output_path, exist_ok=True)
-        # Generate output filename
         input_filename = os.path.splitext(os.path.basename(audio_path))[0]
-        output_filename = f"{input_filename}_{noise_type}_removed.{output_format}"
         output_file = os.path.join(output_path, output_filename)
-        # Save processed audio
-        sf.write(output_file, filtered_audio.T, sample_rate)
         return output_file
@@ -278,7 +378,7 @@ def remove_noise_wrapper(audio_path: str, noise_reduction_factor: float = 0.5) -
 if __name__ == "__main__":
     """
     Script section for running audio cleaning locally.
     Usage:
         python tools/audio_cleaning.py input.wav
         python tools/audio_cleaning.py input.wav --reduction 0.7
@@ -317,16 +417,16 @@ Examples:
     print()
     try:
-        result = remove_noise_wrapper(
-            audio_path=args.audio_path, noise_reduction_factor=args.reduction
         )
-        if result.startswith("Error:"):
-            print(f"❌ {result}")
-            sys.exit(1)
-        else:
-            print("✅ Audio cleaning completed!")
-            print(f"Output saved to: {result}")
     except Exception as e:
         print(f"❌ Error: {e}")

 def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
     """Load audio file with standard settings."""
     y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
+    # Ensure shape is (samples, channels) for stereo audio
+    if not mono and y.ndim > 1 and y.shape[0] == 2:
+        y = y.T
     return y, int(sr)
     Returns:
         Dictionary with noise profile information
     """
+    # Convert to mono for analysis if stereo
+    if audio.ndim > 1:
+        audio = np.mean(audio, axis=1)
+    # Ensure audio is long enough for STFT
+    if len(audio) < 2048:
+        return {
+            "noise_floor": 0.001,
+            "steady_noise": 0.001,
+            "hiss_level": 0.001,
+            "snr_estimate": 20.0,
+            "has_significant_noise": False,
+        }
     # Compute spectral features for noise detection
     stft = librosa.stft(audio, n_fft=2048, hop_length=512)
     magnitude = np.abs(stft)
     # Detect steady noise (consistent low-frequency content)
     freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=2048)
     low_freq_mask = freqs < 200  # Below 200 Hz
+    steady_noise = np.mean(magnitude[low_freq_mask, :], axis=0)
     # Detect hiss (high frequency noise)
     high_freq_mask = freqs > 4000  # Above 4 kHz
+    hiss_level = np.mean(magnitude[high_freq_mask, :], axis=0)
     # Compute overall noise characteristics
     signal_power = np.mean(magnitude**2, axis=1)
     return {
         "noise_floor": float(noise_floor),
+        "steady_noise": float(np.mean(steady_noise)),
+        "hiss_level": float(np.mean(hiss_level)),
+        "snr_estimate": float(np.mean(snr_estimate)),
         "has_significant_noise": bool(
+            np.mean(steady_noise) > noise_floor * 2
+            or np.mean(hiss_level) > noise_floor * 1.5
         ),
     }
     Returns:
         Cleaned audio data
     """
+    # Handle stereo audio by processing each channel separately
+    if audio.ndim > 1:
+        cleaned_channels = []
+        for channel in range(audio.shape[1]):
+            channel_audio = audio[:, channel]
+            cleaned_channel = _process_channel_spectral_subtraction(
+                channel_audio, noise_profile, sample_rate
+            )
+            cleaned_channels.append(cleaned_channel)
+        return np.column_stack(cleaned_channels)
+    else:
+        return _process_channel_spectral_subtraction(audio, noise_profile, sample_rate)
+def _process_channel_spectral_subtraction(
+    audio: np.ndarray, noise_profile: dict, sample_rate: int
+) -> np.ndarray:
+    """Process a single channel with spectral subtraction."""
+    # Ensure audio is long enough for STFT
+    if len(audio) < 2048:
+        return audio
     # Compute STFT of audio
     stft = librosa.stft(audio, n_fft=2048, hop_length=512)
     magnitude = np.abs(stft)
     # Reconstruct audio
     cleaned_stft = cleaned_magnitude * np.exp(1j * phase)
+    cleaned_audio = librosa.istft(cleaned_stft, hop_length=512, length=len(audio))
     return cleaned_audio
     Returns:
         Filtered audio data
     """
+    # Handle stereo audio by processing each channel separately
+    if audio.ndim > 1:
+        filtered_channels = []
+        for channel in range(audio.shape[1]):
+            channel_audio = audio[:, channel]
+            filtered_channel = _process_channel_adaptive_filter(
+                channel_audio, sample_rate, noise_type
+            )
+            filtered_channels.append(filtered_channel)
+        return np.column_stack(filtered_channels)
+    else:
+        return _process_channel_adaptive_filter(audio, sample_rate, noise_type)
+def _process_channel_adaptive_filter(
+    audio: np.ndarray, sample_rate: int, noise_type: str = "general"
+) -> np.ndarray:
+    """Process a single channel with adaptive filtering."""
     if noise_type == "hiss":
         # High-pass filter to reduce hiss (above 4kHz)
         cutoff = 4000
             # High-pass filter for hiss removal
             cutoff = 4000 - sensitivity * 2000  # 2000-4000 Hz range
             b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
+            if audio.ndim > 1:
+                filtered_audio = np.zeros_like(audio)
+                for channel in range(audio.shape[1]):
+                    filtered_audio[:, channel] = filtfilt(b, a, audio[:, channel])
+            else:
+                filtered_audio = filtfilt(b, a, audio)
         elif noise_type == "hum":
             # Multiple notch filters for harmonics
                                 btype="bandstop",
                                 output="ba",
                             )
+                            if filtered_audio.ndim > 1:
+                                for channel in range(filtered_audio.shape[1]):
+                                    filtered_audio[:, channel] = filtfilt(
+                                        b, a, filtered_audio[:, channel]
+                                    )
+                            else:
+                                filtered_audio = filtfilt(b, a, filtered_audio)
         elif noise_type == "rumble":
             # High-pass filter for rumble removal
             cutoff = 20 + sensitivity * 80  # 20-100 Hz range
             b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
+            if audio.ndim > 1:
+                filtered_audio = np.zeros_like(audio)
+                for channel in range(audio.shape[1]):
+                    filtered_audio[:, channel] = filtfilt(b, a, audio[:, channel])
+            else:
+                filtered_audio = filtfilt(b, a, audio)
         else:  # background or general
             # General noise reduction
             strength = 0.2 + sensitivity * 0.6
             filtered_audio = (1 - strength) * filtered_audio + strength * audio
+        # Skip normalization to preserve original dynamics and pitch
+        # Only normalize if clipping would occur
         max_val = np.max(np.abs(filtered_audio))
+        if max_val > 1.0:
             filtered_audio = filtered_audio / max_val * 0.95
         # Save output
         else:
             os.makedirs(output_path, exist_ok=True)
+        # Generate output filename with timestamp
+        from datetime import datetime
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         input_filename = os.path.splitext(os.path.basename(audio_path))[0]
+        output_filename = (
+            f"{input_filename}_{noise_type}_removed_{timestamp}.{output_format}"
+        )
         output_file = os.path.join(output_path, output_filename)
+        # Save using soundfile's sf.write
+        # audio is kept in (samples, channels) format for saving
+        audio_for_saving = filtered_audio
+        try:
+            # Use soundfile (sf.write) to save - this should preserve pitch correctly
+            sf.write(output_file, audio_for_saving, sample_rate)
+            print("Successfully saved audio file using librosa/soundfile")
+        except Exception as e:
+            print(f"librosa/soundfile failed: {e}")
+            # Try with FLAC format as fallback
+            try:
+                flac_path = output_file.replace(".wav", ".flac")
+                sf.write(flac_path, audio_for_saving, sample_rate, format="FLAC")
+                print(f"Successfully saved as FLAC: {flac_path}")
+                return flac_path
+            except Exception as e2:
+                print(f"FLAC also failed: {e2}")
+                raise RuntimeError("Could not save audio file with any method")
         return output_file
 if __name__ == "__main__":
     """
     Script section for running audio cleaning locally.
     Usage:
         python tools/audio_cleaning.py input.wav
         python tools/audio_cleaning.py input.wav --reduction 0.7
     print()
     try:
+        result = remove_noise(
+            audio_path=args.audio_path,
+            noise_type="general",
+            sensitivity=args.reduction,
+            output_path=args.output or "output",
+            output_format="wav",
         )
+        print("✅ Audio cleaning completed!")
+        print(f"Output saved to: {result}")
     except Exception as e:
         print(f"❌ Error: {e}")

tools/audio_cutting.py CHANGED Viewed

@@ -6,7 +6,10 @@ import librosa
 import numpy as np
 import soundfile as sf
-from .audio_info import validate_audio_path
 def cut_audio(

 import numpy as np
 import soundfile as sf
+try:
+    from .audio_info import validate_audio_path
+except ImportError:
+    from audio_info import validate_audio_path
 def cut_audio(

tools/audio_insertion.py CHANGED Viewed

@@ -10,6 +10,9 @@ import soundfile as sf
 def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
     """Load audio file with standard settings."""
     y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
     return y, int(sr)
@@ -55,11 +58,19 @@ def apply_crossfade(
     # Create crossfade envelope
     fade_in = np.linspace(0, 1, fade_samples)
     fade_out = np.linspace(1, 0, fade_samples)
-    crossfade = fade_in * fade_out
     # Apply crossfade to section end
     section_end = section[-fade_samples:] if len(section) > fade_samples else section
-    section_end[:fade_samples] *= crossfade
     # Insert section into target
     insert_sample = int(len(target) * 0.5)  # Insert at middle
@@ -117,9 +128,20 @@ def insert_section(
         # Resample if needed
         if main_sr != section_sr:
-            section_audio = librosa.resample(
-                section_audio, orig_sr=section_sr, target_sr=main_sr
-            )
         # Calculate timing
         main_duration = len(main_audio) / main_sr
@@ -159,7 +181,7 @@ def insert_section(
         output_file = os.path.join(output_path, output_filename)
         # Save final audio
-        sf.write(output_file, final_audio.T, main_sr)
         return output_file
@@ -223,9 +245,22 @@ def insert_multiple_sections(
             # Resample if needed
             if section_sr != main_sr:
-                section_audio = librosa.resample(
-                    section_audio, orig_sr=section_sr, target_sr=main_sr
-                )
             # Calculate crossfade points
             fade_start, fade_end = detect_crossfade_point(
@@ -259,7 +294,7 @@ def insert_multiple_sections(
         output_file = os.path.join(output_path, output_filename)
         # Save final audio
-        sf.write(output_file, current_audio.T, main_sr)
         return output_file
@@ -327,9 +362,22 @@ def replace_section(
         # Resample replacement if needed
         if replacement_sr != main_sr:
-            replacement_audio = librosa.resample(
-                replacement_audio, orig_sr=replacement_sr, target_sr=main_sr
-            )
         # Trim replacement to specified duration
         replacement_duration = end_time - start_time
@@ -345,10 +393,14 @@ def replace_section(
         # Fade in replacement
         fade_in = np.linspace(0, 1, fade_samples)
         trimmed_replacement[:fade_samples] *= fade_in
         # Fade out at end of replacement
         fade_out = np.linspace(1, 0, fade_samples)
         trimmed_replacement[-fade_samples:] *= fade_out
         # Combine all parts
@@ -366,7 +418,7 @@ def replace_section(
         output_file = os.path.join(output_path, output_filename)
         # Save final audio
-        sf.write(output_file, final_audio.T, main_sr)
         return output_file

 def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
     """Load audio file with standard settings."""
     y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
+    # Ensure consistent (samples, channels) format
+    if not mono and y.ndim > 1 and y.shape[0] == 2:
+        y = y.T
     return y, int(sr)
     # Create crossfade envelope
     fade_in = np.linspace(0, 1, fade_samples)
     fade_out = np.linspace(1, 0, fade_samples)
+    # Handle stereo audio
+    if section.ndim > 1:
+        crossfade = np.outer(fade_in * fade_out, np.ones(section.shape[1]))
+    else:
+        crossfade = fade_in * fade_out
     # Apply crossfade to section end
     section_end = section[-fade_samples:] if len(section) > fade_samples else section
+    if section_end.ndim > 1:
+        section_end[:fade_samples] *= crossfade
+    else:
+        section_end[:fade_samples] *= crossfade
     # Insert section into target
     insert_sample = int(len(target) * 0.5)  # Insert at middle
         # Resample if needed
         if main_sr != section_sr:
+            if section_audio.ndim > 1:
+                # Resample each channel separately
+                section_audio = np.array(
+                    [
+                        librosa.resample(
+                            section_audio[:, ch], orig_sr=section_sr, target_sr=main_sr
+                        )
+                        for ch in range(section_audio.shape[1])
+                    ]
+                ).T
+            else:
+                section_audio = librosa.resample(
+                    section_audio, orig_sr=section_sr, target_sr=main_sr
+                )
         # Calculate timing
         main_duration = len(main_audio) / main_sr
         output_file = os.path.join(output_path, output_filename)
         # Save final audio
+        sf.write(output_file, final_audio, main_sr)
         return output_file
             # Resample if needed
             if section_sr != main_sr:
+                if section_audio.ndim > 1:
+                    # Resample each channel separately
+                    section_audio = np.array(
+                        [
+                            librosa.resample(
+                                section_audio[:, ch],
+                                orig_sr=section_sr,
+                                target_sr=main_sr,
+                            )
+                            for ch in range(section_audio.shape[1])
+                        ]
+                    ).T
+                else:
+                    section_audio = librosa.resample(
+                        section_audio, orig_sr=section_sr, target_sr=main_sr
+                    )
             # Calculate crossfade points
             fade_start, fade_end = detect_crossfade_point(
         output_file = os.path.join(output_path, output_filename)
         # Save final audio
+        sf.write(output_file, current_audio, main_sr)
         return output_file
         # Resample replacement if needed
         if replacement_sr != main_sr:
+            if replacement_audio.ndim > 1:
+                # Resample each channel separately
+                replacement_audio = np.array(
+                    [
+                        librosa.resample(
+                            replacement_audio[:, ch],
+                            orig_sr=replacement_sr,
+                            target_sr=main_sr,
+                        )
+                        for ch in range(replacement_audio.shape[1])
+                    ]
+                ).T
+            else:
+                replacement_audio = librosa.resample(
+                    replacement_audio, orig_sr=replacement_sr, target_sr=main_sr
+                )
         # Trim replacement to specified duration
         replacement_duration = end_time - start_time
         # Fade in replacement
         fade_in = np.linspace(0, 1, fade_samples)
+        if trimmed_replacement.ndim > 1:
+            fade_in = np.outer(fade_in, np.ones(trimmed_replacement.shape[1]))
         trimmed_replacement[:fade_samples] *= fade_in
         # Fade out at end of replacement
         fade_out = np.linspace(1, 0, fade_samples)
+        if trimmed_replacement.ndim > 1:
+            fade_out = np.outer(fade_out, np.ones(trimmed_replacement.shape[1]))
         trimmed_replacement[-fade_samples:] *= fade_out
         # Combine all parts
         output_file = os.path.join(output_path, output_filename)
         # Save final audio
+        sf.write(output_file, final_audio, main_sr)
         return output_file

tools/stems_separation.py CHANGED Viewed

@@ -9,8 +9,51 @@ class Error(Exception):
     pass
 def separate_audio(
-    audio_path: str, output_path: Optional[str] = None, output_format: str = "wav"
 ) -> Tuple[str, str, str, str]:
     """
     Separate audio into vocals, drums, bass, and other stems using Demucs.
@@ -23,6 +66,10 @@ def separate_audio(
         audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
         output_path: Directory to save the separated stems (default: 'output' directory)
         output_format: Output format for separated stems ('wav' or 'mp3', default: 'wav')
     Returns:
         tuple[str, str, str, str]: Paths to the separated audio files in order:
@@ -38,7 +85,7 @@ def separate_audio(
         - Create instrumental versions by combining drums+bass+other
     Note:
-        Uses the htdemucs model which is optimized for high-quality separation
         Processing time depends on audio length and system performance
         Output files are saved in WAV format for maximum quality
     """
@@ -50,7 +97,7 @@ def separate_audio(
         output_dir = os.path.join(output_path, "separated")
         os.makedirs(output_dir, exist_ok=True)
-        # Run Demucs separation
         cmd = [
             "python",
             "-m",
@@ -58,63 +105,48 @@ def separate_audio(
             "--out",
             output_dir,
             "--name",
-            "htdemucs",
-            audio_path,
         ]
-        result = subprocess.run(cmd, capture_output=True, text=True)
-        if result.returncode != 0:
-            raise Error(f"Demucs separation failed: {result.stderr}")
         # Find the separated files
         track_name = Path(audio_path).stem
-        htdemucs_dir = os.path.join(output_dir, "htdemucs", track_name)
         # Original WAV files from Demucs
-        vocals_wav = os.path.join(htdemucs_dir, "vocals.wav")
-        drums_wav = os.path.join(htdemucs_dir, "drums.wav")
-        bass_wav = os.path.join(htdemucs_dir, "bass.wav")
-        other_wav = os.path.join(htdemucs_dir, "other.wav")
         # Verify all files exist
-        for file_path in [vocals_wav, drums_wav, bass_wav, other_wav]:
             if not os.path.exists(file_path):
                 raise Error(f"Separated file not found: {file_path}")
-        # Convert to requested format if needed
-        if output_format.lower() == "mp3":
-            vocals_path = vocals_wav.replace(".wav", ".mp3")
-            drums_path = drums_wav.replace(".wav", ".mp3")
-            bass_path = bass_wav.replace(".wav", ".mp3")
-            other_path = other_wav.replace(".wav", ".mp3")
-            # Convert each stem to MP3
-            for wav_file, mp3_file in [
-                (vocals_wav, vocals_path),
-                (drums_wav, drums_path),
-                (bass_wav, bass_path),
-                (other_wav, other_path),
-            ]:
-                cmd = [
-                    "ffmpeg",
-                    "-y",
-                    "-i",
-                    wav_file,
-                    "-c:a",
-                    "libmp3lame",
-                    "-b:a",
-                    "192k",
-                    mp3_file,
-                ]
-                subprocess.run(cmd, capture_output=True, check=True)
-        else:
-            # Use original WAV files
-            vocals_path = vocals_wav
-            drums_path = drums_wav
-            bass_path = bass_wav
-            other_path = other_wav
         return vocals_path, drums_path, bass_path, other_path
     except Exception as e:
@@ -186,7 +218,13 @@ def extract_selected_stems(
 def extract_vocal_non_vocal(
-    audio_path: str, output_path: Optional[str] = None, output_format: str = "wav"
 ) -> Tuple[str, str]:
     """
     Extract vocals and non-vocals (instrumental) stems from an audio file.
@@ -198,7 +236,11 @@ def extract_vocal_non_vocal(
     Args:
         audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
         output_path: Directory to save the separated stems (default: 'output' directory)
         output_format: Output format for stems ('wav' or 'mp3', default: 'wav')
     Returns:
         tuple[str, str]: Paths to (vocals_file, non_vocals_file)
@@ -214,105 +256,59 @@ def extract_vocal_non_vocal(
         Uses the same high-quality Demucs model as separate_audio
         Non-vocals track is automatically mixed and normalized
     """
-    # Extract all stems
-    all_stems = separate_audio(audio_path, output_path, output_format)
-    vocals_path, drums_path, bass_path, other_path = all_stems
-    # Create non-vocals by combining drums, bass, and other
     try:
-        # Load all non-vocal stems
-        import librosa
-        import numpy as np
-        import soundfile as sf
-        y_drums, sr_drums = librosa.load(drums_path, sr=None, mono=False)
-        y_bass, sr_bass = librosa.load(bass_path, sr=None, mono=False)
-        y_other, sr_other = librosa.load(other_path, sr=None, mono=False)
-        # Ensure same sample rate
-        target_sr = max(sr_drums, sr_bass, sr_other)
-        if sr_drums != target_sr:
-            y_drums = librosa.resample(y_drums, orig_sr=sr_drums, target_sr=target_sr)
-        if sr_bass != target_sr:
-            y_bass = librosa.resample(y_bass, orig_sr=sr_bass, target_sr=target_sr)
-        if sr_other != target_sr:
-            y_other = librosa.resample(y_other, orig_sr=sr_other, target_sr=target_sr)
-        # Ensure same shape
-        max_length = max(y_drums.shape[-1], y_bass.shape[-1], y_other.shape[-1])
-        def pad_to_length(y, target_length):
-            if y.shape[-1] < target_length:
-                if y.ndim == 1:
-                    return np.pad(y, (0, target_length - y.shape[-1]), mode="constant")
-                else:
-                    return np.pad(
-                        y, ((0, 0), (0, target_length - y.shape[-1])), mode="constant"
-                    )
-            return y
-        y_drums = pad_to_length(y_drums, max_length)
-        y_bass = pad_to_length(y_bass, max_length)
-        y_other = pad_to_length(y_other, max_length)
-        # Combine non-vocal stems
-        non_vocals = y_drums + y_bass + y_other
-        # Normalize to prevent clipping
-        max_val = np.max(np.abs(non_vocals))
-        if max_val > 0:
-            non_vocals = non_vocals / max_val * 0.95
-        # Save non-vocals file
-        if output_path:
-            os.makedirs(output_path, exist_ok=True)
-            non_vocals_filename = os.path.join(
-                output_path, f"non_vocals.{output_format.lower()}"
-            )
-        else:
-            non_vocals_filename = os.path.join(
-                os.path.dirname(drums_path), f"non_vocals.{output_format.lower()}"
-            )
-        if non_vocals.ndim == 2:
-            non_vocals = non_vocals.T
         if output_format.lower() == "mp3":
-            # For MP3, save as WAV first then convert
-            import tempfile
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-                sf.write(
-                    temp_wav.name, non_vocals, target_sr, format="wav", subtype="PCM_16"
-                )
-                # Convert to MP3 using ffmpeg
-                cmd = [
-                    "ffmpeg",
-                    "-y",
-                    "-i",
-                    temp_wav.name,
-                    "-c:a",
-                    "libmp3lame",
-                    "-b:a",
-                    "192k",
-                    non_vocals_filename,
-                ]
-                subprocess.run(cmd, capture_output=True, check=True)
-                # Clean up temp file
-                os.unlink(temp_wav.name)
-        else:
-            sf.write(
-                non_vocals_filename,
-                non_vocals,
-                target_sr,
-                format="wav",
-                subtype="PCM_16",
-            )
-        return vocals_path, non_vocals_filename
     except Exception as e:
         raise RuntimeError(f"Error creating non-vocals track: {str(e)}")
@@ -370,6 +366,26 @@ if __name__ == "__main__":
         choices=["wav", "mp3"],
         help="Output format (default: wav)",
     )
     # New selective stems command
     select_parser = subparsers.add_parser("select", help="Extract specific stems only")
@@ -429,7 +445,13 @@ if __name__ == "__main__":
     try:
         if args.command == "separate":
             vocals, drums, bass, other = separate_audio(
-                args.audio_path, args.output_dir, args.format
             )
             print(f"Vocals: {vocals}")
             print(f"Drums: {drums}")

     pass
def run_command_with_streaming(cmd, description="Processing"):
    """Run a command and echo its combined stdout/stderr line by line.

    Args:
        cmd: Argument list handed to ``subprocess.Popen`` (no shell involved).
        description: Label used in the progress banner and in the error
            message when the command fails.

    Returns:
        The process exit code. This is always 0, because any other exit
        code raises instead of returning.

    Raises:
        RuntimeError: If the command exits with a non-zero code. The message
            includes the last lines the command printed.
    """
    from collections import deque

    print(f"🎵 {description}...")
    print(f"Command: {' '.join(str(c) for c in cmd)}")
    print("━" * 60)

    # stderr is merged into stdout below, so there is no separate stderr
    # stream to read after a failure. Keep the tail of the merged output so
    # the raised error still explains what went wrong.
    recent_lines = deque(maxlen=50)

    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,  # line-buffered so output appears as it is produced
    ) as process:
        # Iterating until EOF drains everything the child wrote, including
        # lines still sitting in the pipe at the moment it exits.
        for line in process.stdout:
            shown = line.strip()
            print(shown)
            recent_lines.append(shown)
        return_code = process.wait()

    if return_code != 0:
        error_output = "\n".join(recent_lines)
        raise RuntimeError(
            f"{description} failed (code {return_code}):\n{error_output}"
        )

    print("━" * 60)
    print(f"✅ {description} completed successfully!")
    return return_code
 def separate_audio(
+    audio_path: str,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+    model: str = "hdemucs_mmi",
+    device: Optional[str] = None,
+    segment: Optional[int] = None,
+    jobs: int = 1,
 ) -> Tuple[str, str, str, str]:
     """
     Separate audio into vocals, drums, bass, and other stems using Demucs.
         audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
         output_path: Directory to save the separated stems (default: 'output' directory)
         output_format: Output format for separated stems ('wav' or 'mp3', default: 'wav')
+        model: Demucs model to use (default: 'hdemucs_mmi')
+        device: Device to use for processing (default: cuda if available else cpu)
+        segment: Set split size of each chunk to save memory (default: None)
+        jobs: Number of parallel jobs (default: 1)
     Returns:
         tuple[str, str, str, str]: Paths to the separated audio files in order:
         - Create instrumental versions by combining drums+bass+other
     Note:
+        Uses the hdemucs_mmi model by default; pass `model` to select another Demucs model
         Processing time depends on audio length and system performance
         Output files are saved as WAV by default, or as 192 kbps MP3 when output_format is 'mp3'
     """
         output_dir = os.path.join(output_path, "separated")
         os.makedirs(output_dir, exist_ok=True)
+        # Build Demucs separation command with all parameters
         cmd = [
             "python",
             "-m",
             "--out",
             output_dir,
             "--name",
+            model,
+            "--jobs",
+            str(jobs),
         ]
+        # Add optional parameters if provided
+        if device:
+            cmd.extend(["--device", device])
+        if segment:
+            cmd.extend(["--segment", str(segment)])
+        # Add MP3 output if requested
+        if output_format.lower() == "mp3":
+            cmd.extend(["--mp3", "--mp3-bitrate", "192"])
+        cmd.append(audio_path)
+        # Run Demucs separation with real-time output
+        run_command_with_streaming(cmd, "Demucs stem separation")
         # Find the separated files
         track_name = Path(audio_path).stem
+        model_dir = os.path.join(output_dir, model, track_name)
         # Original WAV files from Demucs
+        vocals_path = os.path.join(model_dir, "vocals.wav")
+        drums_path = os.path.join(model_dir, "drums.wav")
+        bass_path = os.path.join(model_dir, "bass.wav")
+        other_path = os.path.join(model_dir, "other.wav")
+        # If MP3 output is requested, set the proper file names
+        if output_format.lower() == "mp3":
+            vocals_path = vocals_path.replace(".wav", ".mp3")
+            drums_path = drums_path.replace(".wav", ".mp3")
+            bass_path = bass_path.replace(".wav", ".mp3")
+            other_path = other_path.replace(".wav", ".mp3")
         # Verify all files exist
+        for file_path in [vocals_path, drums_path, bass_path, other_path]:
             if not os.path.exists(file_path):
                 raise Error(f"Separated file not found: {file_path}")
         return vocals_path, drums_path, bass_path, other_path
     except Exception as e:
 def extract_vocal_non_vocal(
     audio_path: str,
     output_path: str = "output",
     model: str = "hdemucs_mmi",
     output_format: str = "wav",
     device: Optional[str] = None,
     segment: Optional[int] = None,
     jobs: int = 1,
 ) -> Tuple[str, str]:
     """
     Extract vocals and non-vocals (instrumental) stems from an audio file.

     Runs Demucs in ``--two-stems vocals`` mode and returns the two files it
     is expected to write under ``<output_path>/separated/<model>/<track>/``,
     where ``<track>`` is the input file name without its extension.

     Args:
         audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
         output_path: Directory to save the separated stems (default: 'output' directory)
         model: Demucs model to use (default: 'hdemucs_mmi')
         output_format: Output format for stems ('wav' or 'mp3', default: 'wav')
         device: Device to use for processing; when omitted no --device flag
             is passed and Demucs chooses for itself
         segment: Set split size of each chunk to save memory (default: None)
         jobs: Number of parallel jobs (default: 1)

     Returns:
         tuple[str, str]: Paths to (vocals_file, non_vocals_file)

     Raises:
         RuntimeError: If the Demucs command fails or an expected stem file
             is missing afterwards. The original exception is chained.

     Note:
         The non-vocals stem is the ``no_vocals`` file written by Demucs;
         no additional mixing is performed here.
     """
     try:
         output_dir = os.path.join(output_path, "separated")
         os.makedirs(output_dir, exist_ok=True)

         # Build Demucs separation command with all parameters
         cmd = [
             "python",
             "-m",
             "demucs.separate",
             "--out",
             output_dir,
             "--name",
             model,
             "--jobs",
             str(jobs),
             "--two-stems",
             "vocals",
         ]

         # Add optional parameters if provided
         if device:
             cmd.extend(["--device", device])
         if segment:
             cmd.extend(["--segment", str(segment)])

         # Add MP3 output if requested
         wants_mp3 = output_format.lower() == "mp3"
         if wants_mp3:
             cmd.extend(["--mp3", "--mp3-bitrate", "192"])

         cmd.append(audio_path)

         # Run Demucs separation with real-time output
         run_command_with_streaming(cmd, "Demucs stem separation")

         # Locate the stems. The extension is chosen when the file name is
         # built rather than by replacing ".wav" in the finished path, which
         # would also rewrite a track directory whose name contains ".wav".
         track_name = Path(audio_path).stem
         model_dir = os.path.join(output_dir, model, track_name)
         extension = "mp3" if wants_mp3 else "wav"
         vocals_path = os.path.join(model_dir, f"vocals.{extension}")
         non_vocals_path = os.path.join(model_dir, f"no_vocals.{extension}")

         # Verify all files exist
         for file_path in (vocals_path, non_vocals_path):
             if not os.path.exists(file_path):
                 raise FileNotFoundError(f"Separated file not found: {file_path}")

         return vocals_path, non_vocals_path
     except Exception as e:
         raise RuntimeError(f"Error creating non-vocals track: {str(e)}") from e
         choices=["wav", "mp3"],
         help="Output format (default: wav)",
     )
+    separate_parser.add_argument(
+        "--model",
+        default="htdemucs",
+        help="Demucs model to use (default: htdemucs)",
+    )
+    separate_parser.add_argument(
+        "--device",
+        help="Device to use for processing (default: cuda if available else cpu)",
+    )
+    separate_parser.add_argument(
+        "--segment",
+        type=float,
+        help="Set split size of each chunk to save memory",
+    )
+    separate_parser.add_argument(
+        "--jobs",
+        type=int,
+        default=1,
+        help="Number of parallel jobs (default: 1)",
+    )
     # New selective stems command
     select_parser = subparsers.add_parser("select", help="Extract specific stems only")
     try:
         if args.command == "separate":
             vocals, drums, bass, other = separate_audio(
+                args.audio_path,
+                args.output_dir,
+                args.format,
+                args.model,
+                args.device,
+                args.segment,
+                args.jobs,
             )
             print(f"Vocals: {vocals}")
             print(f"Drums: {drums}")

tools/voice_replacement.py CHANGED Viewed

@@ -1,4 +1,7 @@
 import ssl
 import tempfile
 import urllib.request
 from datetime import datetime
@@ -256,7 +259,7 @@ def replace_voice(
             if len(result) > 1:
                 item = result[1]
-            if url:= item.get("url"):
                 # Download each URL to a separate file
                 item_output = str(output_path)
                 download_audio_from_url(url, item_output)
@@ -381,9 +384,6 @@ if __name__ == "__main__":
         python tools/voice_replacement.py https://example.com/source.wav target.wav
         python tools/voice_replacement.py source.wav https://example.com/target.mp3 --pitch 2
     """
-    import argparse
-    import sys
-    import os
     # Add parent directory to path for imports
     sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

+import argparse
+import os
 import ssl
+import sys
 import tempfile
 import urllib.request
 from datetime import datetime
             if len(result) > 1:
                 item = result[1]
+            if url := item.get("url"):
                 # Download each URL to a separate file
                 item_output = str(output_path)
                 download_audio_from_url(url, item_output)
         python tools/voice_replacement.py https://example.com/source.wav target.wav
         python tools/voice_replacement.py source.wav https://example.com/target.mp3 --pitch 2
     """
     # Add parent directory to path for imports
     sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))