ASesYusuf1 committed on
Commit
defb0b3
·
verified ·
1 Parent(s): 6242fc6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -72
app.py CHANGED
@@ -22,7 +22,7 @@ import gc
22
  import time
23
  from concurrent.futures import ThreadPoolExecutor, as_completed
24
  from threading import Lock
25
- import scipy
26
 
27
  # Logging setup
28
  logging.basicConfig(level=logging.INFO)
@@ -384,7 +384,7 @@ def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, p
384
  scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
385
  audio = temp_audio_path
386
  if seg_size > 512:
387
- logger.warning(f"Segment size {seg_size} is large, this may cause crashes on ZeroGPU.")
388
  override_seg_size = override_seg_size == "True"
389
  if os.path.exists(output_dir):
390
  shutil.rmtree(output_dir)
@@ -429,14 +429,13 @@ def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, p
429
  logger.info("GPU memory cleared")
430
 
431
  @spaces.GPU
432
- def auto_ensemble_process(audio, model_keys, seg_size=128, overlap=0.1, out_format="wav", use_tta="False", model_dir="/tmp/audio-separator-models/", output_dir="output", norm_thresh=0.9, amp_thresh=0.9, batch_size=1, ensemble_method="avg_wave", exclude_stems="", weights_str="", progress=gr.Progress(track_tqdm=True)):
433
  temp_audio_path = None
434
- chunk_paths = []
435
  max_retries = 2
436
  start_time = time.time()
437
- time_budget = 100 # seconds, to stay within ZeroGPU limit
438
- max_models = 6 # Reasonable limit to prevent timeouts
439
- gpu_lock = Lock() # Ensure only one model uses GPU at a time
440
 
441
  try:
442
  if not audio:
@@ -444,12 +443,15 @@ def auto_ensemble_process(audio, model_keys, seg_size=128, overlap=0.1, out_form
444
  if not model_keys:
445
  raise ValueError("No models selected.")
446
  if len(model_keys) > max_models:
447
- logger.warning(f"Selected {len(model_keys)} models, limiting to {max_models} to avoid ZeroGPU timeouts.")
448
  model_keys = model_keys[:max_models]
449
 
450
- # Dynamic batch size adjustment
451
- dynamic_batch_size = max(1, min(4, 1 + (6 - len(model_keys)) // 2))
452
- logger.info(f"Using batch size: {dynamic_batch_size} for {len(model_keys)} models")
 
 
 
453
 
454
  if isinstance(audio, tuple):
455
  sample_rate, data = audio
@@ -457,28 +459,6 @@ def auto_ensemble_process(audio, model_keys, seg_size=128, overlap=0.1, out_form
457
  scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
458
  audio = temp_audio_path
459
 
460
- audio_data, sr = librosa.load(audio, sr=None, mono=False)
461
- duration = librosa.get_duration(y=audio_data, sr=sr)
462
- logger.info(f"Audio duration: {duration:.2f} seconds")
463
-
464
- # Optimize chunking
465
- chunk_duration = 300 if duration > 900 else duration
466
- chunks = []
467
- if duration > 900:
468
- logger.info(f"Audio exceeds 15 minutes, splitting into {chunk_duration}-second chunks")
469
- num_chunks = int(np.ceil(duration / chunk_duration))
470
- for i in range(num_chunks):
471
- start = i * chunk_duration * sr
472
- end = min((i + 1) * chunk_duration * sr, audio_data.shape[-1])
473
- chunk_data = audio_data[:, start:end] if audio_data.ndim == 2 else audio_data[start:end]
474
- chunk_path = os.path.join("/tmp", f"chunk_{i}.wav")
475
- sf.write(chunk_path, chunk_data.T if audio_data.ndim == 2 else chunk_data, sr)
476
- chunks.append(chunk_path)
477
- chunk_paths.append(chunk_path)
478
- logger.info(f"Created chunk {i}: {chunk_path}")
479
- else:
480
- chunks = [audio]
481
-
482
  use_tta = use_tta == "True"
483
  if os.path.exists(output_dir):
484
  shutil.rmtree(output_dir)
@@ -490,9 +470,9 @@ def auto_ensemble_process(audio, model_keys, seg_size=128, overlap=0.1, out_form
490
  model_cache = {}
491
  all_stems = []
492
  model_stems = {model_key: {"vocals": [], "other": []} for model_key in model_keys}
493
- total_tasks = len(model_keys) * len(chunks)
494
 
495
- def process_model_chunk(model_key, chunk_path, chunk_idx, model_idx):
496
  with torch.no_grad():
497
  for attempt in range(max_retries + 1):
498
  try:
@@ -508,8 +488,8 @@ def auto_ensemble_process(audio, model_keys, seg_size=128, overlap=0.1, out_form
508
  # Check time budget
509
  elapsed = time.time() - start_time
510
  if elapsed > time_budget:
511
- logger.error(f"Time budget ({time_budget}s) exceeded, aborting")
512
- raise TimeoutError("Processing exceeded time budget")
513
 
514
  # Initialize separator
515
  model_path = os.path.join(model_dir, model)
@@ -537,9 +517,9 @@ def auto_ensemble_process(audio, model_keys, seg_size=128, overlap=0.1, out_form
537
 
538
  # Process with GPU lock
539
  with gpu_lock:
540
- progress((model_idx + chunk_idx / len(chunks)) / len(model_keys), desc=f"Separating chunk {chunk_idx} with {model_key}")
541
- logger.info(f"Separating chunk {chunk_idx} with {model_key}")
542
- separation = separator.separate(chunk_path)
543
  stems = [os.path.join(output_dir, file_name) for file_name in separation]
544
  result = []
545
  for stem in stems:
@@ -550,35 +530,30 @@ def auto_ensemble_process(audio, model_keys, seg_size=128, overlap=0.1, out_form
550
  result.append(stem)
551
  return result
552
  except Exception as e:
553
- logger.error(f"Error processing {model_key} chunk {chunk_idx}, attempt {attempt + 1}/{max_retries + 1}: {e}")
554
  if attempt == max_retries:
555
- logger.error(f"Max retries reached for {model_key} chunk {chunk_idx}, skipping")
556
  return []
557
  time.sleep(1)
558
  finally:
559
  if torch.cuda.is_available():
560
  torch.cuda.empty_cache()
561
- logger.info(f"Cleared CUDA cache after {model_key} chunk {chunk_idx}")
562
 
563
  # Parallel processing
564
  progress(0.1, desc="Starting model separations...")
565
  with ThreadPoolExecutor(max_workers=min(4, len(model_keys))) as executor:
566
- future_to_task = {}
567
- for model_idx, model_key in enumerate(model_keys):
568
- for chunk_idx, chunk_path in enumerate(chunks):
569
- future = executor.submit(process_model_chunk, model_key, chunk_path, chunk_idx, model_idx)
570
- future_to_task[future] = (model_key, chunk_idx)
571
-
572
  for future in as_completed(future_to_task):
573
- model_key, chunk_idx = future_to_task[future]
574
  try:
575
  stems = future.result()
576
  if stems:
577
- logger.info(f"Completed {model_key} chunk {chunk_idx}")
578
  else:
579
- logger.warning(f"No stems produced for {model_key} chunk {chunk_idx}")
580
  except Exception as e:
581
- logger.error(f"Task {model_key} chunk {chunk_idx} failed: {e}")
582
 
583
  # Clear model cache
584
  model_cache.clear()
@@ -594,10 +569,9 @@ def auto_ensemble_process(audio, model_keys, seg_size=128, overlap=0.1, out_form
594
  if stems_dict[stem_type]:
595
  combined_path = os.path.join(output_dir, f"{base_name}_{stem_type}_{model_key.replace(' | ', '_').replace(' ', '_')}.wav")
596
  try:
597
- with sf.SoundFile(combined_path, 'w', sr, channels=2 if audio_data.ndim == 2 else 1) as f:
598
- for stem_path in stems_dict[stem_type]:
599
- data, _ = librosa.load(stem_path, sr=sr, mono=False)
600
- f.write(data.T if data.ndim == 2 else data)
601
  logger.info(f"Combined {stem_type} for {model_key}: {combined_path}")
602
  if exclude_stems.strip() and stem_type.lower() in [s.strip().lower() for s in exclude_stems.split(',')]:
603
  logger.info(f"Excluding {stem_type} for {model_key}")
@@ -642,19 +616,15 @@ def auto_ensemble_process(audio, model_keys, seg_size=128, overlap=0.1, out_form
642
  raise RuntimeError(error_msg)
643
  except Exception as e:
644
  logger.error(f"Ensemble error: {e}")
645
- if "ZeroGPU" in str(e) or "aborted" in str(e).lower() or isinstance(e, TimeoutError):
646
- error_msg = f"ZeroGPU task aborted or timed out. Try fewer models (max {max_models}), shorter audio, or uploading a local WAV file."
647
- else:
648
- error_msg = f"Ensemble error: {e}"
649
  raise RuntimeError(error_msg)
650
  finally:
651
- for path in chunk_paths + ([temp_audio_path] if temp_audio_path and os.path.exists(temp_audio_path) else []):
652
  try:
653
- if os.path.exists(path):
654
- os.remove(path)
655
- logger.info(f"Temporary file deleted: {path}")
656
  except Exception as e:
657
- logger.warning(f"Failed to delete temporary file {path}: {e}")
658
  if torch.cuda.is_available():
659
  torch.cuda.empty_cache()
660
  logger.info("GPU memory cleared")
@@ -679,8 +649,7 @@ def create_interface():
679
  with gr.Blocks(title="🎡 SESA Fast Separation 🎡", css=CSS, elem_id="app-container") as app:
680
  gr.Markdown("<h1 class='header-text'>🎡 SESA Fast Separation 🎡</h1>")
681
  gr.Markdown("**Note**: If YouTube downloads fail, upload a valid cookies file or a local WAV file. [Cookie Instructions](https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies)")
682
- gr.Markdown("**Warning**: Audio files longer than 15 minutes are split into 5-minute chunks, increasing processing time.")
683
- gr.Markdown("**ZeroGPU Notice**: Up to 6 models supported for ensemble. For long audio, use fewer models or a local WAV file to avoid timeouts.")
684
  with gr.Tabs():
685
  with gr.Tab("βš™οΈ Settings"):
686
  with gr.Group(elem_classes="dubbing-theme"):
@@ -705,7 +674,7 @@ def create_interface():
705
  roformer_category = gr.Dropdown(label="πŸ“š Category", choices=list(ROFORMER_MODELS.keys()), value="General Purpose", interactive=True)
706
  roformer_model = gr.Dropdown(label="πŸ› οΈ Model", choices=list(ROFORMER_MODELS["General Purpose"].keys()), interactive=True, allow_custom_value=True)
707
  with gr.Row():
708
- roformer_seg_size = gr.Slider(32, 512, value=128, step=32, label="πŸ“ Segment Size", interactive=True)
709
  roformer_overlap = gr.Slider(2, 10, value=8, step=1, label="πŸ”„ Overlap", interactive=True)
710
  with gr.Row():
711
  roformer_pitch_shift = gr.Slider(-12, 12, value=0, step=1, label="🎡 Pitch Shift", interactive=True)
@@ -717,7 +686,7 @@ def create_interface():
717
  with gr.Tab("🎚️ Auto Ensemble"):
718
  with gr.Group(elem_classes="dubbing-theme"):
719
  gr.Markdown("### Ensemble Processing")
720
- gr.Markdown("Note: If weights are not specified, equal weights (1.0) are applied. Max 6 models recommended to avoid ZeroGPU timeouts.")
721
  with gr.Row():
722
  ensemble_audio = gr.Audio(label="🎧 Upload Audio", type="filepath", interactive=True)
723
  url_ensemble = gr.Textbox(label="πŸ”— Or Paste URL", placeholder="YouTube or audio URL", interactive=True)
@@ -729,7 +698,7 @@ def create_interface():
729
  ensemble_category = gr.Dropdown(label="πŸ“š Category", choices=list(ROFORMER_MODELS.keys()), value="Instrumentals", interactive=True)
730
  ensemble_models = gr.Dropdown(label="πŸ› οΈ Models (Max 6)", choices=list(ROFORMER_MODELS["Instrumentals"].keys()), multiselect=True, interactive=True, allow_custom_value=True)
731
  with gr.Row():
732
- ensemble_seg_size = gr.Slider(32, 512, value=128, step=32, label="πŸ“ Segment Size", interactive=True)
733
  ensemble_overlap = gr.Slider(2, 10, value=8, step=1, label="πŸ”„ Overlap", interactive=True)
734
  ensemble_use_tta = gr.Dropdown(choices=["True", "False"], value="False", label="πŸ” Use TTA", interactive=True)
735
  ensemble_method = gr.Dropdown(label="βš™οΈ Ensemble Method", choices=['avg_wave', 'median_wave', 'max_wave', 'min_wave', 'avg_fft', 'median_fft', 'max_fft', 'min_fft'], value='avg_wave', interactive=True)
@@ -763,7 +732,7 @@ def create_interface():
763
  fn=auto_ensemble_process,
764
  inputs=[
765
  ensemble_audio, ensemble_models, ensemble_seg_size, ensemble_overlap,
766
- output_format, ensemble_use_tta, model_file_dir, output_dir,
767
  norm_threshold, amp_threshold, batch_size, ensemble_method,
768
  ensemble_exclude_stems, ensemble_weights
769
  ],
 
22
  import time
23
  from concurrent.futures import ThreadPoolExecutor, as_completed
24
  from threading import Lock
25
+ import scipy.io.wavfile
26
 
27
  # Logging setup
28
  logging.basicConfig(level=logging.INFO)
 
384
  scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
385
  audio = temp_audio_path
386
  if seg_size > 512:
387
+ logger.warning(f"Segment size {seg_size} is large, this may cause issues.")
388
  override_seg_size = override_seg_size == "True"
389
  if os.path.exists(output_dir):
390
  shutil.rmtree(output_dir)
 
429
  logger.info("GPU memory cleared")
430
 
431
  @spaces.GPU
432
+ def auto_ensemble_process(audio, model_keys, seg_size=64, overlap=0.1, out_format="wav", use_tta="False", model_dir="/tmp/audio-separator-models/", output_dir="output", norm_thresh=0.9, amp_thresh=0.9, batch_size=1, ensemble_method="avg_wave", exclude_stems="", weights_str="", progress=gr.Progress(track_tqdm=True)):
433
  temp_audio_path = None
 
434
  max_retries = 2
435
  start_time = time.time()
436
+ time_budget = 100 # seconds
437
+ max_models = 6
438
+ gpu_lock = Lock()
439
 
440
  try:
441
  if not audio:
 
443
  if not model_keys:
444
  raise ValueError("No models selected.")
445
  if len(model_keys) > max_models:
446
+ logger.warning(f"Selected {len(model_keys)} models, limiting to {max_models}.")
447
  model_keys = model_keys[:max_models]
448
 
449
+ # Dynamic batch size based on audio duration and model count
450
+ audio_data, sr = librosa.load(audio, sr=None, mono=False)
451
+ duration = librosa.get_duration(y=audio_data, sr=sr)
452
+ logger.info(f"Audio duration: {duration:.2f} seconds")
453
+ dynamic_batch_size = max(1, min(4, 1 + int(900 / (duration + 1)) - len(model_keys) // 2))
454
+ logger.info(f"Using batch size: {dynamic_batch_size} for {len(model_keys)} models, duration {duration:.2f}s")
455
 
456
  if isinstance(audio, tuple):
457
  sample_rate, data = audio
 
459
  scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
460
  audio = temp_audio_path
461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  use_tta = use_tta == "True"
463
  if os.path.exists(output_dir):
464
  shutil.rmtree(output_dir)
 
470
  model_cache = {}
471
  all_stems = []
472
  model_stems = {model_key: {"vocals": [], "other": []} for model_key in model_keys}
473
+ total_tasks = len(model_keys)
474
 
475
+ def process_model(model_key, model_idx):
476
  with torch.no_grad():
477
  for attempt in range(max_retries + 1):
478
  try:
 
488
  # Check time budget
489
  elapsed = time.time() - start_time
490
  if elapsed > time_budget:
491
+ logger.error(f"Time budget ({time_budget}s) exceeded")
492
+ raise TimeoutError("Processing took too long")
493
 
494
  # Initialize separator
495
  model_path = os.path.join(model_dir, model)
 
517
 
518
  # Process with GPU lock
519
  with gpu_lock:
520
+ progress(0.3 + (model_idx / total_tasks) * 0.5, desc=f"Separating with {model_key}")
521
+ logger.info(f"Separating with {model_key}")
522
+ separation = separator.separate(audio)
523
  stems = [os.path.join(output_dir, file_name) for file_name in separation]
524
  result = []
525
  for stem in stems:
 
530
  result.append(stem)
531
  return result
532
  except Exception as e:
533
+ logger.error(f"Error processing {model_key}, attempt {attempt + 1}/{max_retries + 1}: {e}")
534
  if attempt == max_retries:
535
+ logger.error(f"Max retries reached for {model_key}, skipping")
536
  return []
537
  time.sleep(1)
538
  finally:
539
  if torch.cuda.is_available():
540
  torch.cuda.empty_cache()
541
+ logger.info(f"Cleared CUDA cache after {model_key}")
542
 
543
  # Parallel processing
544
  progress(0.1, desc="Starting model separations...")
545
  with ThreadPoolExecutor(max_workers=min(4, len(model_keys))) as executor:
546
+ future_to_task = {executor.submit(process_model, model_key, idx): model_key for idx, model_key in enumerate(model_keys)}
 
 
 
 
 
547
  for future in as_completed(future_to_task):
548
+ model_key = future_to_task[future]
549
  try:
550
  stems = future.result()
551
  if stems:
552
+ logger.info(f"Completed {model_key}")
553
  else:
554
+ logger.warning(f"No stems produced for {model_key}")
555
  except Exception as e:
556
+ logger.error(f"Task {model_key} failed: {e}")
557
 
558
  # Clear model cache
559
  model_cache.clear()
 
569
  if stems_dict[stem_type]:
570
  combined_path = os.path.join(output_dir, f"{base_name}_{stem_type}_{model_key.replace(' | ', '_').replace(' ', '_')}.wav")
571
  try:
572
+ data, _ = librosa.load(stems_dict[stem_type][0], sr=sr, mono=False)
573
+ with sf.SoundFile(combined_path, 'w', sr, channels=2 if data.ndim == 2 else 1) as f:
574
+ f.write(data.T if data.ndim == 2 else data)
 
575
  logger.info(f"Combined {stem_type} for {model_key}: {combined_path}")
576
  if exclude_stems.strip() and stem_type.lower() in [s.strip().lower() for s in exclude_stems.split(',')]:
577
  logger.info(f"Excluding {stem_type} for {model_key}")
 
616
  raise RuntimeError(error_msg)
617
  except Exception as e:
618
  logger.error(f"Ensemble error: {e}")
619
+ error_msg = f"Processing failed. Try fewer models (max {max_models}), shorter audio, or uploading a local WAV file."
 
 
 
620
  raise RuntimeError(error_msg)
621
  finally:
622
+ if temp_audio_path and os.path.exists(temp_audio_path):
623
  try:
624
+ os.remove(temp_audio_path)
625
+ logger.info(f"Temporary file deleted: {temp_audio_path}")
 
626
  except Exception as e:
627
+ logger.warning(f"Failed to delete temporary file {temp_audio_path}: {e}")
628
  if torch.cuda.is_available():
629
  torch.cuda.empty_cache()
630
  logger.info("GPU memory cleared")
 
649
  with gr.Blocks(title="🎡 SESA Fast Separation 🎡", css=CSS, elem_id="app-container") as app:
650
  gr.Markdown("<h1 class='header-text'>🎡 SESA Fast Separation 🎡</h1>")
651
  gr.Markdown("**Note**: If YouTube downloads fail, upload a valid cookies file or a local WAV file. [Cookie Instructions](https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies)")
652
+ gr.Markdown("**Tip**: For best results, use audio shorter than 15 minutes or fewer models (up to 6) to ensure smooth processing.")
 
653
  with gr.Tabs():
654
  with gr.Tab("βš™οΈ Settings"):
655
  with gr.Group(elem_classes="dubbing-theme"):
 
674
  roformer_category = gr.Dropdown(label="πŸ“š Category", choices=list(ROFORMER_MODELS.keys()), value="General Purpose", interactive=True)
675
  roformer_model = gr.Dropdown(label="πŸ› οΈ Model", choices=list(ROFORMER_MODELS["General Purpose"].keys()), interactive=True, allow_custom_value=True)
676
  with gr.Row():
677
+ roformer_seg_size = gr.Slider(32, 512, value=64, step=32, label="πŸ“ Segment Size", interactive=True)
678
  roformer_overlap = gr.Slider(2, 10, value=8, step=1, label="πŸ”„ Overlap", interactive=True)
679
  with gr.Row():
680
  roformer_pitch_shift = gr.Slider(-12, 12, value=0, step=1, label="🎡 Pitch Shift", interactive=True)
 
686
  with gr.Tab("🎚️ Auto Ensemble"):
687
  with gr.Group(elem_classes="dubbing-theme"):
688
  gr.Markdown("### Ensemble Processing")
689
+ gr.Markdown("Note: If weights are not specified, equal weights (1.0) are applied. Use up to 6 models for best results.")
690
  with gr.Row():
691
  ensemble_audio = gr.Audio(label="🎧 Upload Audio", type="filepath", interactive=True)
692
  url_ensemble = gr.Textbox(label="πŸ”— Or Paste URL", placeholder="YouTube or audio URL", interactive=True)
 
698
  ensemble_category = gr.Dropdown(label="πŸ“š Category", choices=list(ROFORMER_MODELS.keys()), value="Instrumentals", interactive=True)
699
  ensemble_models = gr.Dropdown(label="πŸ› οΈ Models (Max 6)", choices=list(ROFORMER_MODELS["Instrumentals"].keys()), multiselect=True, interactive=True, allow_custom_value=True)
700
  with gr.Row():
701
+ ensemble_seg_size = gr.Slider(32, 512, value=64, step=32, label="πŸ“ Segment Size", interactive=True)
702
  ensemble_overlap = gr.Slider(2, 10, value=8, step=1, label="πŸ”„ Overlap", interactive=True)
703
  ensemble_use_tta = gr.Dropdown(choices=["True", "False"], value="False", label="πŸ” Use TTA", interactive=True)
704
  ensemble_method = gr.Dropdown(label="βš™οΈ Ensemble Method", choices=['avg_wave', 'median_wave', 'max_wave', 'min_wave', 'avg_fft', 'median_fft', 'max_fft', 'min_fft'], value='avg_wave', interactive=True)
 
732
  fn=auto_ensemble_process,
733
  inputs=[
734
  ensemble_audio, ensemble_models, ensemble_seg_size, ensemble_overlap,
735
+ output_format, ensemble_use_tta, model_dir, output_dir,
736
  norm_threshold, amp_threshold, batch_size, ensemble_method,
737
  ensemble_exclude_stems, ensemble_weights
738
  ],