SESA_Fast_Separation

Running

App Files Community

ASesYusuf1 committed on May 26, 2025

Commit

a3090f7

verified ·

1 Parent(s): 799e841

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -19

app.py CHANGED Viewed

@@ -23,6 +23,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from threading import Lock
 import scipy.io.wavfile
 import spaces
 # Logging setup
 logging.basicConfig(level=logging.INFO)
@@ -155,7 +156,7 @@ ROFORMER_MODELS = {
 OUTPUT_FORMATS = ['wav', 'flac', 'mp3', 'ogg', 'opus', 'm4a', 'aiff', 'ac3']
-# CSS (unchanged)
 CSS = """
 body {
     background: linear-gradient(to bottom, rgba(45, 11, 11, 0.9), rgba(0, 0, 0, 0.8)), url('/content/logo.jpg') no-repeat center center fixed;
@@ -383,12 +384,36 @@ def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, p
     if not audio:
         raise ValueError("No audio file provided.")
     temp_audio_path = None
     try:
-        if isinstance(audio, tuple):
-            sample_rate, data = audio
             temp_audio_path = os.path.join("/tmp", "temp_audio.wav")
             scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
-            audio = temp_audio_path
         if seg_size > 512:
             logger.warning(f"Segment size {seg_size} is large, this may cause issues.")
         override_seg_size = override_seg_size == "True"
@@ -416,7 +441,7 @@ def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, p
         progress(0.2, desc="Loading model...")
         separator.load_model(model_filename=model)
         progress(0.7, desc="Separating audio...")
-        separation = separator.separate(audio)
         stems = [os.path.join(output_dir, file_name) for file_name in separation]
         file_list = []
         if exclude_stems.strip():
@@ -437,6 +462,9 @@ def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, p
         if temp_audio_path and os.path.exists(temp_audio_path):
             os.remove(temp_audio_path)
             logger.info(f"Temporary file deleted: {temp_audio_path}")
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             logger.info("GPU memory cleared")
@@ -444,6 +472,7 @@ def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, p
 @spaces.GPU
 def auto_ensemble_process(audio, model_keys, state, seg_size=64, overlap=0.1, out_format="wav", use_tta="False", model_dir="/tmp/audio-separator-models/", output_dir="output", norm_thresh=0.9, amp_thresh=0.9, batch_size=1, ensemble_method="avg_wave", exclude_stems="", weights_str="", progress=gr.Progress(track_tqdm=True)):
     temp_audio_path = None
     start_time = time.time()
     try:
         if not audio:
@@ -454,18 +483,40 @@ def auto_ensemble_process(audio, model_keys, state, seg_size=64, overlap=0.1, ou
             logger.warning(f"Selected {len(model_keys)} models, limiting to {max_models}.")
             model_keys = model_keys[:max_models]
         # Audio süresine göre dinamik batch size
-        audio_data, sr = librosa.load(audio, sr=None, mono=False)
         duration = librosa.get_duration(y=audio_data, sr=sr)
         logger.info(f"Audio duration: {duration:.2f} seconds")
         dynamic_batch_size = max(1, min(4, 1 + int(900 / (duration + 1)) - len(model_keys) // 2))
         logger.info(f"Using batch size: {dynamic_batch_size} for {len(model_keys)} models, duration {duration:.2f}s")
-        if isinstance(audio, tuple):
-            sample_rate, data = audio
             temp_audio_path = os.path.join("/tmp", "temp_audio.wav")
             scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
-            audio = temp_audio_path
         # State kontrolü
         if not state:
@@ -607,7 +658,7 @@ def auto_ensemble_process(audio, model_keys, state, seg_size=64, overlap=0.1, ou
                     with gpu_lock:
                         progress(0.3, desc=f"Separating with {model_key}")
                         logger.info(f"Separating with {model_key}")
-                        separation = separator.separate(audio)
                         stems = [os.path.join(output_dir, file_name) for file_name in separation]
                         result = []
@@ -674,10 +725,16 @@ def auto_ensemble_process(audio, model_keys, state, seg_size=64, overlap=0.1, ou
                 logger.info(f"Temporary file deleted: {temp_audio_path}")
             except Exception as e:
                 logger.warning(f"Failed to delete temporary file {temp_audio_path}: {e}")
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-            logger.info("GPU memory cleared")
 def update_roformer_models(category):
     """Update Roformer model dropdown based on selected category."""
     choices = list(ROFORMER_MODELS.get(category, {}).keys()) or []
@@ -697,8 +754,8 @@ def download_audio_wrapper(url, cookie_file):
 def create_interface():
     with gr.Blocks(title="🎵 SESA Fast Separation 🎵", css=CSS, elem_id="app-container") as app:
         gr.Markdown("<h1 class='header-text'>🎵 SESA Fast Separation 🎵</h1>")
-        gr.Markdown("**Note**: If YouTube downloads fail, upload a valid cookies file or a local WAV file. [Cookie Instructions](https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies)")
-        gr.Markdown("**Tip**: For best results, use audio shorter than 15 minutes or fewer models (up to 6) to ensure smooth processing.")
         # Gradio State bileşeni
         ensemble_state = gr.State(value={
             "current_audio": None,
@@ -720,8 +777,8 @@ def create_interface():
                 with gr.Group(elem_classes="dubbing-theme"):
                     gr.Markdown("### Audio Separation")
                     with gr.Row():
-                        roformer_audio = gr.Audio(label="🎧 Upload Audio", type="filepath", interactive=True)
-                        url_ro = gr.Textbox(label="🔗 Or Paste URL", placeholder="YouTube or audio URL", interactive=True)
                         cookies_ro = gr.File(label="🍪 Cookies File", file_types=[".txt"], interactive=True)
                         download_roformer = gr.Button("⬇️ Download", variant="secondary")
                     roformer_download_status = gr.Textbox(label="📢 Download Status", interactive=False)
@@ -745,8 +802,8 @@ def create_interface():
                     gr.Markdown("### Ensemble Processing")
                     gr.Markdown("Note: If weights are not specified, equal weights (1.0) are applied. Use up to 6 models for best results.")
                     with gr.Row():
-                        ensemble_audio = gr.Audio(label="🎧 Upload Audio", type="filepath", interactive=True)
-                        url_ensemble = gr.Textbox(label="🔗 Or Paste URL", placeholder="YouTube or audio URL", interactive=True)
                         cookies_ensemble = gr.File(label="🍪 Cookies File", file_types=[".txt"], interactive=True)
                         download_ensemble = gr.Button("⬇️ Download", variant="secondary")
                     ensemble_download_status = gr.Textbox(label="📢 Download Status", interactive=False)
@@ -790,7 +847,7 @@ def create_interface():
             fn=auto_ensemble_process,
             inputs=[
                 ensemble_audio, ensemble_models, ensemble_state, ensemble_seg_size, ensemble_overlap,
-                output_format, ensemble_use_tta, model_file_dir, output_dir,
                 norm_threshold, amp_threshold, batch_size, ensemble_method,
                 ensemble_exclude_stems, ensemble_weights
             ],

 from threading import Lock
 import scipy.io.wavfile
 import spaces
+import subprocess
 # Logging setup
 logging.basicConfig(level=logging.INFO)
 OUTPUT_FORMATS = ['wav', 'flac', 'mp3', 'ogg', 'opus', 'm4a', 'aiff', 'ac3']
+# CSS (değişmedi)
 CSS = """
 body {
     background: linear-gradient(to bottom, rgba(45, 11, 11, 0.9), rgba(0, 0, 0, 0.8)), url('/content/logo.jpg') no-repeat center center fixed;
     if not audio:
         raise ValueError("No audio file provided.")
     temp_audio_path = None
+    extracted_audio_path = None
     try:
+        # Giriş dosyasının uzantısını kontrol et
+        file_extension = os.path.splitext(audio)[1].lower().lstrip('.')
+        supported_video_formats = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm', 'mpeg', 'mpg']
+        is_video = file_extension in supported_video_formats
+        # Eğer giriş bir video dosyasıysa, sesi çıkar
+        audio_to_process = audio
+        if is_video:
+            extracted_audio_path = os.path.join("/tmp", f"extracted_audio_{os.path.basename(audio)}.wav")
+            logger.info(f"Extracting audio from video file: {audio}")
+            ffmpeg_command = [
+                "ffmpeg", "-i", audio, "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
+                extracted_audio_path, "-y"
+            ]
+            try:
+                subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
+                logger.info(f"Audio extracted to: {extracted_audio_path}")
+                audio_to_process = extracted_audio_path
+            except subprocess.CalledProcessError as e:
+                logger.error(f"FFmpeg error: {e.stderr}")
+                raise RuntimeError(f"Failed to extract audio from video: {e.stderr}")
+        if isinstance(audio_to_process, tuple):
+            sample_rate, data = audio_to_process
             temp_audio_path = os.path.join("/tmp", "temp_audio.wav")
             scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
+            audio_to_process = temp_audio_path
         if seg_size > 512:
             logger.warning(f"Segment size {seg_size} is large, this may cause issues.")
         override_seg_size = override_seg_size == "True"
         progress(0.2, desc="Loading model...")
         separator.load_model(model_filename=model)
         progress(0.7, desc="Separating audio...")
+        separation = separator.separate(audio_to_process)
         stems = [os.path.join(output_dir, file_name) for file_name in separation]
         file_list = []
         if exclude_stems.strip():
         if temp_audio_path and os.path.exists(temp_audio_path):
             os.remove(temp_audio_path)
             logger.info(f"Temporary file deleted: {temp_audio_path}")
+        if extracted_audio_path and os.path.exists(extracted_audio_path):
+            os.remove(extracted_audio_path)
+            logger.info(f"Extracted audio file deleted: {extracted_audio_path}")
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             logger.info("GPU memory cleared")
 @spaces.GPU
 def auto_ensemble_process(audio, model_keys, state, seg_size=64, overlap=0.1, out_format="wav", use_tta="False", model_dir="/tmp/audio-separator-models/", output_dir="output", norm_thresh=0.9, amp_thresh=0.9, batch_size=1, ensemble_method="avg_wave", exclude_stems="", weights_str="", progress=gr.Progress(track_tqdm=True)):
     temp_audio_path = None
+    extracted_audio_path = None
     start_time = time.time()
     try:
         if not audio:
             logger.warning(f"Selected {len(model_keys)} models, limiting to {max_models}.")
             model_keys = model_keys[:max_models]
+        # Giriş dosyasının uzantısını kontrol et
+        file_extension = os.path.splitext(audio)[1].lower().lstrip('.')
+        supported_video_formats = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm', 'mpeg', 'mpg']
+        is_video = file_extension in supported_video_formats
+        # Eğer giriş bir video dosyasıysa, sesi çıkar
+        audio_to_process = audio
+        if is_video:
+            extracted_audio_path = os.path.join("/tmp", f"extracted_audio_{os.path.basename(audio)}.wav")
+            logger.info(f"Extracting audio from video file: {audio}")
+            ffmpeg_command = [
+                "ffmpeg", "-i", audio, "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
+                extracted_audio_path, "-y"
+            ]
+            try:
+                subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
+                logger.info(f"Audio extracted to: {extracted_audio_path}")
+                audio_to_process = extracted_audio_path
+            except subprocess.CalledProcessError as e:
+                logger.error(f"FFmpeg error: {e.stderr}")
+                raise RuntimeError(f"Failed to extract audio from video: {e.stderr}")
         # Audio süresine göre dinamik batch size
+        audio_data, sr = librosa.load(audio_to_process, sr=None, mono=False)
         duration = librosa.get_duration(y=audio_data, sr=sr)
         logger.info(f"Audio duration: {duration:.2f} seconds")
         dynamic_batch_size = max(1, min(4, 1 + int(900 / (duration + 1)) - len(model_keys) // 2))
         logger.info(f"Using batch size: {dynamic_batch_size} for {len(model_keys)} models, duration {duration:.2f}s")
+        if isinstance(audio_to_process, tuple):
+            sample_rate, data = audio_to_process
             temp_audio_path = os.path.join("/tmp", "temp_audio.wav")
             scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
+            audio_to_process = temp_audio_path
         # State kontrolü
         if not state:
                     with gpu_lock:
                         progress(0.3, desc=f"Separating with {model_key}")
                         logger.info(f"Separating with {model_key}")
+                        separation = separator.separate(audio_to_process)
                         stems = [os.path.join(output_dir, file_name) for file_name in separation]
                         result = []
                 logger.info(f"Temporary file deleted: {temp_audio_path}")
             except Exception as e:
                 logger.warning(f"Failed to delete temporary file {temp_audio_path}: {e}")
+        if extracted_audio_path and os.path.exists(extracted_audio_path):
+            try:
+                os.remove(extracted_audio_path)
+                logger.info(f"Extracted audio file deleted: {extracted_audio_path}")
+            except Exception as e:
+                logger.warning(f"Failed to delete extracted audio file {extracted_audio_path}: {e}")
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+            logger.info("GPU memory cleared")
 def update_roformer_models(category):
     """Update Roformer model dropdown based on selected category."""
     choices = list(ROFORMER_MODELS.get(category, {}).keys()) or []
 def create_interface():
     with gr.Blocks(title="🎵 SESA Fast Separation 🎵", css=CSS, elem_id="app-container") as app:
         gr.Markdown("<h1 class='header-text'>🎵 SESA Fast Separation 🎵</h1>")
+        gr.Markdown("**Note**: If YouTube downloads fail, upload a valid cookies file or a local WAV/MP4/MOV file. [Cookie Instructions](https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies)")
+        gr.Markdown("**Tip**: For best results, use audio/video shorter than 15 minutes or fewer models (up to 6) to ensure smooth processing.")
         # Gradio State bileşeni
         ensemble_state = gr.State(value={
             "current_audio": None,
                 with gr.Group(elem_classes="dubbing-theme"):
                     gr.Markdown("### Audio Separation")
                     with gr.Row():
+                        roformer_audio = gr.Audio(label="🎧 Upload Audio/Video", type="filepath", interactive=True)
+                        url_ro = gr.Textbox(label="🔗 Or Paste URL", placeholder="YouTube or audio/video URL", interactive=True)
                         cookies_ro = gr.File(label="🍪 Cookies File", file_types=[".txt"], interactive=True)
                         download_roformer = gr.Button("⬇️ Download", variant="secondary")
                     roformer_download_status = gr.Textbox(label="📢 Download Status", interactive=False)
                     gr.Markdown("### Ensemble Processing")
                     gr.Markdown("Note: If weights are not specified, equal weights (1.0) are applied. Use up to 6 models for best results.")
                     with gr.Row():
+                        ensemble_audio = gr.Audio(label="🎧 Upload Audio/Video", type="filepath", interactive=True)
+                        url_ensemble = gr.Textbox(label="🔗 Or Paste URL", placeholder="YouTube or audio/video URL", interactive=True)
                         cookies_ensemble = gr.File(label="🍪 Cookies File", file_types=[".txt"], interactive=True)
                         download_ensemble = gr.Button("⬇️ Download", variant="secondary")
                     ensemble_download_status = gr.Textbox(label="📢 Download Status", interactive=False)
             fn=auto_ensemble_process,
             inputs=[
                 ensemble_audio, ensemble_models, ensemble_state, ensemble_seg_size, ensemble_overlap,
+                output_format, ensemble_use_tta, model_dir, output_dir,
                 norm_threshold, amp_threshold, batch_size, ensemble_method,
                 ensemble_exclude_stems, ensemble_weights
             ],