ASesYusuf1 committed on
Commit
a3090f7
·
verified ·
1 Parent(s): 799e841

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -19
app.py CHANGED
@@ -23,6 +23,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
23
  from threading import Lock
24
  import scipy.io.wavfile
25
  import spaces
 
26
 
27
  # Logging setup
28
  logging.basicConfig(level=logging.INFO)
@@ -155,7 +156,7 @@ ROFORMER_MODELS = {
155
 
156
  OUTPUT_FORMATS = ['wav', 'flac', 'mp3', 'ogg', 'opus', 'm4a', 'aiff', 'ac3']
157
 
158
- # CSS (unchanged)
159
  CSS = """
160
  body {
161
  background: linear-gradient(to bottom, rgba(45, 11, 11, 0.9), rgba(0, 0, 0, 0.8)), url('/content/logo.jpg') no-repeat center center fixed;
@@ -383,12 +384,36 @@ def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, p
383
  if not audio:
384
  raise ValueError("No audio file provided.")
385
  temp_audio_path = None
 
386
  try:
387
- if isinstance(audio, tuple):
388
- sample_rate, data = audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  temp_audio_path = os.path.join("/tmp", "temp_audio.wav")
390
  scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
391
- audio = temp_audio_path
 
392
  if seg_size > 512:
393
  logger.warning(f"Segment size {seg_size} is large, this may cause issues.")
394
  override_seg_size = override_seg_size == "True"
@@ -416,7 +441,7 @@ def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, p
416
  progress(0.2, desc="Loading model...")
417
  separator.load_model(model_filename=model)
418
  progress(0.7, desc="Separating audio...")
419
- separation = separator.separate(audio)
420
  stems = [os.path.join(output_dir, file_name) for file_name in separation]
421
  file_list = []
422
  if exclude_stems.strip():
@@ -437,6 +462,9 @@ def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, p
437
  if temp_audio_path and os.path.exists(temp_audio_path):
438
  os.remove(temp_audio_path)
439
  logger.info(f"Temporary file deleted: {temp_audio_path}")
 
 
 
440
  if torch.cuda.is_available():
441
  torch.cuda.empty_cache()
442
  logger.info("GPU memory cleared")
@@ -444,6 +472,7 @@ def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, p
444
  @spaces.GPU
445
  def auto_ensemble_process(audio, model_keys, state, seg_size=64, overlap=0.1, out_format="wav", use_tta="False", model_dir="/tmp/audio-separator-models/", output_dir="output", norm_thresh=0.9, amp_thresh=0.9, batch_size=1, ensemble_method="avg_wave", exclude_stems="", weights_str="", progress=gr.Progress(track_tqdm=True)):
446
  temp_audio_path = None
 
447
  start_time = time.time()
448
  try:
449
  if not audio:
@@ -454,18 +483,40 @@ def auto_ensemble_process(audio, model_keys, state, seg_size=64, overlap=0.1, ou
454
  logger.warning(f"Selected {len(model_keys)} models, limiting to {max_models}.")
455
  model_keys = model_keys[:max_models]
456
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
  # Audio süresine göre dinamik batch size
458
- audio_data, sr = librosa.load(audio, sr=None, mono=False)
459
  duration = librosa.get_duration(y=audio_data, sr=sr)
460
  logger.info(f"Audio duration: {duration:.2f} seconds")
461
  dynamic_batch_size = max(1, min(4, 1 + int(900 / (duration + 1)) - len(model_keys) // 2))
462
  logger.info(f"Using batch size: {dynamic_batch_size} for {len(model_keys)} models, duration {duration:.2f}s")
463
 
464
- if isinstance(audio, tuple):
465
- sample_rate, data = audio
466
  temp_audio_path = os.path.join("/tmp", "temp_audio.wav")
467
  scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
468
- audio = temp_audio_path
469
 
470
  # State kontrolü
471
  if not state:
@@ -607,7 +658,7 @@ def auto_ensemble_process(audio, model_keys, state, seg_size=64, overlap=0.1, ou
607
  with gpu_lock:
608
  progress(0.3, desc=f"Separating with {model_key}")
609
  logger.info(f"Separating with {model_key}")
610
- separation = separator.separate(audio)
611
  stems = [os.path.join(output_dir, file_name) for file_name in separation]
612
  result = []
613
 
@@ -674,10 +725,16 @@ def auto_ensemble_process(audio, model_keys, state, seg_size=64, overlap=0.1, ou
674
  logger.info(f"Temporary file deleted: {temp_audio_path}")
675
  except Exception as e:
676
  logger.warning(f"Failed to delete temporary file {temp_audio_path}: {e}")
 
 
 
 
 
 
677
  if torch.cuda.is_available():
678
  torch.cuda.empty_cache()
679
- logger.info("GPU memory cleared")
680
-
681
  def update_roformer_models(category):
682
  """Update Roformer model dropdown based on selected category."""
683
  choices = list(ROFORMER_MODELS.get(category, {}).keys()) or []
@@ -697,8 +754,8 @@ def download_audio_wrapper(url, cookie_file):
697
  def create_interface():
698
  with gr.Blocks(title="🎵 SESA Fast Separation 🎵", css=CSS, elem_id="app-container") as app:
699
  gr.Markdown("<h1 class='header-text'>🎵 SESA Fast Separation 🎵</h1>")
700
- gr.Markdown("**Note**: If YouTube downloads fail, upload a valid cookies file or a local WAV file. [Cookie Instructions](https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies)")
701
- gr.Markdown("**Tip**: For best results, use audio shorter than 15 minutes or fewer models (up to 6) to ensure smooth processing.")
702
  # Gradio State bileşeni
703
  ensemble_state = gr.State(value={
704
  "current_audio": None,
@@ -720,8 +777,8 @@ def create_interface():
720
  with gr.Group(elem_classes="dubbing-theme"):
721
  gr.Markdown("### Audio Separation")
722
  with gr.Row():
723
- roformer_audio = gr.Audio(label="🎧 Upload Audio", type="filepath", interactive=True)
724
- url_ro = gr.Textbox(label="🔗 Or Paste URL", placeholder="YouTube or audio URL", interactive=True)
725
  cookies_ro = gr.File(label="🍪 Cookies File", file_types=[".txt"], interactive=True)
726
  download_roformer = gr.Button("⬇️ Download", variant="secondary")
727
  roformer_download_status = gr.Textbox(label="📢 Download Status", interactive=False)
@@ -745,8 +802,8 @@ def create_interface():
745
  gr.Markdown("### Ensemble Processing")
746
  gr.Markdown("Note: If weights are not specified, equal weights (1.0) are applied. Use up to 6 models for best results.")
747
  with gr.Row():
748
- ensemble_audio = gr.Audio(label="🎧 Upload Audio", type="filepath", interactive=True)
749
- url_ensemble = gr.Textbox(label="🔗 Or Paste URL", placeholder="YouTube or audio URL", interactive=True)
750
  cookies_ensemble = gr.File(label="🍪 Cookies File", file_types=[".txt"], interactive=True)
751
  download_ensemble = gr.Button("⬇️ Download", variant="secondary")
752
  ensemble_download_status = gr.Textbox(label="📢 Download Status", interactive=False)
@@ -790,7 +847,7 @@ def create_interface():
790
  fn=auto_ensemble_process,
791
  inputs=[
792
  ensemble_audio, ensemble_models, ensemble_state, ensemble_seg_size, ensemble_overlap,
793
- output_format, ensemble_use_tta, model_file_dir, output_dir,
794
  norm_threshold, amp_threshold, batch_size, ensemble_method,
795
  ensemble_exclude_stems, ensemble_weights
796
  ],
 
23
  from threading import Lock
24
  import scipy.io.wavfile
25
  import spaces
26
+ import subprocess
27
 
28
  # Logging setup
29
  logging.basicConfig(level=logging.INFO)
 
156
 
157
  OUTPUT_FORMATS = ['wav', 'flac', 'mp3', 'ogg', 'opus', 'm4a', 'aiff', 'ac3']
158
 
159
+ # CSS (değişmedi)
160
  CSS = """
161
  body {
162
  background: linear-gradient(to bottom, rgba(45, 11, 11, 0.9), rgba(0, 0, 0, 0.8)), url('/content/logo.jpg') no-repeat center center fixed;
 
384
  if not audio:
385
  raise ValueError("No audio file provided.")
386
  temp_audio_path = None
387
+ extracted_audio_path = None
388
  try:
389
+ # Giriş dosyasının uzantısını kontrol et
390
+ file_extension = os.path.splitext(audio)[1].lower().lstrip('.')
391
+ supported_video_formats = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm', 'mpeg', 'mpg']
392
+ is_video = file_extension in supported_video_formats
393
+
394
+ # Eğer giriş bir video dosyasıysa, sesi çıkar
395
+ audio_to_process = audio
396
+ if is_video:
397
+ extracted_audio_path = os.path.join("/tmp", f"extracted_audio_{os.path.basename(audio)}.wav")
398
+ logger.info(f"Extracting audio from video file: {audio}")
399
+ ffmpeg_command = [
400
+ "ffmpeg", "-i", audio, "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
401
+ extracted_audio_path, "-y"
402
+ ]
403
+ try:
404
+ subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
405
+ logger.info(f"Audio extracted to: {extracted_audio_path}")
406
+ audio_to_process = extracted_audio_path
407
+ except subprocess.CalledProcessError as e:
408
+ logger.error(f"FFmpeg error: {e.stderr}")
409
+ raise RuntimeError(f"Failed to extract audio from video: {e.stderr}")
410
+
411
+ if isinstance(audio_to_process, tuple):
412
+ sample_rate, data = audio_to_process
413
  temp_audio_path = os.path.join("/tmp", "temp_audio.wav")
414
  scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
415
+ audio_to_process = temp_audio_path
416
+
417
  if seg_size > 512:
418
  logger.warning(f"Segment size {seg_size} is large, this may cause issues.")
419
  override_seg_size = override_seg_size == "True"
 
441
  progress(0.2, desc="Loading model...")
442
  separator.load_model(model_filename=model)
443
  progress(0.7, desc="Separating audio...")
444
+ separation = separator.separate(audio_to_process)
445
  stems = [os.path.join(output_dir, file_name) for file_name in separation]
446
  file_list = []
447
  if exclude_stems.strip():
 
462
  if temp_audio_path and os.path.exists(temp_audio_path):
463
  os.remove(temp_audio_path)
464
  logger.info(f"Temporary file deleted: {temp_audio_path}")
465
+ if extracted_audio_path and os.path.exists(extracted_audio_path):
466
+ os.remove(extracted_audio_path)
467
+ logger.info(f"Extracted audio file deleted: {extracted_audio_path}")
468
  if torch.cuda.is_available():
469
  torch.cuda.empty_cache()
470
  logger.info("GPU memory cleared")
 
472
  @spaces.GPU
473
  def auto_ensemble_process(audio, model_keys, state, seg_size=64, overlap=0.1, out_format="wav", use_tta="False", model_dir="/tmp/audio-separator-models/", output_dir="output", norm_thresh=0.9, amp_thresh=0.9, batch_size=1, ensemble_method="avg_wave", exclude_stems="", weights_str="", progress=gr.Progress(track_tqdm=True)):
474
  temp_audio_path = None
475
+ extracted_audio_path = None
476
  start_time = time.time()
477
  try:
478
  if not audio:
 
483
  logger.warning(f"Selected {len(model_keys)} models, limiting to {max_models}.")
484
  model_keys = model_keys[:max_models]
485
 
486
+ # Giriş dosyasının uzantısını kontrol et
487
+ file_extension = os.path.splitext(audio)[1].lower().lstrip('.')
488
+ supported_video_formats = ['mp4', 'mov', 'avi', 'mkv', 'flv', 'wmv', 'webm', 'mpeg', 'mpg']
489
+ is_video = file_extension in supported_video_formats
490
+
491
+ # Eğer giriş bir video dosyasıysa, sesi çıkar
492
+ audio_to_process = audio
493
+ if is_video:
494
+ extracted_audio_path = os.path.join("/tmp", f"extracted_audio_{os.path.basename(audio)}.wav")
495
+ logger.info(f"Extracting audio from video file: {audio}")
496
+ ffmpeg_command = [
497
+ "ffmpeg", "-i", audio, "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
498
+ extracted_audio_path, "-y"
499
+ ]
500
+ try:
501
+ subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
502
+ logger.info(f"Audio extracted to: {extracted_audio_path}")
503
+ audio_to_process = extracted_audio_path
504
+ except subprocess.CalledProcessError as e:
505
+ logger.error(f"FFmpeg error: {e.stderr}")
506
+ raise RuntimeError(f"Failed to extract audio from video: {e.stderr}")
507
+
508
  # Audio süresine göre dinamik batch size
509
+ audio_data, sr = librosa.load(audio_to_process, sr=None, mono=False)
510
  duration = librosa.get_duration(y=audio_data, sr=sr)
511
  logger.info(f"Audio duration: {duration:.2f} seconds")
512
  dynamic_batch_size = max(1, min(4, 1 + int(900 / (duration + 1)) - len(model_keys) // 2))
513
  logger.info(f"Using batch size: {dynamic_batch_size} for {len(model_keys)} models, duration {duration:.2f}s")
514
 
515
+ if isinstance(audio_to_process, tuple):
516
+ sample_rate, data = audio_to_process
517
  temp_audio_path = os.path.join("/tmp", "temp_audio.wav")
518
  scipy.io.wavfile.write(temp_audio_path, sample_rate, data)
519
+ audio_to_process = temp_audio_path
520
 
521
  # State kontrolü
522
  if not state:
 
658
  with gpu_lock:
659
  progress(0.3, desc=f"Separating with {model_key}")
660
  logger.info(f"Separating with {model_key}")
661
+ separation = separator.separate(audio_to_process)
662
  stems = [os.path.join(output_dir, file_name) for file_name in separation]
663
  result = []
664
 
 
725
  logger.info(f"Temporary file deleted: {temp_audio_path}")
726
  except Exception as e:
727
  logger.warning(f"Failed to delete temporary file {temp_audio_path}: {e}")
728
+ if extracted_audio_path and os.path.exists(extracted_audio_path):
729
+ try:
730
+ os.remove(extracted_audio_path)
731
+ logger.info(f"Extracted audio file deleted: {extracted_audio_path}")
732
+ except Exception as e:
733
+ logger.warning(f"Failed to delete extracted audio file {extracted_audio_path}: {e}")
734
  if torch.cuda.is_available():
735
  torch.cuda.empty_cache()
736
+ logger.info("GPU memory cleared")
737
+
738
  def update_roformer_models(category):
739
  """Update Roformer model dropdown based on selected category."""
740
  choices = list(ROFORMER_MODELS.get(category, {}).keys()) or []
 
754
  def create_interface():
755
  with gr.Blocks(title="🎵 SESA Fast Separation 🎵", css=CSS, elem_id="app-container") as app:
756
  gr.Markdown("<h1 class='header-text'>🎵 SESA Fast Separation 🎵</h1>")
757
+ gr.Markdown("**Note**: If YouTube downloads fail, upload a valid cookies file or a local WAV/MP4/MOV file. [Cookie Instructions](https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies)")
758
+ gr.Markdown("**Tip**: For best results, use audio/video shorter than 15 minutes or fewer models (up to 6) to ensure smooth processing.")
759
  # Gradio State bileşeni
760
  ensemble_state = gr.State(value={
761
  "current_audio": None,
 
777
  with gr.Group(elem_classes="dubbing-theme"):
778
  gr.Markdown("### Audio Separation")
779
  with gr.Row():
780
+ roformer_audio = gr.Audio(label="🎧 Upload Audio/Video", type="filepath", interactive=True)
781
+ url_ro = gr.Textbox(label="🔗 Or Paste URL", placeholder="YouTube or audio/video URL", interactive=True)
782
  cookies_ro = gr.File(label="🍪 Cookies File", file_types=[".txt"], interactive=True)
783
  download_roformer = gr.Button("⬇️ Download", variant="secondary")
784
  roformer_download_status = gr.Textbox(label="📢 Download Status", interactive=False)
 
802
  gr.Markdown("### Ensemble Processing")
803
  gr.Markdown("Note: If weights are not specified, equal weights (1.0) are applied. Use up to 6 models for best results.")
804
  with gr.Row():
805
+ ensemble_audio = gr.Audio(label="🎧 Upload Audio/Video", type="filepath", interactive=True)
806
+ url_ensemble = gr.Textbox(label="🔗 Or Paste URL", placeholder="YouTube or audio/video URL", interactive=True)
807
  cookies_ensemble = gr.File(label="🍪 Cookies File", file_types=[".txt"], interactive=True)
808
  download_ensemble = gr.Button("⬇️ Download", variant="secondary")
809
  ensemble_download_status = gr.Textbox(label="📢 Download Status", interactive=False)
 
847
  fn=auto_ensemble_process,
848
  inputs=[
849
  ensemble_audio, ensemble_models, ensemble_state, ensemble_seg_size, ensemble_overlap,
850
+ output_format, ensemble_use_tta, model_dir, output_dir,
851
  norm_threshold, amp_threshold, batch_size, ensemble_method,
852
  ensemble_exclude_stems, ensemble_weights
853
  ],