Update app.py

app.py CHANGED
@@ -7,6 +7,10 @@ import torch
 import soundfile as sf
 import gradio as gr
 from pathlib import Path
+import librosa
+from transformers import pipeline
+from demucs.pretrained import get_model
+from demucs.apply import apply_model
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
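Of the imports hoisted to module level in this hunk, librosa is the only one whose call site never appears in the visible hunks; given the Pitch Shift slider (semitones, -12 to 12) in the UI further down, it presumably backs librosa.effects.pitch_shift on the synthesized vocal. A minimal sketch of that presumed step, with placeholder paths:

import librosa
import soundfile as sf

# Presumed use of the hoisted librosa import: shift the synthesized
# vocal by the slider's semitone value. Paths are placeholders.
y, sr = librosa.load("tts_output.wav", sr=None)             # keep native rate
shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=3)  # +3 semitones
sf.write("tts_shifted.wav", shifted, sr)

Hoisting also changes failure behavior: a missing dependency now fails at import time with a clear traceback instead of midway through a request.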
@@ -21,18 +25,13 @@ try:
 except ImportError:
     pass
 except Exception as e:
-    logger.warning(f"
-
-from transformers import pipeline
-from demucs.pretrained import get_model
-from demucs.apply import apply_model
-import librosa
+    logger.warning(f"{e}")
 
 class ProcessingManager:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.models = {}
-        self.temp_dir = Path(tempfile.gettempdir()) / "
+        self.temp_dir = Path(tempfile.gettempdir()) / "voice_mask_pro"
         self.temp_dir.mkdir(exist_ok=True)
 
     def get_whisper(self, model_size="large-v3"):
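The body of get_whisper sits outside the diff; from the pipeline import added above and the self.models cache initialized here, a loader in this style would look roughly as follows. The checkpoint name and kwargs are assumptions, not the Space's actual code:

import torch
from transformers import pipeline

# Sketch of a cached Whisper loader in ProcessingManager's style;
# the checkpoint and kwargs are assumed, not taken from the diff.
class WhisperLoader:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.models = {}

    def get_whisper(self, model_size="large-v3"):
        key = f"whisper_{model_size}"
        if key not in self.models:  # load once per process, then reuse
            self.models[key] = pipeline(
                "automatic-speech-recognition",
                model=f"openai/whisper-{model_size}",
                device=self.device,
                chunk_length_s=30,  # chunk long songs to bound memory
            )
        return self.models[key]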
@@ -46,23 +45,6 @@ class ProcessingManager:
             )
         return self.models[key]
 
-    def get_translator(self, src, tgt):
-        key = f"trans_{src}_{tgt}"
-        if key not in self.models:
-            try:
-                model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
-                self.models[key] = pipeline("translation", model=model_name, device=self.device)
-            except Exception:
-                # Fall back to NLLB if the language pair does not exist on Helsinki-NLP
-                self.models[key] = pipeline(
-                    "translation",
-                    model="facebook/nllb-200-distilled-600M",
-                    device=self.device,
-                    src_lang=f"{src}_Latn",
-                    tgt_lang=f"{tgt}_Latn"
-                )
-        return self.models[key]
-
     def get_demucs(self):
         if "demucs" not in self.models:
             self.models["demucs"] = get_model("htdemucs")
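One observation on the removed fallback: NLLB-200 checkpoints expect FLORES-200 language codes, so building them as f"{src}_Latn" from two-letter ISO codes yields invalid codes ("en_Latn" instead of "eng_Latn"), and Japanese is not Latin-script at all. Had the translator stayed, the mapping would have to be explicit; an illustrative corrected version:

from transformers import pipeline

# Illustrative fix for the removed fallback: NLLB-200 wants FLORES-200
# codes, not f"{iso2}_Latn". The dict covers the UI's language choices.
FLORES = {
    "en": "eng_Latn", "es": "spa_Latn", "fr": "fra_Latn",
    "it": "ita_Latn", "de": "deu_Latn", "pt": "por_Latn",
    "ja": "jpn_Jpan",
}

translator = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    src_lang=FLORES["en"],
    tgt_lang=FLORES["es"],
)
print(translator("Hello world")[0]["translation_text"])

Deleting the method outright is the cleaner resolution here, since the new design no longer translates.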
@@ -78,8 +60,7 @@ manager = ProcessingManager()
 
 def process_audio_pipeline(
     audio_path,
-    src_lang,
-    tgt_lang,
+    language,
     speaker_ref_path,
     voice_cleanup_slider,
     pitch_shift,
@@ -88,6 +69,9 @@ def process_audio_pipeline(
     try:
         if not audio_path:
             raise ValueError("No audio file provided")
+
+        if not speaker_ref_path:
+            raise ValueError("Reference voice (MP3) is required")
 
         progress(0.1, desc="Separating Vocals...")
         demucs_model = manager.get_demucs()
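The separation step that produces vocals and instrumental (written out at the top of the next hunk) falls between hunks. A minimal sketch of how htdemucs stems are typically split via the apply_model import, assuming demucs v4 conventions; the Space's exact loading and normalization may differ:

import torch
import torchaudio
from demucs.pretrained import get_model
from demucs.apply import apply_model

model = get_model("htdemucs")          # stems: drums, bass, other, vocals
wav, sr = torchaudio.load("song.wav")  # (channels, samples)
if wav.shape[0] == 1:                  # htdemucs is trained on stereo
    wav = wav.repeat(2, 1)
wav = torchaudio.functional.resample(wav, sr, model.samplerate)

with torch.no_grad():
    # apply_model expects a batch dimension: (1, channels, samples)
    sources = apply_model(model, wav[None], device="cpu")[0]

stems = dict(zip(model.sources, sources))
vocals = stems["vocals"]
# Everything that is not vocals becomes the instrumental bed.
instrumental = sum(v for k, v in stems.items() if k != "vocals")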
@@ -109,26 +93,20 @@ def process_audio_pipeline(
         sf.write(vocal_path, vocals.T, 44100)
         sf.write(inst_path, instrumental.T, 44100)
 
-        progress(0.
+        progress(0.4, desc="Transcribing...")
         whisper = manager.get_whisper()
-        transcription = whisper(str(vocal_path), generate_kwargs={"task": "transcribe", "language": src_lang})
+        transcription = whisper(str(vocal_path), generate_kwargs={"task": "transcribe", "language": language})
         original_text = transcription["text"]
 
-        progress(0.
-        translator = manager.get_translator(src_lang, tgt_lang)
-        trans_output = translator(original_text)
-        translated_text = trans_output[0]['translation_text'] if isinstance(trans_output, list) else trans_output['translation_text']
-
-        progress(0.7, desc="Synthesizing Vocals...")
+        progress(0.6, desc="Synthesizing with Reference Voice...")
         tts_model = manager.get_tts()
 
-        ref_audio = speaker_ref_path if speaker_ref_path else str(vocal_path)
         output_tts_path = manager.temp_dir / "tts_output.wav"
 
         tts_model.tts_to_file(
-            text=translated_text,
-            speaker_wav=ref_audio,
-            language=tgt_lang,
+            text=original_text,
+            speaker_wav=speaker_ref_path,
+            language=language,
             file_path=str(output_tts_path),
             split_sentences=True
         )
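get_tts() is also defined outside the diff, but the speaker_wav/language keyword pair matches Coqui TTS's XTTS voice-cloning API, so that is presumably what it returns. A sketch of the synthesis call under that assumption (the checkpoint name is an assumption; the diff never names it):

from TTS.api import TTS

# Assumed XTTS v2 synthesis call; text and paths are placeholders.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts.tts_to_file(
    text="transcribed lyrics go here",
    speaker_wav="reference_voice.mp3",  # the voice to clone
    language="es",                      # must be an XTTS-supported code
    file_path="tts_output.wav",
    split_sentences=True,               # synthesize sentence by sentence
)

Note the semantic shift in this hunk: the old code fell back to the song's own vocals when no reference was supplied, while the new code requires the reference (enforced by the ValueError added earlier) and re-sings the original, untranslated lyrics in the cloned voice.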
@@ -148,37 +126,34 @@ def process_audio_pipeline(
             str(vocal_path),
             str(inst_path),
             str(output_tts_path),
-            original_text,
-            translated_text
+            original_text
         )
 
     except Exception as e:
         logger.error(f"Pipeline failed: {str(e)}", exc_info=True)
-        return None, None, None, None, f"Error: {str(e)}"
+        return None, None, None, None, f"Error: {str(e)}"
 
 custom_css = """
 .container { max_width: 900px; margin: auto; }
 .gr-box { border-radius: 10px !important; border: 1px solid #e0e0e0; box-shadow: 0 4px 6px rgba(0,0,0,0.05); }
 """
 
-with gr.Blocks(title="AI Song Translator") as demo:
-    gr.Markdown("#
+with gr.Blocks(title="AI Voice Masker") as demo:
+    gr.Markdown("# 🎤 AI Voice Masker")
 
     with gr.Row():
         with gr.Column(scale=1, variant="panel"):
             gr.Markdown("### 1. Input & Settings")
             input_audio = gr.Audio(label="Source Song", type="filepath")
-            ref_audio = gr.Audio(label="
+            ref_audio = gr.Audio(label="Reference Voice (MP3 Required)", type="filepath")
 
-
-            src_lang = gr.Dropdown(["en", "es", "fr", "it", "de", "pt", "ja"], value="en", label="Source")
-            tgt_lang = gr.Dropdown(["en", "es", "fr", "it", "de", "pt", "ja"], value="es", label="Target")
+            language = gr.Dropdown(["en", "es", "fr", "it", "de", "pt", "ja"], value="es", label="Song Language")
 
             with gr.Accordion("Advanced Audio", open=False):
                 cleanup = gr.Slider(0, 1, value=0.5, label="Voice Cleanup")
                 pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift")
 
-            btn_process = gr.Button("🚀 Start
+            btn_process = gr.Button("🚀 Start Masking", variant="primary", size="lg")
 
         with gr.Column(scale=1, variant="panel"):
             gr.Markdown("### 2. Output Results")
@@ -186,18 +161,17 @@ with gr.Blocks(title="AI Song Translator") as demo:
 
             with gr.Tabs():
                 with gr.Tab("Lyrics"):
-                    orig_txt = gr.Textbox(label="
-                    trans_txt = gr.Textbox(label="Translated Lyrics", lines=4, interactive=False)
+                    orig_txt = gr.Textbox(label="Transcribed Lyrics", lines=8, interactive=False)
 
                 with gr.Tab("Stems"):
-                    voc_out = gr.Audio(label="
+                    voc_out = gr.Audio(label="Original Vocals")
                     inst_out = gr.Audio(label="Instrumental")
-                    tts_out = gr.Audio(label="
+                    tts_out = gr.Audio(label="Generated Vocals (Raw)")
 
     btn_process.click(
         fn=process_audio_pipeline,
-        inputs=[input_audio, src_lang, tgt_lang, ref_audio, cleanup, pitch],
-        outputs=[final_output, voc_out, inst_out, tts_out, orig_txt, trans_txt]
+        inputs=[input_audio, language, ref_audio, cleanup, pitch],
+        outputs=[final_output, voc_out, inst_out, tts_out, orig_txt]
     )
 
 if __name__ == "__main__":
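A consistency note on the rewired handler: gr.Blocks assigns the function's return tuple to outputs positionally, so with five output components every code path must return exactly five values. Both new paths do, which is why the error branch's fifth element, the error string, lands in the lyrics textbox. A standalone illustration of that contract (not the Space's code):

import gradio as gr

def run(audio_path):
    # Five outputs are registered below, so both branches return five
    # values; the last slot is a Textbox, so errors surface as text.
    if not audio_path:
        return None, None, None, None, "Error: no audio file provided"
    return audio_path, audio_path, audio_path, audio_path, "lyrics ..."

with gr.Blocks() as demo:
    inp = gr.Audio(label="Input", type="filepath")
    outs = [gr.Audio(), gr.Audio(), gr.Audio(), gr.Audio(), gr.Textbox()]
    gr.Button("Run").click(fn=run, inputs=[inp], outputs=outs)

if __name__ == "__main__":
    demo.launch()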