transx

Sleeping

App Files Community

sedrukjglfhsdlkf committed on Jan 6

Commit

8b32add

verified ·

1 Parent(s): 1b1b46a

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -35

app.py CHANGED Viewed

@@ -19,7 +19,7 @@ class ModelCache:
     def __init__(self):
         self.whisper = None
         self.translator = None
-        self.ace_step = None
         self.demucs = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -70,16 +70,12 @@ class ModelCache:
             self.demucs.eval()
         return self.demucs
-    def load_ace_step(self):
-        if self.ace_step is None:
-            logger.info("Loading ACE-Step...")
-            try:
-                from ACE_Step import ACEStepModel
-                self.ace_step = ACEStepModel.from_pretrained("ace-step/ACE-Step").to(self.device)
-            except Exception as e:
-                logger.error(f"ACE-Step not available: {e}")
-                self.ace_step = None
-        return self.ace_step
 cache = ModelCache()
@@ -172,30 +168,23 @@ def enhance_vocals(
     inference_steps: int,
     progress=gr.Progress()
 ) -> Optional[str]:
-    progress(0.1, desc="Loading ACE-Step...")
-    model = cache.load_ace_step()
     if model is None:
-        logger.warning("ACE-Step not available, returning original vocals")
         return vocal_path
-    progress(0.3, desc="Loading audio...")
-    audio, sr = librosa.load(vocal_path, sr=24000)
-    audio_tensor = torch.from_numpy(audio).unsqueeze(0).to(cache.device)
     progress(0.5, desc="Generating enhanced vocals...")
-    output_audio = model.generate(
-        audio=audio_tensor,
         text=new_lyrics,
-        voice_prompt=voice_prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=inference_steps
     )
-    progress(0.9, desc="Exporting audio...")
-    output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_enhanced.wav").name
-    sf.write(output_path, output_audio.cpu().numpy().squeeze(), sr)
     progress(1.0, desc="Enhancement complete!")
     return output_path
@@ -334,13 +323,11 @@ def process_full_pipeline(
             f"❌ Error: {str(e)}",
             "", "", "", None, None, None, None
         )
-    finally:
-        pass
 with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as demo:
     gr.Markdown("""
     # 🎤 Professional Song Voice Translator
-    ### Translate songs while preserving your voice using ACE-Step
     """)
     with gr.Tabs():
@@ -350,8 +337,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as
                     gr.Markdown("### 📤 Input")
                     audio_input = gr.Audio(
                         label="Upload Song",
-                        type="filepath",
-                        format="wav"
                     )
                     gr.Markdown("### 🌍 Languages")
@@ -424,7 +410,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as
                         label="Model"
                     )
-            gr.Markdown("#### Voice Enhancement (ACE-Step)")
             voice_prompt = gr.Textbox(
                 label="Voice Style Prompt",
                 value="clear vocals, same voice style, natural singing",
@@ -503,7 +489,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as
             1. **Separation**: Extracts vocals and instrumental using Demucs
             2. **Transcription**: Converts vocals to text using Whisper
             3. **Translation**: Translates lyrics to target language
-            4. **Enhancement**: Regenerates vocals with ACE-Step preserving your voice
             5. **Alignment**: Matches timing to original audio
             6. **Mixing**: Combines enhanced vocals with original instrumental
@@ -539,4 +525,4 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         share=False

     def __init__(self):
         self.whisper = None
         self.translator = None
+        self.tts = None
         self.demucs = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
             self.demucs.eval()
         return self.demucs
+    def load_tts(self):
+        if self.tts is None:
+            logger.info("Loading TTS for voice cloning...")
+            from TTS.api import TTS
+            self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
+        return self.tts
 cache = ModelCache()
     inference_steps: int,
     progress=gr.Progress()
 ) -> Optional[str]:
+    progress(0.1, desc="Loading TTS...")
+    model = cache.load_tts()
     if model is None:
+        logger.warning("TTS not available, returning original vocals")
         return vocal_path
     progress(0.5, desc="Generating enhanced vocals...")
+    output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_enhanced.wav").name
+    model.tts_to_file(
         text=new_lyrics,
+        file_path=output_path,
+        speaker_wav=vocal_path,
+        language="en"
     )
     progress(1.0, desc="Enhancement complete!")
     return output_path
             f"❌ Error: {str(e)}",
             "", "", "", None, None, None, None
         )
 with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as demo:
     gr.Markdown("""
     # 🎤 Professional Song Voice Translator
+    ### Translate songs while preserving your voice using TTS
     """)
     with gr.Tabs():
                     gr.Markdown("### 📤 Input")
                     audio_input = gr.Audio(
                         label="Upload Song",
+                        type="filepath"
                     )
                     gr.Markdown("### 🌍 Languages")
                         label="Model"
                     )
+            gr.Markdown("#### Voice Enhancement (TTS)")
             voice_prompt = gr.Textbox(
                 label="Voice Style Prompt",
                 value="clear vocals, same voice style, natural singing",
             1. **Separation**: Extracts vocals and instrumental using Demucs
             2. **Transcription**: Converts vocals to text using Whisper
             3. **Translation**: Translates lyrics to target language
+            4. **Enhancement**: Regenerates vocals with TTS preserving your voice
             5. **Alignment**: Matches timing to original audio
             6. **Mixing**: Combines enhanced vocals with original instrumental
         server_name="0.0.0.0",
         server_port=7860,
         share=False
+    )