Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,7 +19,7 @@ class ModelCache:
|
|
| 19 |
def __init__(self):
|
| 20 |
self.whisper = None
|
| 21 |
self.translator = None
|
| 22 |
-
self.
|
| 23 |
self.demucs = None
|
| 24 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 25 |
|
|
@@ -70,16 +70,12 @@ class ModelCache:
|
|
| 70 |
self.demucs.eval()
|
| 71 |
return self.demucs
|
| 72 |
|
| 73 |
-
def
|
| 74 |
-
if self.
|
| 75 |
-
logger.info("Loading
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
except Exception as e:
|
| 80 |
-
logger.error(f"ACE-Step not available: {e}")
|
| 81 |
-
self.ace_step = None
|
| 82 |
-
return self.ace_step
|
| 83 |
|
| 84 |
cache = ModelCache()
|
| 85 |
|
|
@@ -172,30 +168,23 @@ def enhance_vocals(
|
|
| 172 |
inference_steps: int,
|
| 173 |
progress=gr.Progress()
|
| 174 |
) -> Optional[str]:
|
| 175 |
-
progress(0.1, desc="Loading
|
| 176 |
-
model = cache.
|
| 177 |
|
| 178 |
if model is None:
|
| 179 |
-
logger.warning("
|
| 180 |
return vocal_path
|
| 181 |
|
| 182 |
-
progress(0.3, desc="Loading audio...")
|
| 183 |
-
audio, sr = librosa.load(vocal_path, sr=24000)
|
| 184 |
-
audio_tensor = torch.from_numpy(audio).unsqueeze(0).to(cache.device)
|
| 185 |
-
|
| 186 |
progress(0.5, desc="Generating enhanced vocals...")
|
| 187 |
-
|
| 188 |
-
|
|
|
|
| 189 |
text=new_lyrics,
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
)
|
| 194 |
|
| 195 |
-
progress(0.9, desc="Exporting audio...")
|
| 196 |
-
output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_enhanced.wav").name
|
| 197 |
-
sf.write(output_path, output_audio.cpu().numpy().squeeze(), sr)
|
| 198 |
-
|
| 199 |
progress(1.0, desc="Enhancement complete!")
|
| 200 |
return output_path
|
| 201 |
|
|
@@ -334,13 +323,11 @@ def process_full_pipeline(
|
|
| 334 |
f"❌ Error: {str(e)}",
|
| 335 |
"", "", "", None, None, None, None
|
| 336 |
)
|
| 337 |
-
finally:
|
| 338 |
-
pass
|
| 339 |
|
| 340 |
with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as demo:
|
| 341 |
gr.Markdown("""
|
| 342 |
# 🎤 Professional Song Voice Translator
|
| 343 |
-
### Translate songs while preserving your voice using
|
| 344 |
""")
|
| 345 |
|
| 346 |
with gr.Tabs():
|
|
@@ -350,8 +337,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as
|
|
| 350 |
gr.Markdown("### 📤 Input")
|
| 351 |
audio_input = gr.Audio(
|
| 352 |
label="Upload Song",
|
| 353 |
-
type="filepath"
|
| 354 |
-
format="wav"
|
| 355 |
)
|
| 356 |
|
| 357 |
gr.Markdown("### 🌍 Languages")
|
|
@@ -424,7 +410,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as
|
|
| 424 |
label="Model"
|
| 425 |
)
|
| 426 |
|
| 427 |
-
gr.Markdown("#### Voice Enhancement (
|
| 428 |
voice_prompt = gr.Textbox(
|
| 429 |
label="Voice Style Prompt",
|
| 430 |
value="clear vocals, same voice style, natural singing",
|
|
@@ -503,7 +489,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as
|
|
| 503 |
1. **Separation**: Extracts vocals and instrumental using Demucs
|
| 504 |
2. **Transcription**: Converts vocals to text using Whisper
|
| 505 |
3. **Translation**: Translates lyrics to target language
|
| 506 |
-
4. **Enhancement**: Regenerates vocals with
|
| 507 |
5. **Alignment**: Matches timing to original audio
|
| 508 |
6. **Mixing**: Combines enhanced vocals with original instrumental
|
| 509 |
|
|
@@ -539,4 +525,4 @@ if __name__ == "__main__":
|
|
| 539 |
server_name="0.0.0.0",
|
| 540 |
server_port=7860,
|
| 541 |
share=False
|
| 542 |
-
|
|
|
|
| 19 |
def __init__(self):
|
| 20 |
self.whisper = None
|
| 21 |
self.translator = None
|
| 22 |
+
self.tts = None
|
| 23 |
self.demucs = None
|
| 24 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 25 |
|
|
|
|
| 70 |
self.demucs.eval()
|
| 71 |
return self.demucs
|
| 72 |
|
| 73 |
+
def load_tts(self):
    """Return the cached XTTS v2 model, loading it on first use.

    The model is created once, moved to ``self.device`` and stored on
    ``self.tts``; later calls return the stored instance.

    Returns:
        The loaded TTS model, or ``None`` when the TTS package or the
        model could not be loaded. Callers test for ``None`` and fall
        back to the original vocals, so a load failure must not raise.
    """
    if self.tts is None:
        try:
            logger.info("Loading TTS for voice cloning...")
            # Imported lazily so the app can still start without TTS.
            from TTS.api import TTS
            self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
        except Exception as e:
            # Best-effort load: report the failure and leave the slot
            # empty so the next call retries instead of crashing here.
            logger.error(f"TTS not available: {e}")
            self.tts = None
    return self.tts
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
cache = ModelCache()
|
| 81 |
|
|
|
|
| 168 |
inference_steps: int,
|
| 169 |
progress=gr.Progress()
|
| 170 |
) -> Optional[str]:
|
| 171 |
+
progress(0.1, desc="Loading TTS...")
|
| 172 |
+
model = cache.load_tts()
|
| 173 |
|
| 174 |
if model is None:
|
| 175 |
+
logger.warning("TTS not available, returning original vocals")
|
| 176 |
return vocal_path
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
progress(0.5, desc="Generating enhanced vocals...")
|
| 179 |
+
output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_enhanced.wav").name
|
| 180 |
+
|
| 181 |
+
model.tts_to_file(
|
| 182 |
text=new_lyrics,
|
| 183 |
+
file_path=output_path,
|
| 184 |
+
speaker_wav=vocal_path,
|
| 185 |
+
language="en"
|
| 186 |
)
|
| 187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
progress(1.0, desc="Enhancement complete!")
|
| 189 |
return output_path
|
| 190 |
|
|
|
|
| 323 |
f"❌ Error: {str(e)}",
|
| 324 |
"", "", "", None, None, None, None
|
| 325 |
)
|
|
|
|
|
|
|
| 326 |
|
| 327 |
with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as demo:
|
| 328 |
gr.Markdown("""
|
| 329 |
# 🎤 Professional Song Voice Translator
|
| 330 |
+
### Translate songs while preserving your voice using TTS
|
| 331 |
""")
|
| 332 |
|
| 333 |
with gr.Tabs():
|
|
|
|
| 337 |
gr.Markdown("### 📤 Input")
|
| 338 |
audio_input = gr.Audio(
|
| 339 |
label="Upload Song",
|
| 340 |
+
type="filepath"
|
|
|
|
| 341 |
)
|
| 342 |
|
| 343 |
gr.Markdown("### 🌍 Languages")
|
|
|
|
| 410 |
label="Model"
|
| 411 |
)
|
| 412 |
|
| 413 |
+
gr.Markdown("#### Voice Enhancement (TTS)")
|
| 414 |
voice_prompt = gr.Textbox(
|
| 415 |
label="Voice Style Prompt",
|
| 416 |
value="clear vocals, same voice style, natural singing",
|
|
|
|
| 489 |
1. **Separation**: Extracts vocals and instrumental using Demucs
|
| 490 |
2. **Transcription**: Converts vocals to text using Whisper
|
| 491 |
3. **Translation**: Translates lyrics to target language
|
| 492 |
+
4. **Enhancement**: Regenerates vocals with TTS preserving your voice
|
| 493 |
5. **Alignment**: Matches timing to original audio
|
| 494 |
6. **Mixing**: Combines enhanced vocals with original instrumental
|
| 495 |
|
|
|
|
| 525 |
server_name="0.0.0.0",
|
| 526 |
server_port=7860,
|
| 527 |
share=False
|
| 528 |
+
)
|