sedrukjglfhsdlkf committed on
Commit
8b32add
·
verified ·
1 Parent(s): 1b1b46a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -35
app.py CHANGED
@@ -19,7 +19,7 @@ class ModelCache:
19
  def __init__(self):
20
  self.whisper = None
21
  self.translator = None
22
- self.ace_step = None
23
  self.demucs = None
24
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
25
 
@@ -70,16 +70,12 @@ class ModelCache:
70
  self.demucs.eval()
71
  return self.demucs
72
 
73
- def load_ace_step(self):
74
- if self.ace_step is None:
75
- logger.info("Loading ACE-Step...")
76
- try:
77
- from ACE_Step import ACEStepModel
78
- self.ace_step = ACEStepModel.from_pretrained("ace-step/ACE-Step").to(self.device)
79
- except Exception as e:
80
- logger.error(f"ACE-Step not available: {e}")
81
- self.ace_step = None
82
- return self.ace_step
83
 
84
  cache = ModelCache()
85
 
@@ -172,30 +168,23 @@ def enhance_vocals(
172
  inference_steps: int,
173
  progress=gr.Progress()
174
  ) -> Optional[str]:
175
- progress(0.1, desc="Loading ACE-Step...")
176
- model = cache.load_ace_step()
177
 
178
  if model is None:
179
- logger.warning("ACE-Step not available, returning original vocals")
180
  return vocal_path
181
 
182
- progress(0.3, desc="Loading audio...")
183
- audio, sr = librosa.load(vocal_path, sr=24000)
184
- audio_tensor = torch.from_numpy(audio).unsqueeze(0).to(cache.device)
185
-
186
  progress(0.5, desc="Generating enhanced vocals...")
187
- output_audio = model.generate(
188
- audio=audio_tensor,
 
189
  text=new_lyrics,
190
- voice_prompt=voice_prompt,
191
- guidance_scale=guidance_scale,
192
- num_inference_steps=inference_steps
193
  )
194
 
195
- progress(0.9, desc="Exporting audio...")
196
- output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_enhanced.wav").name
197
- sf.write(output_path, output_audio.cpu().numpy().squeeze(), sr)
198
-
199
  progress(1.0, desc="Enhancement complete!")
200
  return output_path
201
 
@@ -334,13 +323,11 @@ def process_full_pipeline(
334
  f"❌ Error: {str(e)}",
335
  "", "", "", None, None, None, None
336
  )
337
- finally:
338
- pass
339
 
340
  with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as demo:
341
  gr.Markdown("""
342
  # 🎤 Professional Song Voice Translator
343
- ### Translate songs while preserving your voice using ACE-Step
344
  """)
345
 
346
  with gr.Tabs():
@@ -350,8 +337,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as
350
  gr.Markdown("### 📤 Input")
351
  audio_input = gr.Audio(
352
  label="Upload Song",
353
- type="filepath",
354
- format="wav"
355
  )
356
 
357
  gr.Markdown("### 🌍 Languages")
@@ -424,7 +410,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as
424
  label="Model"
425
  )
426
 
427
- gr.Markdown("#### Voice Enhancement (ACE-Step)")
428
  voice_prompt = gr.Textbox(
429
  label="Voice Style Prompt",
430
  value="clear vocals, same voice style, natural singing",
@@ -503,7 +489,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as
503
  1. **Separation**: Extracts vocals and instrumental using Demucs
504
  2. **Transcription**: Converts vocals to text using Whisper
505
  3. **Translation**: Translates lyrics to target language
506
- 4. **Enhancement**: Regenerates vocals with ACE-Step preserving your voice
507
  5. **Alignment**: Matches timing to original audio
508
  6. **Mixing**: Combines enhanced vocals with original instrumental
509
 
@@ -539,4 +525,4 @@ if __name__ == "__main__":
539
  server_name="0.0.0.0",
540
  server_port=7860,
541
  share=False
542
-
 
19
  def __init__(self):
20
  self.whisper = None
21
  self.translator = None
22
+ self.tts = None
23
  self.demucs = None
24
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
25
 
 
70
  self.demucs.eval()
71
  return self.demucs
72
 
73
+ def load_tts(self):
74
+ if self.tts is None:
75
+ logger.info("Loading TTS for voice cloning...")
76
+ from TTS.api import TTS
77
+ self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
78
+ return self.tts
 
 
 
 
79
 
80
  cache = ModelCache()
81
 
 
168
  inference_steps: int,
169
  progress=gr.Progress()
170
  ) -> Optional[str]:
171
+ progress(0.1, desc="Loading TTS...")
172
+ model = cache.load_tts()
173
 
174
  if model is None:
175
+ logger.warning("TTS not available, returning original vocals")
176
  return vocal_path
177
 
 
 
 
 
178
  progress(0.5, desc="Generating enhanced vocals...")
179
+ output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_enhanced.wav").name
180
+
181
+ model.tts_to_file(
182
  text=new_lyrics,
183
+ file_path=output_path,
184
+ speaker_wav=vocal_path,
185
+ language="en"
186
  )
187
 
 
 
 
 
188
  progress(1.0, desc="Enhancement complete!")
189
  return output_path
190
 
 
323
  f"❌ Error: {str(e)}",
324
  "", "", "", None, None, None, None
325
  )
 
 
326
 
327
  with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as demo:
328
  gr.Markdown("""
329
  # 🎤 Professional Song Voice Translator
330
+ ### Translate songs while preserving your voice using TTS
331
  """)
332
 
333
  with gr.Tabs():
 
337
  gr.Markdown("### 📤 Input")
338
  audio_input = gr.Audio(
339
  label="Upload Song",
340
+ type="filepath"
 
341
  )
342
 
343
  gr.Markdown("### 🌍 Languages")
 
410
  label="Model"
411
  )
412
 
413
+ gr.Markdown("#### Voice Enhancement (TTS)")
414
  voice_prompt = gr.Textbox(
415
  label="Voice Style Prompt",
416
  value="clear vocals, same voice style, natural singing",
 
489
  1. **Separation**: Extracts vocals and instrumental using Demucs
490
  2. **Transcription**: Converts vocals to text using Whisper
491
  3. **Translation**: Translates lyrics to target language
492
+ 4. **Enhancement**: Regenerates vocals with TTS preserving your voice
493
  5. **Alignment**: Matches timing to original audio
494
  6. **Mixing**: Combines enhanced vocals with original instrumental
495
 
 
525
  server_name="0.0.0.0",
526
  server_port=7860,
527
  share=False
528
+ )