sedrukjglfhsdlkf committed on
Commit
1b42d2d
·
verified ·
1 Parent(s): 8b32add

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -335
app.py CHANGED
@@ -25,7 +25,6 @@ class ModelCache:
25
 
26
  def load_whisper(self, model_size: str = "large-v3"):
27
  if self.whisper is None:
28
- logger.info(f"Loading Whisper {model_size}...")
29
  self.whisper = pipeline(
30
  "automatic-speech-recognition",
31
  model=f"openai/whisper-{model_size}",
@@ -37,7 +36,6 @@ class ModelCache:
37
  def load_translator(self, src: str, tgt: str):
38
  model_key = f"{src}-{tgt}"
39
  if self.translator is None or getattr(self.translator, 'model_key', None) != model_key:
40
- logger.info(f"Loading translator {src} -> {tgt}...")
41
  try:
42
  model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
43
  tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -50,7 +48,6 @@ class ModelCache:
50
  )
51
  self.translator.model_key = model_key
52
  except:
53
- logger.info("Falling back to NLLB...")
54
  self.translator = pipeline(
55
  "translation",
56
  model="facebook/nllb-200-distilled-600M",
@@ -63,7 +60,6 @@ class ModelCache:
63
 
64
  def load_demucs(self, model_name: str = "htdemucs"):
65
  if self.demucs is None:
66
- logger.info(f"Loading Demucs {model_name}...")
67
  from demucs.pretrained import get_model
68
  self.demucs = get_model(model_name)
69
  self.demucs.cpu()
@@ -72,37 +68,24 @@ class ModelCache:
72
 
73
  def load_tts(self):
74
  if self.tts is None:
75
- logger.info("Loading TTS for voice cloning...")
76
  from TTS.api import TTS
77
  self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
78
  return self.tts
79
 
80
  cache = ModelCache()
81
 
82
- def separate_audio(
83
- audio_path: str,
84
- model_name: str = "htdemucs",
85
- progress=gr.Progress()
86
- ) -> Tuple[str, str]:
87
- progress(0.1, desc="Loading separation model...")
88
-
89
  from demucs.apply import apply_model
90
  model = cache.load_demucs(model_name)
91
-
92
- progress(0.3, desc="Loading audio...")
93
  wav, sr = librosa.load(audio_path, sr=44100, mono=False)
94
  wav = torch.from_numpy(wav).float()
95
-
96
  if wav.dim() == 1:
97
  wav = wav.unsqueeze(0).repeat(2, 1)
98
-
99
  wav = wav.unsqueeze(0)
100
 
101
- progress(0.5, desc="Separating vocals...")
102
  with torch.no_grad():
103
  sources = apply_model(model, wav)
104
 
105
- progress(0.8, desc="Exporting stems...")
106
  vocals = sources[0, :, 3].cpu().numpy()
107
  instrumental = sources[0, :, :3].sum(0).cpu().numpy()
108
 
@@ -112,89 +95,71 @@ def separate_audio(
112
  sf.write(vocal_path, vocals.T, sr)
113
  sf.write(inst_path, instrumental.T, sr)
114
 
115
- progress(1.0, desc="Separation complete!")
116
  return vocal_path, inst_path
117
 
118
- def transcribe_audio(
119
- audio_path: str,
120
- language: str,
121
- model_size: str,
122
- return_timestamps: bool,
123
- progress=gr.Progress()
124
- ) -> dict:
125
- progress(0.2, desc="Loading Whisper...")
126
  model = cache.load_whisper(model_size)
127
-
128
- progress(0.5, desc="Transcribing...")
129
- result = model(
130
  audio_path,
131
  return_timestamps=return_timestamps,
132
  generate_kwargs={"language": language, "task": "transcribe"}
133
  )
134
-
135
- progress(1.0, desc="Transcription complete!")
136
- return result
137
 
138
- def translate_text(
139
- text: str,
140
- src_lang: str,
141
- tgt_lang: str,
142
- max_length: int,
143
- progress=gr.Progress()
144
- ) -> str:
145
- progress(0.2, desc="Loading translator...")
146
  translator = cache.load_translator(src_lang, tgt_lang)
147
-
148
- progress(0.6, desc="Translating...")
149
  chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
150
  translations = []
151
 
152
  for i, chunk in enumerate(chunks):
153
- progress((0.6 + 0.3 * (i/len(chunks))), desc=f"Translating chunk {i+1}/{len(chunks)}...")
154
  result = translator(chunk, max_length=max_length)
155
  if isinstance(result, list):
156
  translations.append(result[0]['translation_text'])
157
  else:
158
  translations.append(result['translation_text'])
159
 
160
- progress(1.0, desc="Translation complete!")
161
  return " ".join(translations)
162
 
163
- def enhance_vocals(
164
- vocal_path: str,
165
- new_lyrics: str,
166
- voice_prompt: str,
167
- guidance_scale: float,
168
- inference_steps: int,
169
- progress=gr.Progress()
170
- ) -> Optional[str]:
171
- progress(0.1, desc="Loading TTS...")
172
  model = cache.load_tts()
 
173
 
174
- if model is None:
175
- logger.warning("TTS not available, returning original vocals")
176
- return vocal_path
177
-
178
- progress(0.5, desc="Generating enhanced vocals...")
179
- output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_enhanced.wav").name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
  model.tts_to_file(
182
- text=new_lyrics,
183
  file_path=output_path,
184
  speaker_wav=vocal_path,
185
- language="en"
 
186
  )
187
 
188
- progress(1.0, desc="Enhancement complete!")
 
 
189
  return output_path
190
 
191
- def align_audio_duration(
192
- source_path: str,
193
- target_path: str,
194
- speed_range: Tuple[float, float],
195
- progress=gr.Progress()
196
- ) -> str:
197
- progress(0.3, desc="Loading audio files...")
198
  source = AudioSegment.from_file(source_path)
199
  target = AudioSegment.from_file(target_path)
200
 
@@ -204,11 +169,9 @@ def align_audio_duration(
204
  if target_duration == 0:
205
  return target_path
206
 
207
- progress(0.6, desc="Calculating alignment...")
208
  speed_ratio = target_duration / source_duration
209
  speed_ratio = max(speed_range[0], min(speed_range[1], speed_ratio))
210
 
211
- progress(0.8, desc="Adjusting speed...")
212
  adjusted = target._spawn(target.raw_data, overrides={
213
  "frame_rate": int(target.frame_rate * speed_ratio)
214
  })
@@ -216,27 +179,15 @@ def align_audio_duration(
216
 
217
  output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_aligned.wav").name
218
  adjusted.export(output_path, format="wav")
219
-
220
- progress(1.0, desc="Alignment complete!")
221
  return output_path
222
 
223
- def mix_audio_stems(
224
- vocals_path: str,
225
- instrumental_path: str,
226
- vocal_volume: float,
227
- instrumental_volume: float,
228
- output_format: str,
229
- progress=gr.Progress()
230
- ) -> str:
231
- progress(0.3, desc="Loading stems...")
232
  vocals = AudioSegment.from_file(vocals_path)
233
  instrumental = AudioSegment.from_file(instrumental_path)
234
 
235
- progress(0.5, desc="Adjusting volumes...")
236
- vocals = vocals + vocal_volume
237
- instrumental = instrumental + instrumental_volume
238
 
239
- progress(0.7, desc="Mixing...")
240
  max_len = max(len(vocals), len(instrumental))
241
 
242
  if len(vocals) < max_len:
@@ -246,283 +197,137 @@ def mix_audio_stems(
246
 
247
  mixed = vocals.overlay(instrumental)
248
 
249
- progress(0.9, desc="Exporting...")
250
- output_path = tempfile.NamedTemporaryFile(delete=False, suffix=f".{output_format}").name
251
- mixed.export(output_path, format=output_format, bitrate="320k")
252
-
253
- progress(1.0, desc="Mixing complete!")
254
  return output_path
255
 
256
- def process_full_pipeline(
257
- audio_file: str,
258
- src_lang: str,
259
- tgt_lang: str,
260
- whisper_size: str,
261
- demucs_model: str,
262
- voice_prompt: str,
263
- guidance_scale: float,
264
- inference_steps: int,
265
- translation_max_length: int,
266
- speed_min: float,
267
- speed_max: float,
268
- vocal_volume: float,
269
- inst_volume: float,
270
- output_format: str,
271
- enable_timestamps: bool,
272
- progress=gr.Progress()
273
  ):
274
- temp_files = []
275
-
276
  try:
277
- progress(0, desc="Starting pipeline...")
278
-
279
- progress(0.05, desc="Step 1/6: Separating audio...")
280
  vocal_path, inst_path = separate_audio(audio_file, demucs_model, progress)
281
- temp_files.extend([vocal_path, inst_path])
282
 
283
- progress(0.2, desc="Step 2/6: Transcribing vocals...")
284
- transcription = transcribe_audio(vocal_path, src_lang, whisper_size, enable_timestamps, progress)
285
  original_lyrics = transcription['text']
286
  timestamps_info = json.dumps(transcription.get('chunks', []), indent=2) if enable_timestamps else ""
287
 
288
- progress(0.4, desc="Step 3/6: Translating lyrics...")
289
  translated_lyrics = translate_text(original_lyrics, src_lang, tgt_lang, translation_max_length, progress)
290
 
291
- progress(0.55, desc="Step 4/6: Enhancing vocals...")
292
- enhanced_vocal = enhance_vocals(
293
- vocal_path, translated_lyrics, voice_prompt,
294
- guidance_scale, inference_steps, progress
295
- )
296
- temp_files.append(enhanced_vocal)
297
-
298
- progress(0.75, desc="Step 5/6: Aligning audio...")
299
- aligned_vocal = align_audio_duration(vocal_path, enhanced_vocal, (speed_min, speed_max), progress)
300
- temp_files.append(aligned_vocal)
301
-
302
- progress(0.9, desc="Step 6/6: Mixing final audio...")
303
- final_audio = mix_audio_stems(
304
- aligned_vocal, inst_path, vocal_volume, inst_volume, output_format, progress
305
- )
306
-
307
- progress(1.0, desc="✅ Processing complete!")
308
-
309
  return (
310
- "✅ Processing complete!",
311
  original_lyrics,
312
  translated_lyrics,
313
  timestamps_info,
314
- vocal_path,
315
  inst_path,
316
- enhanced_vocal,
317
- final_audio
318
  )
319
-
320
  except Exception as e:
321
- logger.error(f"Pipeline error: {e}", exc_info=True)
322
- return (
323
- f"❌ Error: {str(e)}",
324
- "", "", "", None, None, None, None
 
 
 
 
 
 
 
 
 
 
 
 
325
  )
 
 
 
 
 
 
 
 
 
 
 
 
326
 
327
- with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as demo:
328
- gr.Markdown("""
329
- # 🎤 Professional Song Voice Translator
330
- ### Translate songs while preserving your voice using TTS
331
- """)
332
 
333
- with gr.Tabs():
334
- with gr.Tab("🎵 Main Pipeline"):
335
- with gr.Row():
336
- with gr.Column(scale=1):
337
- gr.Markdown("### 📤 Input")
338
- audio_input = gr.Audio(
339
- label="Upload Song",
340
- type="filepath"
341
- )
342
-
343
- gr.Markdown("### 🌍 Languages")
344
- with gr.Row():
345
- src_lang = gr.Dropdown(
346
- choices=["es", "en", "fr", "de", "it", "pt", "nl", "ru", "zh", "ja", "ko"],
347
- value="es",
348
- label="Source Language"
349
- )
350
- tgt_lang = gr.Dropdown(
351
- choices=["en", "es", "fr", "de", "it", "pt", "nl", "ru", "zh", "ja", "ko"],
352
- value="en",
353
- label="Target Language"
354
- )
355
-
356
- process_btn = gr.Button("🚀 Process Song", variant="primary", size="lg")
357
- status_box = gr.Textbox(label="Status", lines=2, interactive=False)
358
-
359
- with gr.Column(scale=1):
360
- gr.Markdown("### 📊 Results")
361
- final_output = gr.Audio(label="Final Mix", type="filepath")
362
-
363
- with gr.Accordion("🎼 Intermediate Outputs", open=False):
364
- vocal_output = gr.Audio(label="Extracted Vocals", type="filepath")
365
- inst_output = gr.Audio(label="Instrumental", type="filepath")
366
- enhanced_output = gr.Audio(label="Enhanced Vocals", type="filepath")
367
-
368
  with gr.Row():
369
- with gr.Column():
370
- original_lyrics = gr.Textbox(
371
- label="📝 Original Lyrics",
372
- lines=10,
373
- interactive=False
374
- )
375
- with gr.Column():
376
- translated_lyrics = gr.Textbox(
377
- label="🌍 Translated Lyrics",
378
- lines=10,
379
- interactive=False
380
- )
381
 
382
- with gr.Accordion("⏱️ Timestamps", open=False):
383
- timestamps_output = gr.Code(
384
- label="Detailed Timestamps (JSON)",
385
- language="json",
386
- lines=10
387
- )
388
-
389
- with gr.Tab("⚙️ Advanced Settings"):
390
- gr.Markdown("### 🎛️ Model Configuration")
391
 
392
- with gr.Row():
393
- with gr.Column():
394
- gr.Markdown("#### Transcription (Whisper)")
395
- whisper_size = gr.Dropdown(
396
- choices=["tiny", "base", "small", "medium", "large-v3"],
397
- value="large-v3",
398
- label="Model Size"
399
- )
400
- enable_timestamps = gr.Checkbox(
401
- label="Enable Timestamps",
402
- value=True
403
- )
 
 
404
 
405
- with gr.Column():
406
- gr.Markdown("#### Separation (Demucs)")
407
- demucs_model = gr.Dropdown(
408
- choices=["htdemucs", "htdemucs_ft", "mdx_extra"],
409
- value="htdemucs",
410
- label="Model"
411
- )
412
-
413
- gr.Markdown("#### Voice Enhancement (TTS)")
414
- voice_prompt = gr.Textbox(
415
- label="Voice Style Prompt",
416
- value="clear vocals, same voice style, natural singing",
417
- lines=2
418
- )
419
-
420
- with gr.Row():
421
- guidance_scale = gr.Slider(
422
- minimum=1.0,
423
- maximum=10.0,
424
- value=3.0,
425
- step=0.5,
426
- label="Guidance Scale"
427
- )
428
- inference_steps = gr.Slider(
429
- minimum=10,
430
- maximum=100,
431
- value=50,
432
- step=5,
433
- label="Inference Steps"
434
- )
435
-
436
- gr.Markdown("#### Translation")
437
- translation_max_length = gr.Slider(
438
- minimum=128,
439
- maximum=1024,
440
- value=512,
441
- step=64,
442
- label="Max Chunk Length"
443
- )
444
-
445
- gr.Markdown("#### Audio Alignment")
446
- with gr.Row():
447
- speed_min = gr.Slider(
448
- minimum=0.5,
449
- maximum=1.0,
450
- value=0.85,
451
- step=0.05,
452
- label="Min Speed Ratio"
453
- )
454
- speed_max = gr.Slider(
455
- minimum=1.0,
456
- maximum=1.5,
457
- value=1.15,
458
- step=0.05,
459
- label="Max Speed Ratio"
460
- )
461
-
462
- gr.Markdown("#### Final Mix")
463
- with gr.Row():
464
- vocal_volume = gr.Slider(
465
- minimum=-20,
466
- maximum=20,
467
- value=0,
468
- step=1,
469
- label="Vocal Volume (dB)"
470
- )
471
- inst_volume = gr.Slider(
472
- minimum=-20,
473
- maximum=20,
474
- value=-3,
475
- step=1,
476
- label="Instrumental Volume (dB)"
477
- )
478
-
479
- output_format = gr.Dropdown(
480
- choices=["wav", "mp3", "flac"],
481
- value="wav",
482
- label="Output Format"
483
- )
484
-
485
- with gr.Tab("ℹ️ Info"):
486
- gr.Markdown("""
487
- ## How It Works
488
-
489
- 1. **Separation**: Extracts vocals and instrumental using Demucs
490
- 2. **Transcription**: Converts vocals to text using Whisper
491
- 3. **Translation**: Translates lyrics to target language
492
- 4. **Enhancement**: Regenerates vocals with TTS preserving your voice
493
- 5. **Alignment**: Matches timing to original audio
494
- 6. **Mixing**: Combines enhanced vocals with original instrumental
495
-
496
- ## Tips
497
-
498
- - Use **large-v3** for best transcription quality
499
- - Adjust **guidance_scale** (2-4) for voice preservation
500
- - Higher **inference_steps** = better quality but slower
501
- - Keep speed ratios between 0.85-1.15 for natural sound
502
-
503
- ## Requirements
504
 
505
- GPU recommended for faster processing. CPU will work but slower.
506
- """)
 
 
 
 
 
 
 
 
 
 
 
507
 
508
- process_btn.click(
509
- fn=process_full_pipeline,
510
  inputs=[
511
- audio_input, src_lang, tgt_lang, whisper_size, demucs_model,
512
- voice_prompt, guidance_scale, inference_steps, translation_max_length,
513
- speed_min, speed_max, vocal_volume, inst_volume, output_format,
514
- enable_timestamps
515
  ],
516
- outputs=[
517
- status_box, original_lyrics, translated_lyrics, timestamps_output,
518
- vocal_output, inst_output, enhanced_output, final_output
519
- ]
520
  )
521
 
522
  if __name__ == "__main__":
523
- demo.queue(max_size=3)
524
- demo.launch(
525
- server_name="0.0.0.0",
526
- server_port=7860,
527
- share=False
528
- )
 
25
 
26
  def load_whisper(self, model_size: str = "large-v3"):
27
  if self.whisper is None:
 
28
  self.whisper = pipeline(
29
  "automatic-speech-recognition",
30
  model=f"openai/whisper-{model_size}",
 
36
  def load_translator(self, src: str, tgt: str):
37
  model_key = f"{src}-{tgt}"
38
  if self.translator is None or getattr(self.translator, 'model_key', None) != model_key:
 
39
  try:
40
  model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
41
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
48
  )
49
  self.translator.model_key = model_key
50
  except:
 
51
  self.translator = pipeline(
52
  "translation",
53
  model="facebook/nllb-200-distilled-600M",
 
60
 
61
  def load_demucs(self, model_name: str = "htdemucs"):
62
  if self.demucs is None:
 
63
  from demucs.pretrained import get_model
64
  self.demucs = get_model(model_name)
65
  self.demucs.cpu()
 
68
 
69
  def load_tts(self):
70
  if self.tts is None:
 
71
  from TTS.api import TTS
72
  self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
73
  return self.tts
74
 
75
# Module-level singleton: every Gradio handler shares the same lazily
# loaded Whisper / translator / Demucs / TTS instances.
cache = ModelCache()
76
 
77
+ def separate_audio(audio_path, model_name, progress=gr.Progress()):
 
 
 
 
 
 
78
  from demucs.apply import apply_model
79
  model = cache.load_demucs(model_name)
 
 
80
  wav, sr = librosa.load(audio_path, sr=44100, mono=False)
81
  wav = torch.from_numpy(wav).float()
 
82
  if wav.dim() == 1:
83
  wav = wav.unsqueeze(0).repeat(2, 1)
 
84
  wav = wav.unsqueeze(0)
85
 
 
86
  with torch.no_grad():
87
  sources = apply_model(model, wav)
88
 
 
89
  vocals = sources[0, :, 3].cpu().numpy()
90
  instrumental = sources[0, :, :3].sum(0).cpu().numpy()
91
 
 
95
  sf.write(vocal_path, vocals.T, sr)
96
  sf.write(inst_path, instrumental.T, sr)
97
 
 
98
  return vocal_path, inst_path
99
 
100
def transcribe_audio(audio_path, language, model_size, return_timestamps):
    """Run Whisper ASR over an audio file and return the raw pipeline result.

    The result is whatever the HF ASR pipeline emits (a dict with 'text',
    plus 'chunks' when timestamps are requested).
    """
    asr = cache.load_whisper(model_size)
    gen_kwargs = {"language": language, "task": "transcribe"}
    output = asr(audio_path, return_timestamps=return_timestamps, generate_kwargs=gen_kwargs)
    return output
 
 
 
107
 
108
def translate_text(text, src_lang, tgt_lang, max_length, progress=gr.Progress()):
    """Translate `text` by slicing it into `max_length`-character chunks.

    NOTE(review): chunks are cut by raw character count, so words may be
    split across boundaries — presumably acceptable for lyrics; confirm.
    `progress` is accepted for interface compatibility but unused here.
    """
    translator = cache.load_translator(src_lang, tgt_lang)
    pieces = []
    for start in range(0, len(text), max_length):
        out = translator(text[start:start + max_length], max_length=max_length)
        # The HF pipeline may return a list of dicts or a single dict.
        entry = out[0] if isinstance(out, list) else out
        pieces.append(entry['translation_text'])
    return " ".join(pieces)
121
 
122
def apply_rvc_refinement(tts_output_path, original_vocal_path, progress=gr.Progress()):
    """Refine raw TTS output so its timbre matches the original singer.

    Uses the XTTS voice-conversion API: content comes from `source_wav`,
    voice/timbre from `target_wav`. On any failure the unrefined TTS audio
    path is returned so the pipeline can continue.

    Returns the path of the refined wav (or `tts_output_path` on failure).
    """
    model = cache.load_tts()
    # Close the handle immediately — we only need a unique path, and leaving
    # it open leaks a file descriptor per call.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix="_rvc_refined.wav")
    output_rvc = tmp.name
    tmp.close()
    try:
        # BUGFIX: source/target were swapped. source_wav is the audio whose
        # *content* is kept (the TTS render); target_wav supplies the voice
        # reference (the original vocal track).
        model.voice_conversion_to_file(
            source_wav=tts_output_path,
            target_wav=original_vocal_path,
            file_path=output_rvc
        )
        return output_rvc
    except Exception as e:
        logger.error(f"RVC Refinement failed: {e}")
        return tts_output_path
136
+
137
def generate_vocals(
    vocal_path,
    lyrics,
    voice_prompt,
    guidance_scale,
    inference_steps,
    use_rvc,
    progress=gr.Progress(),
    language="en",
):
    """Synthesize `lyrics` in the voice of `vocal_path` using XTTS.

    Args:
        vocal_path: wav used as the speaker/voice reference.
        lyrics: text to synthesize.
        voice_prompt, guidance_scale, inference_steps: accepted for
            interface compatibility; currently unused by the XTTS call.
        use_rvc: if True, post-process the render with voice conversion.
        language: XTTS synthesis language. Previously hard-coded to "en",
            which broke non-English target languages; now parameterized
            with a backward-compatible default.

    Returns the path of the generated (optionally refined) wav file.
    """
    model = cache.load_tts()
    # Only the unique path is needed; close the handle to avoid an fd leak.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix="_generated.wav")
    output_path = tmp.name
    tmp.close()

    model.tts_to_file(
        text=lyrics,
        file_path=output_path,
        speaker_wav=vocal_path,
        language=language,
        split_sentences=True
    )

    if use_rvc:
        output_path = apply_rvc_refinement(output_path, vocal_path, progress)

    return output_path
161
 
162
+ def align_audio_duration(source_path, target_path, speed_range):
 
 
 
 
 
 
163
  source = AudioSegment.from_file(source_path)
164
  target = AudioSegment.from_file(target_path)
165
 
 
169
  if target_duration == 0:
170
  return target_path
171
 
 
172
  speed_ratio = target_duration / source_duration
173
  speed_ratio = max(speed_range[0], min(speed_range[1], speed_ratio))
174
 
 
175
  adjusted = target._spawn(target.raw_data, overrides={
176
  "frame_rate": int(target.frame_rate * speed_ratio)
177
  })
 
179
 
180
  output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_aligned.wav").name
181
  adjusted.export(output_path, format="wav")
 
 
182
  return output_path
183
 
184
+ def mix_audio_stems(vocals_path, instrumental_path, vocal_vol, inst_vol, fmt):
 
 
 
 
 
 
 
 
185
  vocals = AudioSegment.from_file(vocals_path)
186
  instrumental = AudioSegment.from_file(instrumental_path)
187
 
188
+ vocals = vocals + vocal_vol
189
+ instrumental = instrumental + inst_vol
 
190
 
 
191
  max_len = max(len(vocals), len(instrumental))
192
 
193
  if len(vocals) < max_len:
 
197
 
198
  mixed = vocals.overlay(instrumental)
199
 
200
+ output_path = tempfile.NamedTemporaryFile(delete=False, suffix=f".{fmt}").name
201
+ mixed.export(output_path, format=fmt, bitrate="320k")
 
 
 
202
  return output_path
203
 
204
def phase_1_analysis(
    audio_file, src_lang, tgt_lang, whisper_size, demucs_model,
    translation_max_length, enable_timestamps, progress=gr.Progress()
):
    """Pipeline phase 1: separate stems, transcribe vocals, translate lyrics.

    Returns a 6-tuple matching the Gradio outputs:
    (original_lyrics, translated_lyrics, timestamps_json, vocal_path,
     inst_path, status_message). On failure the paths are None and the
    status message carries the error text.
    """
    try:
        progress(0.1, desc="Separating audio stems...")
        vocal_path, inst_path = separate_audio(audio_file, demucs_model, progress)

        progress(0.4, desc="Transcribing vocals...")
        transcription = transcribe_audio(vocal_path, src_lang, whisper_size, enable_timestamps)
        original_lyrics = transcription['text']
        timestamps_info = json.dumps(transcription.get('chunks', []), indent=2) if enable_timestamps else ""

        progress(0.7, desc="Translating lyrics...")
        translated_lyrics = translate_text(original_lyrics, src_lang, tgt_lang, translation_max_length, progress)

        progress(1.0, desc="Analysis complete. Please edit lyrics.")
        return (
            original_lyrics,
            translated_lyrics,
            timestamps_info,
            vocal_path,
            inst_path,
            "✅ Analysis Complete! You can now edit the lyrics below."
        )
    except Exception as e:
        # BUGFIX: the exception was silently swallowed (no log entry) and the
        # status string lacked the "❌" marker used by phase_2_generation.
        logger.error(f"Analysis error: {e}", exc_info=True)
        return "", "", "", None, None, f"❌ Error: {str(e)}"
231
+
232
def phase_2_generation(
    edited_lyrics, vocal_path, inst_path,
    voice_prompt, guidance_scale, inference_steps, use_rvc,
    speed_min, speed_max, vocal_volume, inst_volume, output_format,
    progress=gr.Progress()
):
    """Pipeline phase 2: synthesize, align and mix the translated vocals.

    Requires `vocal_path`/`inst_path` produced by phase_1_analysis (held in
    gr.State). Returns a 4-tuple matching the Gradio outputs:
    (raw_generated_vocal, aligned_vocal, final_mix, status_message); the
    audio slots are None on error.
    """
    if not vocal_path or not inst_path:
        return None, None, None, "❌ Error: Please run analysis first."
    # Guard against empty lyrics: the TTS call would otherwise fail with an
    # opaque backend error instead of an actionable message.
    if not edited_lyrics or not edited_lyrics.strip():
        return None, None, None, "❌ Error: Lyrics are empty. Edit the lyrics or re-run analysis."

    try:
        progress(0.1, desc="Generating vocals (TTS)...")
        generated_raw = generate_vocals(
            vocal_path, edited_lyrics, voice_prompt,
            guidance_scale, inference_steps, use_rvc, progress
        )

        progress(0.6, desc="Aligning audio...")
        aligned_vocal = align_audio_duration(vocal_path, generated_raw, (speed_min, speed_max))

        progress(0.8, desc="Mixing final audio...")
        final_audio = mix_audio_stems(aligned_vocal, inst_path, vocal_volume, inst_volume, output_format)

        progress(1.0, desc="Done!")
        return generated_raw, aligned_vocal, final_audio, "✅ Song Generation Complete!"
    except Exception as e:
        logger.error(f"Generation error: {e}", exc_info=True)
        return None, None, None, f"❌ Error: {str(e)}"
259
 
260
# Gradio UI. Component creation order defines the layout; gr.State holds the
# stem paths between the two pipeline phases.
with gr.Blocks(theme=gr.themes.Soft(), title="Professional AI Dubbing") as demo:
    # Per-session state carrying phase-1 outputs into phase 2.
    vocal_state = gr.State()
    inst_state = gr.State()

    gr.Markdown("## 🎵 AI Song Translator with RVC & Lyrics Editor")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Analysis & Translation")
            audio_input = gr.Audio(label="Input Song", type="filepath")
            with gr.Row():
                src_lang = gr.Dropdown(choices=["es", "en", "fr", "de", "it", "ja", "ko"], value="es", label="Source")
                tgt_lang = gr.Dropdown(choices=["en", "es", "fr", "de", "it", "ja", "ko"], value="en", label="Target")

            with gr.Accordion("Analysis Settings", open=False):
                whisper_size = gr.Dropdown(["base", "small", "large-v3"], value="large-v3", label="Whisper Model")
                demucs_model = gr.Dropdown(["htdemucs", "htdemucs_ft"], value="htdemucs", label="Demucs Model")
                enable_timestamps = gr.Checkbox(value=True, label="Timestamps")
                translation_len = gr.Slider(128, 1024, 512, step=64, label="Translation Chunk")

            analyze_btn = gr.Button("🔍 Analyze & Translate", variant="primary")

        with gr.Column(scale=1):
            gr.Markdown("### 2. Edit Lyrics")
            original_txt = gr.Textbox(label="Original Lyrics", lines=8, interactive=False)
            # Editable so the user can tweak the translation before phase 2.
            translated_txt = gr.Textbox(label="Translated Lyrics (Editable)", lines=8, interactive=True)
            status_box = gr.Textbox(label="System Status", interactive=False)

    gr.Markdown("---")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 3. Generation Settings")
            with gr.Group():
                use_rvc = gr.Checkbox(value=True, label="Enable RVC Refinement (Natural Sound)")
                voice_prompt = gr.Textbox(value="clear vocals, high quality", label="Style Prompt")

            # NOTE(review): guidance/steps are TTS settings but live under
            # the "Advanced Mixing" accordion — consider regrouping.
            with gr.Accordion("Advanced Mixing", open=False):
                speed_min = gr.Slider(0.5, 1.0, 0.85, step=0.05, label="Min Speed")
                speed_max = gr.Slider(1.0, 1.5, 1.15, step=0.05, label="Max Speed")
                vocal_vol = gr.Slider(-10, 10, 0, label="Vocal dB")
                inst_vol = gr.Slider(-10, 10, -3, label="Inst dB")
                fmt = gr.Dropdown(["wav", "mp3"], value="wav", label="Format")
                guidance = gr.Slider(1, 10, 3.0, step=0.5, label="Guidance")
                steps = gr.Slider(10, 100, 30, step=5, label="Steps")

            generate_btn = gr.Button("🎹 Generate Song", variant="stop", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### 4. Final Output")
            final_out = gr.Audio(label="Final Mixed Song")
            with gr.Accordion("Stems", open=False):
                raw_vocal_out = gr.Audio(label="Raw Generated Vocal")
                aligned_vocal_out = gr.Audio(label="Aligned Vocal")
                # NOTE(review): phase_1 returns a JSON *string* here — gr.JSON
                # accepts it, but a dict/list would render more cleanly.
                timestamps_out = gr.JSON(label="Timestamps")

    # Phase 1: analysis. Stores stem paths into the State components.
    analyze_btn.click(
        fn=phase_1_analysis,
        inputs=[audio_input, src_lang, tgt_lang, whisper_size, demucs_model, translation_len, enable_timestamps],
        outputs=[original_txt, translated_txt, timestamps_out, vocal_state, inst_state, status_box]
    )

    # Phase 2: generation. Reads the (possibly edited) translated lyrics.
    generate_btn.click(
        fn=phase_2_generation,
        inputs=[
            translated_txt, vocal_state, inst_state,
            voice_prompt, guidance, steps, use_rvc,
            speed_min, speed_max, vocal_vol, inst_vol, fmt
        ],
        outputs=[raw_vocal_out, aligned_vocal_out, final_out, status_box]
    )
331
 
332
if __name__ == "__main__":
    # Enable Gradio's request queue before launching (presumably to keep
    # heavy model jobs from overlapping — confirm desired concurrency).
    demo.queue().launch()