transx

Sleeping

App Files Files Community

sedrukjglfhsdlkf committed on Jan 6

Commit

1b42d2d

verified ·

1 Parent(s): 8b32add

Update app.py

Browse files

Files changed (1) hide show

app.py +140 -335

app.py CHANGED Viewed

@@ -25,7 +25,6 @@ class ModelCache:
     def load_whisper(self, model_size: str = "large-v3"):
         if self.whisper is None:
-            logger.info(f"Loading Whisper {model_size}...")
             self.whisper = pipeline(
                 "automatic-speech-recognition",
                 model=f"openai/whisper-{model_size}",
@@ -37,7 +36,6 @@ class ModelCache:
     def load_translator(self, src: str, tgt: str):
         model_key = f"{src}-{tgt}"
         if self.translator is None or getattr(self.translator, 'model_key', None) != model_key:
-            logger.info(f"Loading translator {src} -> {tgt}...")
             try:
                 model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
                 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -50,7 +48,6 @@ class ModelCache:
                 )
                 self.translator.model_key = model_key
             except:
-                logger.info("Falling back to NLLB...")
                 self.translator = pipeline(
                     "translation",
                     model="facebook/nllb-200-distilled-600M",
@@ -63,7 +60,6 @@ class ModelCache:
     def load_demucs(self, model_name: str = "htdemucs"):
         if self.demucs is None:
-            logger.info(f"Loading Demucs {model_name}...")
             from demucs.pretrained import get_model
             self.demucs = get_model(model_name)
             self.demucs.cpu()
@@ -72,37 +68,24 @@ class ModelCache:
     def load_tts(self):
         if self.tts is None:
-            logger.info("Loading TTS for voice cloning...")
             from TTS.api import TTS
             self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
         return self.tts
 cache = ModelCache()
-def separate_audio(
-    audio_path: str,
-    model_name: str = "htdemucs",
-    progress=gr.Progress()
-) -> Tuple[str, str]:
-    progress(0.1, desc="Loading separation model...")
     from demucs.apply import apply_model
     model = cache.load_demucs(model_name)
-    progress(0.3, desc="Loading audio...")
     wav, sr = librosa.load(audio_path, sr=44100, mono=False)
     wav = torch.from_numpy(wav).float()
     if wav.dim() == 1:
         wav = wav.unsqueeze(0).repeat(2, 1)
     wav = wav.unsqueeze(0)
-    progress(0.5, desc="Separating vocals...")
     with torch.no_grad():
         sources = apply_model(model, wav)
-    progress(0.8, desc="Exporting stems...")
     vocals = sources[0, :, 3].cpu().numpy()
     instrumental = sources[0, :, :3].sum(0).cpu().numpy()
@@ -112,89 +95,71 @@ def separate_audio(
     sf.write(vocal_path, vocals.T, sr)
     sf.write(inst_path, instrumental.T, sr)
-    progress(1.0, desc="Separation complete!")
     return vocal_path, inst_path
-def transcribe_audio(
-    audio_path: str,
-    language: str,
-    model_size: str,
-    return_timestamps: bool,
-    progress=gr.Progress()
-) -> dict:
-    progress(0.2, desc="Loading Whisper...")
     model = cache.load_whisper(model_size)
-    progress(0.5, desc="Transcribing...")
-    result = model(
         audio_path,
         return_timestamps=return_timestamps,
         generate_kwargs={"language": language, "task": "transcribe"}
     )
-    progress(1.0, desc="Transcription complete!")
-    return result
-def translate_text(
-    text: str,
-    src_lang: str,
-    tgt_lang: str,
-    max_length: int,
-    progress=gr.Progress()
-) -> str:
-    progress(0.2, desc="Loading translator...")
     translator = cache.load_translator(src_lang, tgt_lang)
-    progress(0.6, desc="Translating...")
     chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
     translations = []
     for i, chunk in enumerate(chunks):
-        progress((0.6 + 0.3 * (i/len(chunks))), desc=f"Translating chunk {i+1}/{len(chunks)}...")
         result = translator(chunk, max_length=max_length)
         if isinstance(result, list):
             translations.append(result[0]['translation_text'])
         else:
             translations.append(result['translation_text'])
-    progress(1.0, desc="Translation complete!")
     return " ".join(translations)
-def enhance_vocals(
-    vocal_path: str,
-    new_lyrics: str,
-    voice_prompt: str,
-    guidance_scale: float,
-    inference_steps: int,
-    progress=gr.Progress()
-) -> Optional[str]:
-    progress(0.1, desc="Loading TTS...")
     model = cache.load_tts()
-    if model is None:
-        logger.warning("TTS not available, returning original vocals")
-        return vocal_path
-    progress(0.5, desc="Generating enhanced vocals...")
-    output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_enhanced.wav").name
     model.tts_to_file(
-        text=new_lyrics,
         file_path=output_path,
         speaker_wav=vocal_path,
-        language="en"
     )
-    progress(1.0, desc="Enhancement complete!")
     return output_path
-def align_audio_duration(
-    source_path: str,
-    target_path: str,
-    speed_range: Tuple[float, float],
-    progress=gr.Progress()
-) -> str:
-    progress(0.3, desc="Loading audio files...")
     source = AudioSegment.from_file(source_path)
     target = AudioSegment.from_file(target_path)
@@ -204,11 +169,9 @@ def align_audio_duration(
     if target_duration == 0:
         return target_path
-    progress(0.6, desc="Calculating alignment...")
     speed_ratio = target_duration / source_duration
     speed_ratio = max(speed_range[0], min(speed_range[1], speed_ratio))
-    progress(0.8, desc="Adjusting speed...")
     adjusted = target._spawn(target.raw_data, overrides={
         "frame_rate": int(target.frame_rate * speed_ratio)
     })
@@ -216,27 +179,15 @@ def align_audio_duration(
     output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_aligned.wav").name
     adjusted.export(output_path, format="wav")
-    progress(1.0, desc="Alignment complete!")
     return output_path
-def mix_audio_stems(
-    vocals_path: str,
-    instrumental_path: str,
-    vocal_volume: float,
-    instrumental_volume: float,
-    output_format: str,
-    progress=gr.Progress()
-) -> str:
-    progress(0.3, desc="Loading stems...")
     vocals = AudioSegment.from_file(vocals_path)
     instrumental = AudioSegment.from_file(instrumental_path)
-    progress(0.5, desc="Adjusting volumes...")
-    vocals = vocals + vocal_volume
-    instrumental = instrumental + instrumental_volume
-    progress(0.7, desc="Mixing...")
     max_len = max(len(vocals), len(instrumental))
     if len(vocals) < max_len:
@@ -246,283 +197,137 @@ def mix_audio_stems(
     mixed = vocals.overlay(instrumental)
-    progress(0.9, desc="Exporting...")
-    output_path = tempfile.NamedTemporaryFile(delete=False, suffix=f".{output_format}").name
-    mixed.export(output_path, format=output_format, bitrate="320k")
-    progress(1.0, desc="Mixing complete!")
     return output_path
-def process_full_pipeline(
-    audio_file: str,
-    src_lang: str,
-    tgt_lang: str,
-    whisper_size: str,
-    demucs_model: str,
-    voice_prompt: str,
-    guidance_scale: float,
-    inference_steps: int,
-    translation_max_length: int,
-    speed_min: float,
-    speed_max: float,
-    vocal_volume: float,
-    inst_volume: float,
-    output_format: str,
-    enable_timestamps: bool,
-    progress=gr.Progress()
 ):
-    temp_files = []
     try:
-        progress(0, desc="Starting pipeline...")
-        progress(0.05, desc="Step 1/6: Separating audio...")
         vocal_path, inst_path = separate_audio(audio_file, demucs_model, progress)
-        temp_files.extend([vocal_path, inst_path])
-        progress(0.2, desc="Step 2/6: Transcribing vocals...")
-        transcription = transcribe_audio(vocal_path, src_lang, whisper_size, enable_timestamps, progress)
         original_lyrics = transcription['text']
         timestamps_info = json.dumps(transcription.get('chunks', []), indent=2) if enable_timestamps else ""
-        progress(0.4, desc="Step 3/6: Translating lyrics...")
         translated_lyrics = translate_text(original_lyrics, src_lang, tgt_lang, translation_max_length, progress)
-        progress(0.55, desc="Step 4/6: Enhancing vocals...")
-        enhanced_vocal = enhance_vocals(
-            vocal_path, translated_lyrics, voice_prompt,
-            guidance_scale, inference_steps, progress
-        )
-        temp_files.append(enhanced_vocal)
-        progress(0.75, desc="Step 5/6: Aligning audio...")
-        aligned_vocal = align_audio_duration(vocal_path, enhanced_vocal, (speed_min, speed_max), progress)
-        temp_files.append(aligned_vocal)
-        progress(0.9, desc="Step 6/6: Mixing final audio...")
-        final_audio = mix_audio_stems(
-            aligned_vocal, inst_path, vocal_volume, inst_volume, output_format, progress
-        )
-        progress(1.0, desc="✅ Processing complete!")
         return (
-            "✅ Processing complete!",
             original_lyrics,
             translated_lyrics,
             timestamps_info,
-            vocal_path,
             inst_path,
-            enhanced_vocal,
-            final_audio
         )
     except Exception as e:
-        logger.error(f"Pipeline error: {e}", exc_info=True)
-        return (
-            f"❌ Error: {str(e)}",
-            "", "", "", None, None, None, None
         )
-with gr.Blocks(theme=gr.themes.Soft(), title="Professional Song Translator") as demo:
-    gr.Markdown("""
-    # 🎤 Professional Song Voice Translator
-    ### Translate songs while preserving your voice using TTS
-    """)
-    with gr.Tabs():
-        with gr.Tab("🎵 Main Pipeline"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    gr.Markdown("### 📤 Input")
-                    audio_input = gr.Audio(
-                        label="Upload Song",
-                        type="filepath"
-                    )
-                    gr.Markdown("### 🌍 Languages")
-                    with gr.Row():
-                        src_lang = gr.Dropdown(
-                            choices=["es", "en", "fr", "de", "it", "pt", "nl", "ru", "zh", "ja", "ko"],
-                            value="es",
-                            label="Source Language"
-                        )
-                        tgt_lang = gr.Dropdown(
-                            choices=["en", "es", "fr", "de", "it", "pt", "nl", "ru", "zh", "ja", "ko"],
-                            value="en",
-                            label="Target Language"
-                        )
-                    process_btn = gr.Button("🚀 Process Song", variant="primary", size="lg")
-                    status_box = gr.Textbox(label="Status", lines=2, interactive=False)
-                with gr.Column(scale=1):
-                    gr.Markdown("### 📊 Results")
-                    final_output = gr.Audio(label="Final Mix", type="filepath")
-                    with gr.Accordion("🎼 Intermediate Outputs", open=False):
-                        vocal_output = gr.Audio(label="Extracted Vocals", type="filepath")
-                        inst_output = gr.Audio(label="Instrumental", type="filepath")
-                        enhanced_output = gr.Audio(label="Enhanced Vocals", type="filepath")
             with gr.Row():
-                with gr.Column():
-                    original_lyrics = gr.Textbox(
-                        label="📝 Original Lyrics",
-                        lines=10,
-                        interactive=False
-                    )
-                with gr.Column():
-                    translated_lyrics = gr.Textbox(
-                        label="🌍 Translated Lyrics",
-                        lines=10,
-                        interactive=False
-                    )
-            with gr.Accordion("⏱️ Timestamps", open=False):
-                timestamps_output = gr.Code(
-                    label="Detailed Timestamps (JSON)",
-                    language="json",
-                    lines=10
-                )
-        with gr.Tab("⚙️ Advanced Settings"):
-            gr.Markdown("### 🎛️ Model Configuration")
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("#### Transcription (Whisper)")
-                    whisper_size = gr.Dropdown(
-                        choices=["tiny", "base", "small", "medium", "large-v3"],
-                        value="large-v3",
-                        label="Model Size"
-                    )
-                    enable_timestamps = gr.Checkbox(
-                        label="Enable Timestamps",
-                        value=True
-                    )
-                with gr.Column():
-                    gr.Markdown("#### Separation (Demucs)")
-                    demucs_model = gr.Dropdown(
-                        choices=["htdemucs", "htdemucs_ft", "mdx_extra"],
-                        value="htdemucs",
-                        label="Model"
-                    )
-            gr.Markdown("#### Voice Enhancement (TTS)")
-            voice_prompt = gr.Textbox(
-                label="Voice Style Prompt",
-                value="clear vocals, same voice style, natural singing",
-                lines=2
-            )
-            with gr.Row():
-                guidance_scale = gr.Slider(
-                    minimum=1.0,
-                    maximum=10.0,
-                    value=3.0,
-                    step=0.5,
-                    label="Guidance Scale"
-                )
-                inference_steps = gr.Slider(
-                    minimum=10,
-                    maximum=100,
-                    value=50,
-                    step=5,
-                    label="Inference Steps"
-                )
-            gr.Markdown("#### Translation")
-            translation_max_length = gr.Slider(
-                minimum=128,
-                maximum=1024,
-                value=512,
-                step=64,
-                label="Max Chunk Length"
-            )
-            gr.Markdown("#### Audio Alignment")
-            with gr.Row():
-                speed_min = gr.Slider(
-                    minimum=0.5,
-                    maximum=1.0,
-                    value=0.85,
-                    step=0.05,
-                    label="Min Speed Ratio"
-                )
-                speed_max = gr.Slider(
-                    minimum=1.0,
-                    maximum=1.5,
-                    value=1.15,
-                    step=0.05,
-                    label="Max Speed Ratio"
-                )
-            gr.Markdown("#### Final Mix")
-            with gr.Row():
-                vocal_volume = gr.Slider(
-                    minimum=-20,
-                    maximum=20,
-                    value=0,
-                    step=1,
-                    label="Vocal Volume (dB)"
-                )
-                inst_volume = gr.Slider(
-                    minimum=-20,
-                    maximum=20,
-                    value=-3,
-                    step=1,
-                    label="Instrumental Volume (dB)"
-                )
-            output_format = gr.Dropdown(
-                choices=["wav", "mp3", "flac"],
-                value="wav",
-                label="Output Format"
-            )
-        with gr.Tab("ℹ️ Info"):
-            gr.Markdown("""
-            ## How It Works
-            1. **Separation**: Extracts vocals and instrumental using Demucs
-            2. **Transcription**: Converts vocals to text using Whisper
-            3. **Translation**: Translates lyrics to target language
-            4. **Enhancement**: Regenerates vocals with TTS preserving your voice
-            5. **Alignment**: Matches timing to original audio
-            6. **Mixing**: Combines enhanced vocals with original instrumental
-            ## Tips
-            - Use **large-v3** for best transcription quality
-            - Adjust **guidance_scale** (2-4) for voice preservation
-            - Higher **inference_steps** = better quality but slower
-            - Keep speed ratios between 0.85-1.15 for natural sound
-            ## Requirements
-            GPU recommended for faster processing. CPU will work but slower.
-            """)
-    process_btn.click(
-        fn=process_full_pipeline,
         inputs=[
-            audio_input, src_lang, tgt_lang, whisper_size, demucs_model,
-            voice_prompt, guidance_scale, inference_steps, translation_max_length,
-            speed_min, speed_max, vocal_volume, inst_volume, output_format,
-            enable_timestamps
         ],
-        outputs=[
-            status_box, original_lyrics, translated_lyrics, timestamps_output,
-            vocal_output, inst_output, enhanced_output, final_output
-        ]
     )
 if __name__ == "__main__":
-    demo.queue(max_size=3)
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False
-    )

     def load_whisper(self, model_size: str = "large-v3"):
         if self.whisper is None:
             self.whisper = pipeline(
                 "automatic-speech-recognition",
                 model=f"openai/whisper-{model_size}",
     def load_translator(self, src: str, tgt: str):
         model_key = f"{src}-{tgt}"
         if self.translator is None or getattr(self.translator, 'model_key', None) != model_key:
             try:
                 model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
                 tokenizer = AutoTokenizer.from_pretrained(model_name)
                 )
                 self.translator.model_key = model_key
             except:
                 self.translator = pipeline(
                     "translation",
                     model="facebook/nllb-200-distilled-600M",
     def load_demucs(self, model_name: str = "htdemucs"):
         if self.demucs is None:
             from demucs.pretrained import get_model
             self.demucs = get_model(model_name)
             self.demucs.cpu()
     def load_tts(self):
         # Build the XTTS v2 model only on the first call; later calls reuse self.tts.
         if self.tts is None:
             # The import sits inside the branch, so it runs only when the model has to be built.
             from TTS.api import TTS
             self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
         return self.tts
 cache = ModelCache()
+def separate_audio(audio_path, model_name, progress=gr.Progress()):
     from demucs.apply import apply_model
     model = cache.load_demucs(model_name)
     wav, sr = librosa.load(audio_path, sr=44100, mono=False)
     wav = torch.from_numpy(wav).float()
     if wav.dim() == 1:
         wav = wav.unsqueeze(0).repeat(2, 1)
     wav = wav.unsqueeze(0)
     with torch.no_grad():
         sources = apply_model(model, wav)
     vocals = sources[0, :, 3].cpu().numpy()
     instrumental = sources[0, :, :3].sum(0).cpu().numpy()
     sf.write(vocal_path, vocals.T, sr)
     sf.write(inst_path, instrumental.T, sr)
     return vocal_path, inst_path
+def transcribe_audio(audio_path, language, model_size, return_timestamps):
     # Runs the cached Whisper pipeline on audio_path and returns the pipeline's result unchanged.
     # NOTE(review): load_whisper appears to build the pipeline only while cache.whisper is None,
     # so a later call with a different model_size may reuse the first model - confirm.
     model = cache.load_whisper(model_size)
+    return model(
         audio_path,
         return_timestamps=return_timestamps,
         generate_kwargs={"language": language, "task": "transcribe"}
     )
+def translate_text(text, src_lang, tgt_lang, max_length, progress=gr.Progress()):
+    """Translate ``text`` from ``src_lang`` to ``tgt_lang`` in bounded chunks.
+
+    The text is split on whitespace into pieces of at most ``max_length``
+    characters, so words are not cut in half at chunk borders (a single word
+    longer than ``max_length`` is still broken). Line breaks are treated as
+    ordinary whitespace. Each piece is translated separately and the results
+    are joined with single spaces.
+
+    Args:
+        text: Source text to translate.
+        src_lang: Source language code handed to ``cache.load_translator``.
+        tgt_lang: Target language code handed to ``cache.load_translator``.
+        max_length: Chunk size in characters; also forwarded to the
+            translator as its ``max_length``.
+        progress: Accepted for signature compatibility; not used here.
+
+    Returns:
+        The translated text, or ``""`` when ``text`` is empty or blank.
+    """
+    import textwrap
+
+    # A UI slider value may arrive as a float; normalize it once up front.
+    max_length = int(max_length)
+    chunks = textwrap.wrap(text or "", width=max_length)
+    if not chunks:
+        # Nothing to translate, so skip loading the translation model entirely.
+        return ""
     translator = cache.load_translator(src_lang, tgt_lang)
     translations = []
+    for chunk in chunks:
         result = translator(chunk, max_length=max_length)
+        # The translator may hand back either a list of results or a single result dict.
         if isinstance(result, list):
             translations.append(result[0]['translation_text'])
         else:
             translations.append(result['translation_text'])
     return " ".join(translations)
+def apply_rvc_refinement(tts_output_path, original_vocal_path, progress=gr.Progress()):
+    """Re-voice the synthesized vocal so it sounds like the original singer.
+
+    Args:
+        tts_output_path: Path to the synthesized (translated) vocal take.
+        original_vocal_path: Path to the original vocal stem whose voice
+            should be imitated.
+        progress: Accepted for signature compatibility; not used here.
+
+    Returns:
+        Path to the converted file, or ``tts_output_path`` unchanged when the
+        conversion fails (best-effort refinement).
+    """
+    import os
+
     model = cache.load_tts()
+    output_rvc = tempfile.NamedTemporaryFile(delete=False, suffix="_rvc_refined.wav").name
+    try:
+        # source_wav supplies the content to keep (the translated TTS take);
+        # target_wav supplies the voice to imitate (the original vocal stem).
+        # NOTE(review): the cached model is XTTS v2 - confirm it actually offers
+        # voice conversion, otherwise this always takes the fallback path below.
+        model.voice_conversion_to_file(
+            source_wav=tts_output_path,
+            target_wav=original_vocal_path,
+            file_path=output_rvc
+        )
+        return output_rvc
+    except Exception as e:
+        logger.error(f"RVC Refinement failed: {e}", exc_info=True)
+        # Drop the now-unused temp file instead of leaking it, then fall back
+        # to the raw TTS take.
+        try:
+            os.remove(output_rvc)
+        except OSError:
+            pass
+        return tts_output_path
+def generate_vocals(
+    vocal_path,
+    lyrics,
+    voice_prompt,
+    guidance_scale,
+    inference_steps,
+    use_rvc,
+    progress=gr.Progress(),
+    language="en"
+):
+    """Synthesize ``lyrics`` with the cached TTS model, cloning the voice in ``vocal_path``.
+
+    Args:
+        vocal_path: Reference vocal file passed to the model as ``speaker_wav``.
+        lyrics: Text to synthesize.
+        voice_prompt: Accepted for signature compatibility; not used here.
+        guidance_scale: Accepted for signature compatibility; not used here.
+        inference_steps: Accepted for signature compatibility; not used here.
+        use_rvc: When truthy, pass the result through ``apply_rvc_refinement``.
+        progress: Forwarded to ``apply_rvc_refinement``.
+        language: Language code handed to the TTS model. Defaults to ``"en"``,
+            the value that was previously hard-coded.
+
+    Returns:
+        Path to the generated (and optionally refined) WAV file.
+
+    Raises:
+        ValueError: If ``lyrics`` is empty or blank.
+    """
+    if not lyrics or not lyrics.strip():
+        # Fail early with a readable message instead of an opaque error from the TTS model.
+        raise ValueError("No lyrics to synthesize.")
+    model = cache.load_tts()
+    output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_generated.wav").name
     model.tts_to_file(
+        text=lyrics,
         file_path=output_path,
         speaker_wav=vocal_path,
+        language=language,
+        split_sentences=True
     )
+    if use_rvc:
+        output_path = apply_rvc_refinement(output_path, vocal_path, progress)
     return output_path
+def align_audio_duration(source_path, target_path, speed_range):
     source = AudioSegment.from_file(source_path)
     target = AudioSegment.from_file(target_path)
     if target_duration == 0:
         return target_path
     speed_ratio = target_duration / source_duration
     speed_ratio = max(speed_range[0], min(speed_range[1], speed_ratio))
     adjusted = target._spawn(target.raw_data, overrides={
         "frame_rate": int(target.frame_rate * speed_ratio)
     })
     output_path = tempfile.NamedTemporaryFile(delete=False, suffix="_aligned.wav").name
     adjusted.export(output_path, format="wav")
     return output_path
+def mix_audio_stems(vocals_path, instrumental_path, vocal_vol, inst_vol, fmt):
     vocals = AudioSegment.from_file(vocals_path)
     instrumental = AudioSegment.from_file(instrumental_path)
+    vocals = vocals + vocal_vol
+    instrumental = instrumental + inst_vol
     max_len = max(len(vocals), len(instrumental))
     if len(vocals) < max_len:
     mixed = vocals.overlay(instrumental)
+    output_path = tempfile.NamedTemporaryFile(delete=False, suffix=f".{fmt}").name
+    mixed.export(output_path, format=fmt, bitrate="320k")
     return output_path
+def phase_1_analysis(
+    audio_file, src_lang, tgt_lang, whisper_size, demucs_model,
+    translation_max_length, enable_timestamps, progress=gr.Progress()
 ):
+    """Separate the song, transcribe the vocals and translate the lyrics.
+
+    Returns:
+        A 6-tuple ``(original_lyrics, translated_lyrics, timestamps_info,
+        vocal_path, inst_path, status_message)``. ``timestamps_info`` is a
+        JSON string, or ``None`` when timestamps are disabled or the step
+        failed. On failure the lyric fields are ``""``, the paths are
+        ``None`` and the status message carries the error text.
+    """
+    if not audio_file:
+        # Mirror the generation step's guard: report a clear message instead of a library error.
+        return "", "", None, None, None, "❌ Error: Please upload a song first."
     try:
+        progress(0.1, desc="Separating audio stems...")
         vocal_path, inst_path = separate_audio(audio_file, demucs_model, progress)
+        progress(0.4, desc="Transcribing vocals...")
+        transcription = transcribe_audio(vocal_path, src_lang, whisper_size, enable_timestamps)
         original_lyrics = transcription['text']
+        # The timestamps output is a gr.JSON component, which parses string values as JSON;
+        # an empty string is not valid JSON, so "no timestamps" is reported as None.
+        timestamps_info = json.dumps(transcription.get('chunks', []), indent=2) if enable_timestamps else None
+        progress(0.7, desc="Translating lyrics...")
         translated_lyrics = translate_text(original_lyrics, src_lang, tgt_lang, translation_max_length, progress)
+        progress(1.0, desc="Analysis complete. Please edit lyrics.")
         return (
             original_lyrics,
             translated_lyrics,
             timestamps_info,
+            vocal_path,
             inst_path,
+            "✅ Analysis Complete! You can now edit the lyrics below."
         )
     except Exception as e:
+        # Log with traceback, matching phase_2_generation, so failures are not silent.
+        logger.error(f"Analysis error: {e}", exc_info=True)
+        return "", "", None, None, None, f"❌ Error: {str(e)}"
+def phase_2_generation(
+    edited_lyrics, vocal_path, inst_path,
+    voice_prompt, guidance_scale, inference_steps, use_rvc,
+    speed_min, speed_max, vocal_volume, inst_volume, output_format,
+    progress=gr.Progress()
+):
     # Second step of the workflow: synthesize vocals from the edited lyrics, align them to the
     # original vocal stem and mix them with the instrumental.
     # Returns (raw generated vocal, aligned vocal, final mix, status message); the three audio
     # slots are None whenever the step cannot run or fails.
     # NOTE(review): the target language chosen in the UI is not passed into this step, so
     # generate_vocals synthesizes with its language="en" setting - confirm this is intended.
     # Both stem paths come from the analysis step; without them there is nothing to work on.
+    if not vocal_path or not inst_path:
+        return None, None, None, "❌ Error: Please run analysis first."
+    try:
+        progress(0.1, desc="Generating vocals (TTS)...")
+        generated_raw = generate_vocals(
+            vocal_path, edited_lyrics, voice_prompt,
+            guidance_scale, inference_steps, use_rvc, progress
         )
+        progress(0.6, desc="Aligning audio...")
         # The original vocal stem is the reference; the generated take is the clip being adjusted.
+        aligned_vocal = align_audio_duration(vocal_path, generated_raw, (speed_min, speed_max))
+        progress(0.8, desc="Mixing final audio...")
+        final_audio = mix_audio_stems(aligned_vocal, inst_path, vocal_volume, inst_volume, output_format)
+        progress(1.0, desc="Done!")
+        return generated_raw, aligned_vocal, final_audio, "✅ Song Generation Complete!"
+    except Exception as e:
         # Log with traceback and surface the error text in the status box instead of raising.
+        logger.error(f"Generation error: {e}", exc_info=True)
+        return None, None, None, f"❌ Error: {str(e)}"
+with gr.Blocks(theme=gr.themes.Soft(), title="Professional AI Dubbing") as demo:
     # Hold the vocal / instrumental stem paths handed from the analysis step to the generation step.
+    vocal_state = gr.State()
+    inst_state = gr.State()
+    gr.Markdown("## 🎵 AI Song Translator with RVC & Lyrics Editor")
     # Top row: song upload and analysis settings on the left, lyric boxes and status on the right.
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 1. Analysis & Translation")
+            audio_input = gr.Audio(label="Input Song", type="filepath")
             with gr.Row():
+                src_lang = gr.Dropdown(choices=["es", "en", "fr", "de", "it", "ja", "ko"], value="es", label="Source")
+                tgt_lang = gr.Dropdown(choices=["en", "es", "fr", "de", "it", "ja", "ko"], value="en", label="Target")
+            with gr.Accordion("Analysis Settings", open=False):
+                whisper_size = gr.Dropdown(["base", "small", "large-v3"], value="large-v3", label="Whisper Model")
+                demucs_model = gr.Dropdown(["htdemucs", "htdemucs_ft"], value="htdemucs", label="Demucs Model")
+                enable_timestamps = gr.Checkbox(value=True, label="Timestamps")
+                translation_len = gr.Slider(128, 1024, 512, step=64, label="Translation Chunk")
+            analyze_btn = gr.Button("🔍 Analyze & Translate", variant="primary")
+        with gr.Column(scale=1):
+            gr.Markdown("### 2. Edit Lyrics")
+            original_txt = gr.Textbox(label="Original Lyrics", lines=8, interactive=False)
             # Only the translated lyrics are editable; this box is the lyrics input of the generation step.
+            translated_txt = gr.Textbox(label="Translated Lyrics (Editable)", lines=8, interactive=True)
+            status_box = gr.Textbox(label="System Status", interactive=False)
+    gr.Markdown("---")
     # Bottom row: generation and mixing settings on the left, audio outputs on the right.
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 3. Generation Settings")
+            with gr.Group():
+                use_rvc = gr.Checkbox(value=True, label="Enable RVC Refinement (Natural Sound)")
+                voice_prompt = gr.Textbox(value="clear vocals, high quality", label="Style Prompt")
+            with gr.Accordion("Advanced Mixing", open=False):
+                speed_min = gr.Slider(0.5, 1.0, 0.85, step=0.05, label="Min Speed")
+                speed_max = gr.Slider(1.0, 1.5, 1.15, step=0.05, label="Max Speed")
+                vocal_vol = gr.Slider(-10, 10, 0, label="Vocal dB")
+                inst_vol = gr.Slider(-10, 10, -3, label="Inst dB")
+                fmt = gr.Dropdown(["wav", "mp3"], value="wav", label="Format")
+                guidance = gr.Slider(1, 10, 3.0, step=0.5, label="Guidance")
+                steps = gr.Slider(10, 100, 30, step=5, label="Steps")
+            generate_btn = gr.Button("🎹 Generate Song", variant="stop", size="lg")
+        with gr.Column(scale=1):
+            gr.Markdown("### 4. Final Output")
+            final_out = gr.Audio(label="Final Mixed Song")
+            with gr.Accordion("Stems", open=False):
+                raw_vocal_out = gr.Audio(label="Raw Generated Vocal")
+                aligned_vocal_out = gr.Audio(label="Aligned Vocal")
+                timestamps_out = gr.JSON(label="Timestamps")
     # Step 1: the outputs list must match, in order, the 6-tuple returned by phase_1_analysis.
+    analyze_btn.click(
+        fn=phase_1_analysis,
+        inputs=[audio_input, src_lang, tgt_lang, whisper_size, demucs_model, translation_len, enable_timestamps],
+        outputs=[original_txt, translated_txt, timestamps_out, vocal_state, inst_state, status_box]
+    )
     # Step 2: the inputs list must match, in order, the positional parameters of phase_2_generation.
+    generate_btn.click(
+        fn=phase_2_generation,
         inputs=[
+            translated_txt, vocal_state, inst_state,
+            voice_prompt, guidance, steps, use_rvc,
+            speed_min, speed_max, vocal_vol, inst_vol, fmt
         ],
+        outputs=[raw_vocal_out, aligned_vocal_out, final_out, status_box]
     )
 if __name__ == "__main__":
     # Enable request queuing, then start the app with the default launch settings.
+    demo.queue().launch()