Spaces:

crackuser
/

voiceclone-dev

Running

App Files Community

crackuser committed on Sep 12, 2025

Commit

71d678c

verified ·

1 Parent(s): f61fc95

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -86

app.py CHANGED Viewed

@@ -8,7 +8,6 @@ from contextlib import contextmanager
 warnings.filterwarnings("ignore")
 os.environ["COQUI_TOS_AGREED"] = "1"
 print("🚀 Starting Voice Cloning Studio...")
 @contextmanager
@@ -63,12 +62,25 @@ def load_whisper():
         return False
 def voice_to_voice_clone(reference_audio, input_audio, language="en"):
     try:
         if not reference_audio or not input_audio:
             return None, "❌ Please upload both reference and input audio files!"
         if not load_xtts_manual():
             return None, f"❌ XTTS loading failed!\nStatus: {MODEL_STATUS}"
         load_whisper()
         extracted_text = "Voice cloning demonstration."
         if WHISPER_MODEL:
             try:
@@ -79,8 +91,12 @@ def voice_to_voice_clone(reference_audio, input_audio, language="en"):
                 print(f"✅ Extracted: '{extracted_text[:100]}...'")
             except Exception as e:
                 print(f"⚠️ Whisper error: {e}")
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
         with patch_torch_load():
             TTS_MODEL.tts_to_file(
                 text=extracted_text,
@@ -88,99 +104,68 @@ def voice_to_voice_clone(reference_audio, input_audio, language="en"):
                 language=language,
                 file_path=output_path
             )
         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-            return output_path, f"""✅ VOICE-TO-VOICE CLONING SUCCESS!
 📝 Content: '{extracted_text[:150]}...'
 🎭 Device: {DEVICE}
 🔧 Status: {MODEL_STATUS}
 """
         else:
             return None, "❌ Generated audio file is empty!"
     except Exception as e:
-        return None, f"❌ Voice cloning error: {str(e)}\nModel: {MODEL_STATUS}"
-with gr.Blocks(title="Voice Cloning Studio") as demo:
-    gr.HTML("""
-    <div style="text-align: center; padding: 25px;">
-        <h1>🎭 REAL Voice Cloning Studio</h1>
-        <p>Status: Models load on first use</p>
-    </div>
-    """)
-    with gr.Row():
-        with gr.Column():
-            reference_audio = gr.Audio(
-                label="🎤 Reference Audio (Voice to Clone)",
-                type="filepath",
-                sources=["upload", "microphone"]
-            )
-            input_audio = gr.Audio(
-                label="🎵 Input Audio (Content to Transform)",
-                type="filepath",
-                sources=["upload", "microphone"]
-            )
-            language = gr.Dropdown(
-                choices=[
-                    ("English", "en"),
-                    ("Spanish", "es"),
-                    ("French", "fr"),
-                    ("German", "de"),
-                    ("Italian", "it"),
-                    ("Portuguese", "pt"),
-                    ("Polish", "pl"),
-                    ("Turkish", "tr"),
-                    ("Russian", "ru"),
-                    ("Dutch", "nl"),
-                    ("Czech", "cs"),
-                    ("Arabic", "ar"),
-                    ("Chinese", "zh"),
-                    ("Japanese", "ja"),
-                    ("Korean", "ko"),
-                    ("Hindi", "hi"),
-                    ("Ukrainian", "uk"),
-                    ("Vietnamese", "vi"),
-                    ("Romanian", "ro"),
-                    ("Greek", "el"),
-                    ("Hebrew", "he"),
-                    ("Finnish", "fi"),
-                    ("Hungarian", "hu"),
-                    ("Swedish", "sv"),
-                    ("Catalan", "ca"),
-                    ("Indonesian", "id"),
-                    ("Malay", "ms"),
-                    ("Bulgarian", "bg"),
-                    ("Slovak", "sk"),
-                    ("Danish", "da"),
-                    ("Norwegian", "no"),
-                    ("Lithuanian", "lt"),
-                    ("Croatian", "hr"),
-                    ("Serbian", "sr"),
-                    ("Slovenian", "sl"),
-                    ("Estonian", "et"),
-                    ("Latvian", "lv"),
-                    ("Filipino", "fil"),
-                    ("Bengali", "bn"),
-                    ("Tamil", "ta"),
-                    ("Telugu", "te"),
-                    ("Urdu", "ur"),
-                    ("Farsi", "fa"),
-                    ("Thai", "th"),
-                ],
-                value="en",
-                label="Language"
-            )
-            clone_btn = gr.Button("Clone Voice", variant="primary", size="lg")
-        with gr.Column():
-            output_audio = gr.Audio(label="Cloned Voice Result")
-            status_output = gr.Textbox(label="Status", lines=12, interactive=False)
-    clone_btn.click(
-        fn=voice_to_voice_clone,
-        inputs=[reference_audio, input_audio, language],
-        outputs=[output_audio, status_output],
-        show_progress=True
-    )
 if __name__ == "__main__":
-    demo.launch()

 warnings.filterwarnings("ignore")
 os.environ["COQUI_TOS_AGREED"] = "1"
 print("🚀 Starting Voice Cloning Studio...")
 @contextmanager
         return False
 def voice_to_voice_clone(reference_audio, input_audio, language="en"):
+    """
+    Main voice cloning function - this will be called by both UI and API
+    """
     try:
+        print(f"🎭 Voice cloning request: {language}")
+        print(f"📁 Reference: {reference_audio}")
+        print(f"📁 Input: {input_audio}")
         if not reference_audio or not input_audio:
             return None, "❌ Please upload both reference and input audio files!"
+        # Load XTTS model
         if not load_xtts_manual():
             return None, f"❌ XTTS loading failed!\nStatus: {MODEL_STATUS}"
+        # Load Whisper for transcription
         load_whisper()
+        # Extract text from input audio
         extracted_text = "Voice cloning demonstration."
         if WHISPER_MODEL:
             try:
                 print(f"✅ Extracted: '{extracted_text[:100]}...'")
             except Exception as e:
                 print(f"⚠️ Whisper error: {e}")
+        # Generate cloned voice
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
             output_path = tmp_file.name
+        print(f"🔄 Generating voice clone...")
         with patch_torch_load():
             TTS_MODEL.tts_to_file(
                 text=extracted_text,
                 language=language,
                 file_path=output_path
             )
+        # Verify output
         if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+            success_message = f"""✅ VOICE-TO-VOICE CLONING SUCCESS!
 📝 Content: '{extracted_text[:150]}...'
 🎭 Device: {DEVICE}
 🔧 Status: {MODEL_STATUS}
+📊 Output size: {os.path.getsize(output_path)} bytes
 """
+            print("✅ Voice cloning completed successfully!")
+            return output_path, success_message
         else:
             return None, "❌ Generated audio file is empty!"
     except Exception as e:
+        error_msg = f"❌ Voice cloning error: {str(e)}\nModel: {MODEL_STATUS}"
+        print(error_msg)
+        return None, error_msg
+# FIXED: Use gr.Interface instead of gr.Blocks for proper API exposure
+interface = gr.Interface(
+    fn=voice_to_voice_clone,
+    inputs=[
+        gr.Audio(
+            label="🎤 Reference Audio (Voice to Clone)",
+            type="filepath",
+            sources=["upload"]
+        ),
+        gr.Audio(
+            label="🎵 Input Audio (Content to Transform)",
+            type="filepath",
+            sources=["upload"]
+        ),
+        gr.Dropdown(
+            choices=[
+                "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl",
+                "cs", "ar", "zh", "ja", "ko", "hi", "uk", "vi", "ro", "el",
+                "he", "fi", "hu", "sv", "ca", "id", "ms", "bg", "sk", "da",
+                "no", "lt", "hr", "sr", "sl", "et", "lv", "fil", "bn", "ta",
+                "te", "ur", "fa", "th"
+            ],
+            value="en",
+            label="🌍 Language"
+        )
+    ],
+    outputs=[
+        gr.Audio(label="🎉 Cloned Voice Result"),
+        gr.Textbox(label="📋 Status", lines=8)
+    ],
+    title="🎭 REAL Voice Cloning Studio",
+    description="Transform any voice into any other voice using XTTS-v2 and Whisper AI models. Upload reference audio and input audio to get started.",
+    theme=gr.themes.Soft(),
+    allow_flagging="never",
+    api_name="voice_to_voice_clone"  # CRITICAL: This creates the API endpoint
+)
 if __name__ == "__main__":
+    print("🌐 Launching Voice Cloning Studio...")
+    interface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_api=True,  # Shows API documentation
+        debug=True
+    )