Spaces:

Hematej
/

voice-clone-text-generate

Running

App Files Files Community

Hematej committed on Jun 2, 2025

Commit

8c99301

verified ·

1 Parent(s): 300bc4b

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -63

app.py CHANGED Viewed

@@ -2,111 +2,111 @@ import gradio as gr
 from TTS.api import TTS
 import torch
 import os
 css = """
 #warning {background-color: #FFCCCB !important}
-.feedback label textarea {height: auto !important;
-                    font-size: 22px !important;
-                    font-weight: 800 !important;
-                    text-align: center !important;
-                    color: #801313 !important;
-                    padding: 0px !important}
 #alert {background-color: #fff !important}
 """
-# ✅ Check CPU/GPU availability before loading models
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# ✅ Explicitly define `gpu` settings
 tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=(device=="cuda"))
-zh_tts = TTS(model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST", progress_bar=False, gpu=(device=="cuda"))
-de_tts = TTS(model_name="tts_models/de/thorsten/vits", gpu=(device=="cuda"))
-es_tts = TTS(model_name="tts_models/es/mai/tacotron2-DDC", progress_bar=False, gpu=(device=="cuda"))
-# ✅ Ensure correct weight loading
 tts.to(device)
-zh_tts.to(device)
-de_tts.to(device)
-es_tts.to(device)
-def text_to_speech(text: str, speaker_wav: str, speaker_wav_file: str) -> str:
-    # ✅ Sanitize input text to avoid empty processing
-    text = text.strip().replace("\n", " ").replace("  ", " ")
-    if not text:
-        return "Error: No text provided."
-    return change_aud(text, speaker_wav, speaker_wav_file)
-def change_aud(text: str, speaker_wav: str, speaker_wav_file: str) -> str:
-    # ✅ Ensure speaker file is correctly assigned
-    if speaker_wav_file and not speaker_wav:
-        speaker_wav = speaker_wav_file
-    # ✅ Validate audio input
-    if not speaker_wav or not os.path.exists(speaker_wav):
-        return "Error: No valid speaker audio provided."
-    if speaker_wav.endswith(".mp3"):
-        return "Error: MP3 format not supported. Convert to WAV."
-    file_path = "output.wav"
     try:
-        tts.tts_to_file(text, speaker_wav=speaker_wav, language="en", file_path=file_path)
-        # ✅ Debugging print statement to confirm output generation
-        if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
-            print(f"Generated file path: {file_path}, Size: {os.path.getsize(file_path)} bytes")
-            return file_path
         else:
-            return "Error: Output file was not properly generated."
     except Exception as e:
-        return f"Error generating cloned voice: {str(e)}"
-def show_error(text: str):
-    # ✅ Ensure function returns expected outputs for Gradio UI updates
-    return (
-        gr.update(visible=(text == ""), elem_id="warning", elem_classes="feedback"),
-        gr.update(visible=(text != ""))
-    )
-title = "Voice-Cloning-Demo"
 def toggle(choice: str):
     return (
         gr.update(visible=(choice == "mic"), value=None),
         gr.update(visible=(choice != "mic"), value=None)
     )
 def change_color(text_input: str):
     return gr.update(elem_id="warning" if len(text_input) == 0 else "alert", autofocus=(len(text_input) == 0))
 def clear_color(text_input: str, radio: str, error_box: str):
     return gr.update(elem_id="alert"), gr.update(value="mic"), gr.update(visible=False)
-with gr.Blocks(css="footer {visibility: hidden}") as demo:
     with gr.Row():
         with gr.Column():
-            text_input = gr.Textbox(label="Input the text", value="", max_lines=4, lines=4)
-            radio = gr.Radio(["mic", "file"], value="mic", label="How would you like to upload your audio?")
-            audio_input_mic = gr.Audio(label="Voice to clone", sources="microphone", type="filepath", visible=True)
-            audio_input_file = gr.Audio(label="Voice to clone", type="filepath", visible=False)
             with gr.Row():
                 with gr.Column():
                     btn_clear = gr.ClearButton([text_input, radio, audio_input_file])
                 with gr.Column():
-                    btn = gr.Button("Generate", variant="primary")
         with gr.Column():
-            audio_output = gr.Audio(label="Output", visible=True, autoplay=True, show_share_button=False)
-            error_box = gr.Textbox(label="WARNING", value="Input box cannot be blank!!", visible=False, container=True)
     btn_clear.add(audio_output)
-    btn.click(text_to_speech, inputs=[text_input, audio_input_mic, audio_input_file], outputs=audio_output)
-    btn.click(show_error, text_input, [error_box, audio_output])  # ✅ Fixed output values
     radio.change(toggle, radio, [audio_input_mic, audio_input_file])
     btn_clear.click(clear_color, [text_input, radio, error_box], [text_input, radio, error_box])
     btn.click(change_color, text_input, text_input)
-demo.launch()

 from TTS.api import TTS
 import torch
 import os
+from pydub import AudioSegment
+# Custom CSS handed to gr.Blocks: #warning / #alert are the element ids that change_color switches between, and .feedback styles the status textarea.
 css = """
 #warning {background-color: #FFCCCB !important}
+.feedback label textarea {
+    height: auto !important;
+    font-size: 22px !important;
+    font-weight: 800 !important;
+    text-align: center !important;
+    color: #801313 !important;
+    padding: 0px !important
+}
 #alert {background-color: #fff !important}
 """
+# Pick the compute device once at import time: CUDA when torch reports it as available, otherwise CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load the multilingual YourTTS model at import time; the gpu flag mirrors the device chosen above.
 tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=(device=="cuda"))
 tts.to(device)  # explicit move to the selected device, in addition to the gpu= flag above
+# Convert mp3 to wav
def convert_mp3_to_wav(mp3_path: str) -> str:
    """Transcode an MP3 file to WAV and return the path of the WAV file.

    The WAV file is written next to the source file; only the final
    extension of ``mp3_path`` is swapped for ``.wav``.

    Args:
        mp3_path: Path of the MP3 file to convert.

    Returns:
        Path of the WAV file that was written.

    Raises:
        Any exception raised by ``AudioSegment.from_mp3`` or
        ``AudioSegment.export`` propagates unchanged; nothing is caught here.
    """
    # Swap only the trailing extension. A plain str.replace(".mp3", ".wav")
    # also rewrites ".mp3" inside directory names (for example
    # "takes.mp3.bak/voice.mp3") and leaves the path untouched when the
    # extension does not match, which would export over the source file.
    stem, _extension = os.path.splitext(mp3_path)
    wav_path = f"{stem}.wav"
    audio = AudioSegment.from_mp3(mp3_path)
    audio.export(wav_path, format="wav")
    return wav_path
+# Voice cloning function
def text_to_speech(text: str, speaker_wav: str, speaker_wav_file: str):
    """Clone the reference voice and speak ``text`` with it.

    Args:
        text: Text to synthesize. Surrounding whitespace is stripped and
            newlines become spaces; ``None`` is treated as empty text.
        speaker_wav: File path of the microphone recording, or a falsy
            value when nothing was recorded.
        speaker_wav_file: File path of the uploaded recording, or a falsy
            value. Takes precedence over ``speaker_wav`` when both are set.

    Returns:
        A ``(audio_path, message)`` pair: the generated WAV path and ``""``
        on success, or ``None`` and a user-facing error message on failure.
    """
    # A cleared textbox may hand over None instead of ""; normalise first so
    # the emptiness check reports it instead of raising AttributeError.
    text = (text or "").strip().replace("\n", " ")
    speaker_audio = speaker_wav_file or speaker_wav

    if not text:
        return None, "⚠️ Error: Text input is empty."
    if not speaker_audio or not os.path.exists(speaker_audio):
        return None, "⚠️ Error: No valid speaker audio provided."

    if speaker_audio.endswith(".mp3"):
        try:
            speaker_audio = convert_mp3_to_wav(speaker_audio)
        except Exception as e:
            return None, f"⚠️ Error converting MP3 to WAV: {e}"

    output_path = "output.wav"
    try:
        # Remove audio left behind by an earlier request; otherwise the
        # existence/size check below could pass on stale output when the
        # synthesis call wrote nothing new.
        if os.path.exists(output_path):
            os.remove(output_path)
        tts.tts_to_file(text=text, speaker_wav=speaker_audio, language="en", file_path=output_path)
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, ""
        return None, "⚠️ Error: Audio was not generated."
    except Exception as e:
        # UI boundary: report the failure as a status message rather than
        # letting the exception escape the event handler.
        return None, f"⚠️ Error during synthesis: {e}"
+# Toggle mic/file input visibility
 def toggle(choice: str):
     """Show the audio input matching the selected source and reset both.

     Args:
         choice: Value of the source radio; ``"mic"`` selects the microphone
             input, any other value selects the file input.

     Returns:
         Two ``gr.update`` objects, for the microphone input and the file
         input in that order. Each sets the visibility and clears the value.
     """
     use_mic = choice == "mic"
     mic_update = gr.update(visible=use_mic, value=None)
     file_update = gr.update(visible=not use_mic, value=None)
     return mic_update, file_update
+# Change alert style based on input
 def change_color(text_input: str):
     """Flag the text box as a warning when it is blank.

     Args:
         text_input: Current textbox value. ``None`` is treated the same as
             an empty string, so a cleared textbox does not raise.

     Returns:
         A ``gr.update`` that requests element id ``"warning"`` and autofocus
         for blank input, or element id ``"alert"`` without autofocus
         otherwise.
     """
     # Truthiness covers both "" and None; len(None) would raise TypeError.
     is_blank = not text_input
     return gr.update(elem_id="warning" if is_blank else "alert", autofocus=is_blank)
+# Reset fields
 def clear_color(text_input: str, radio: str, error_box: str):
     """Build the updates that reset the form after the Clear button.

     The three arguments are accepted to match the event's inputs; their
     values are not read.

     Returns:
         Three ``gr.update`` objects, in order: text box back to element id
         ``"alert"``, radio back to ``"mic"``, and the status box hidden.
     """
     text_update = gr.update(elem_id="alert")
     radio_update = gr.update(value="mic")
     status_update = gr.update(visible=False)
     return text_update, radio_update, status_update
+# Show error or success
def show_error(text: str):
    """Choose between showing the warning box and showing the audio output.

    Args:
        text: Current textbox value.

    Returns:
        Two ``gr.update`` objects. The first makes the warning box visible
        only when ``text`` is the empty string and tags it with element id
        ``"warning"`` and class ``"feedback"``. The second makes the audio
        output visible only when ``text`` is not the empty string.
    """
    is_blank = text == ""
    warning_update = gr.update(visible=is_blank, elem_id="warning", elem_classes="feedback")
    output_update = gr.update(visible=not is_blank)
    return warning_update, output_update
+# Gradio UI
def _reveal_status(message: str):
    """Show the status box when it holds a message, else show the audio.

    Runs after ``text_to_speech`` has written its message into the status
    box, so every failure (blank text, missing speaker audio, conversion or
    synthesis error) becomes visible, not only the blank-text case.
    """
    has_message = bool(message)
    return (
        gr.update(visible=has_message, elem_id="warning", elem_classes="feedback"),
        gr.update(visible=not has_message),
    )


# Gradio UI: inputs on the left, generated audio and status on the right.
with gr.Blocks(css=css) as demo:
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Enter text to clone", value="", max_lines=4, lines=4)
            radio = gr.Radio(["mic", "file"], value="mic", label="Upload speaker audio")
            audio_input_mic = gr.Audio(label="Use Microphone", sources="microphone", type="filepath", visible=True)
            audio_input_file = gr.Audio(label="Upload File (.wav/.mp3)", type="filepath", visible=False)
            with gr.Row():
                with gr.Column():
                    btn_clear = gr.ClearButton([text_input, radio, audio_input_file])
                with gr.Column():
                    btn = gr.Button("Generate Voice", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Voice", visible=True, autoplay=True, show_share_button=False)
            error_box = gr.Textbox(label="Status", value="Input box cannot be blank!!", visible=False, container=True)

    # Event bindings
    # Clear must also drop the microphone recording and the generated audio,
    # not only the uploaded file.
    btn_clear.add([audio_input_mic, audio_output])

    # Synthesis writes (audio, message). The chained step then decides
    # visibility from that message. A second independent btn.click on the
    # same outputs would race with the synthesis result and would only ever
    # reveal the status box for blank text.
    generation = btn.click(
        text_to_speech,
        inputs=[text_input, audio_input_mic, audio_input_file],
        outputs=[audio_output, error_box],
    )
    generation.then(_reveal_status, error_box, [error_box, audio_output])

    radio.change(toggle, radio, [audio_input_mic, audio_input_file])
    btn_clear.click(clear_color, [text_input, radio, error_box], [text_input, radio, error_box])
    btn.click(change_color, text_input, text_input)

# Launch the app
demo.launch()