Update app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ from pydub import AudioSegment
|
|
| 6 |
import os
|
| 7 |
import re
|
| 8 |
import soundfile as sf
|
|
|
|
| 9 |
|
| 10 |
# Security bypass and TOS agreement
|
| 11 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
|
@@ -19,19 +20,22 @@ torch.load = patched_torch_load
|
|
| 19 |
|
| 20 |
# Initialize XTTS model
|
| 21 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 22 |
-
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
|
| 23 |
|
| 24 |
def extract_speaker_embedding(audio_path):
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
def split_text(text, max_length=182):
|
| 37 |
sentences = []
|
|
@@ -59,63 +63,91 @@ def split_text(text, max_length=182):
|
|
| 59 |
return processed
|
| 60 |
|
| 61 |
def synthesize_speech(text, embedding_path):
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
# Gradio Interface
|
| 91 |
-
with gr.Blocks() as demo:
|
| 92 |
-
gr.Markdown("# XTTS v2 Voice Cloning Demo")
|
| 93 |
|
| 94 |
-
with gr.Tab("
|
| 95 |
-
gr.Markdown("Upload a Russian audio sample (3-10 seconds)")
|
| 96 |
with gr.Row():
|
| 97 |
-
audio_input = gr.Audio(
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
)
|
| 105 |
|
| 106 |
-
with gr.Tab("
|
| 107 |
gr.Markdown("Upload embedding and enter Russian text")
|
| 108 |
with gr.Row():
|
| 109 |
-
text_input = gr.Textbox(
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
with gr.Row():
|
| 112 |
-
audio_output = gr.Audio(
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
if __name__ == "__main__":
|
| 121 |
-
demo.launch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
import os
|
| 7 |
import re
|
| 8 |
import soundfile as sf
|
| 9 |
+
import time
|
| 10 |
|
| 11 |
# Security bypass and TOS agreement
|
| 12 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
|
|
|
| 20 |
|
| 21 |
# Initialize XTTS model
|
| 22 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 23 |
+
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
|
| 24 |
|
| 25 |
def extract_speaker_embedding(audio_path):
    """Extract and save XTTS conditioning latents for a reference voice.

    Args:
        audio_path: Filesystem path to the reference audio clip, as
            delivered by the Gradio audio component (``type="filepath"``).

    Returns:
        Path to ``speaker_embedding.pth`` containing the
        ``gpt_cond_latent`` and ``speaker_embedding`` tensors
        (moved to CPU so the file loads on any device).

    Raises:
        gr.Error: If no audio was provided or latent extraction fails.
    """
    # Guard: clicking the button with no audio uploaded would otherwise
    # surface an opaque internal model error instead of a clear message.
    if audio_path is None:
        raise gr.Error("Please upload or record a reference audio sample first.")
    try:
        # Use the model's built-in conditioning-latent extraction
        # (the API expects a list of audio paths).
        gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
            audio_path=[audio_path]
        )

        # Persist both latents on CPU for device-independent reloading.
        embedding_path = "speaker_embedding.pth"
        torch.save({
            "gpt_cond_latent": gpt_cond_latent.cpu(),
            "speaker_embedding": speaker_embedding.cpu(),
        }, embedding_path)
        return embedding_path
    except Exception as e:
        raise gr.Error(f"Error extracting embedding: {str(e)}")
|
| 39 |
|
| 40 |
def split_text(text, max_length=182):
|
| 41 |
sentences = []
|
|
|
|
| 63 |
return processed
|
| 64 |
|
| 65 |
def synthesize_speech(text, embedding_path):
    """Synthesize Russian speech from text using saved voice latents.

    Args:
        text: Text to synthesize; split into chunks via ``split_text``.
        embedding_path: Path (or Gradio file wrapper) to a ``.pth`` file
            produced by ``extract_speaker_embedding``.

    Returns:
        Path to ``output.wav`` containing the concatenated audio.

    Raises:
        gr.Error: If inputs are missing or synthesis fails.
    """
    # Guard clauses: fail with clear messages before touching the model.
    if embedding_path is None:
        raise gr.Error("Please upload a speaker embedding file first.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    try:
        # gr.File may hand back a tempfile-like wrapper; prefer its .name path.
        path = getattr(embedding_path, "name", embedding_path)
        # map_location keeps loading robust regardless of the saving device.
        # NOTE(review): torch.load unpickles arbitrary data — only load
        # embedding files from trusted sources.
        embeddings = torch.load(path, map_location="cpu")
        gpt_cond_latent = embeddings["gpt_cond_latent"].to(device)
        speaker_embedding = embeddings["speaker_embedding"].to(device)

        # Split long input into model-friendly chunks.
        text_chunks = split_text(text)

        # Synthesize each chunk independently, then stitch the waveforms.
        audio_chunks = []
        for chunk in text_chunks:
            out = tts.synthesizer.tts_model.inference(
                chunk,
                "ru",
                gpt_cond_latent,
                speaker_embedding,
                temperature=0.7,
                length_penalty=1.0,
                repetition_penalty=2.0,
            )
            # Convert the output tensor to a flat numpy array.
            audio = out["wav"].squeeze().cpu().numpy()
            audio_chunks.append(audio)

        # Combine chunks and write at XTTS v2's native 24 kHz sample rate.
        full_audio = np.concatenate(audio_chunks)
        output_path = "output.wav"
        sf.write(output_path, full_audio, 24000)
        return output_path
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}")
|
| 99 |
|
| 100 |
# Gradio Interface: two tabs — one to create a voice embedding from a
# reference clip, one to generate speech from text plus a saved embedding.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🐸 XTTS v2 Voice Cloning Demo")

    with gr.Tab("🔊 Voice Embedding Creation"):
        gr.Markdown("Upload a short Russian audio sample (3-10 seconds)")
        with gr.Row():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Input Audio",
                waveform_options={"sample_rate": 24000}
            )
        embedding_output = gr.File(label="Saved Embedding")
        extract_btn = gr.Button("Create Voice Embedding", variant="primary")

    # NOTE(review): original tab label contained a mojibake-corrupted emoji
    # ("����"); replaced with a plausible intact one — confirm intent.
    with gr.Tab("🗣️ Speech Generation"):
        gr.Markdown("Upload embedding and enter Russian text")
        with gr.Row():
            text_input = gr.Textbox(
                label="Text Input",
                placeholder="Enter text to synthesize...",
                lines=4,
                max_lines=10
            )
        embedding_input = gr.File(label="Upload Embedding File")
        with gr.Row():
            audio_output = gr.Audio(
                label="Generated Speech",
                autoplay=True,
                waveform_options={"sample_rate": 24000}
            )
        synth_btn = gr.Button("Generate Speech", variant="primary")

    # Event handlers: wire buttons to the processing functions above.
    extract_btn.click(
        extract_speaker_embedding,
        inputs=audio_input,
        outputs=embedding_output
    )

    synth_btn.click(
        synthesize_speech,
        inputs=[text_input, embedding_input],
        outputs=audio_output
    )
|
| 146 |
|
| 147 |
# Entry point: serve the demo locally without creating a public share link.
if __name__ == "__main__":
    launch_config = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,       # standard Gradio port
        "share": False,            # no public gradio.live tunnel
        "show_error": True,        # surface Python errors in the UI
    }
    demo.launch(**launch_config)
|