import gradio as gr
import torch
import numpy as np
from TTS.api import TTS
import os
import re
import soundfile as sf

# Agree to the Coqui TTS terms of service so the model download
# does not prompt interactively
os.environ["COQUI_TOS_AGREED"] = "1"

# Patch torch.load to force weights_only=False: newer PyTorch releases
# default to weights_only=True, which breaks loading the XTTS checkpoint
# and the saved speaker embeddings. Only do this for checkpoints you trust.
original_torch_load = torch.load

def patched_torch_load(*args, **kwargs):
    kwargs["weights_only"] = False
    return original_torch_load(*args, **kwargs)

torch.load = patched_torch_load

# Initialize the XTTS v2 model
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)


def extract_speaker_embedding(audio_path):
    try:
        # Compute conditioning latents from the reference audio using the
        # model's built-in method
        gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
            audio_path=[audio_path]
        )

        # Save both latents so they can be reused without the reference audio
        embedding_path = "speaker_embedding.pth"
        torch.save(
            {
                "gpt_cond_latent": gpt_cond_latent.cpu(),
                "speaker_embedding": speaker_embedding.cpu(),
            },
            embedding_path,
        )
        return embedding_path
    except Exception as e:
        raise gr.Error(f"Error extracting embedding: {str(e)}")


def split_text(text, max_length=182):
    """Split text into chunks of at most max_length characters on word
    boundaries, ensuring each chunk ends with sentence punctuation."""
    sentences = []
    current = []
    current_len = 0
    # Split on whitespace but keep the separators so spacing is preserved
    words = re.split(r"(\s+)", text)
    for word in words:
        if current_len + len(word) > max_length and current:
            sentences.append("".join(current).strip())
            current = []
            current_len = 0
        current.append(word)
        current_len += len(word)
    if current:
        sentences.append("".join(current).strip())

    # Terminate each chunk with sentence punctuation, skipping empty chunks
    processed = []
    for s in sentences:
        if not s:
            continue
        if not s.endswith((".", "!", "?")):
            s += "."
        processed.append(s)
    return processed


def synthesize_speech(text, embedding_path):
    try:
        # Load the saved conditioning latents
        embeddings = torch.load(embedding_path)
        gpt_cond_latent = embeddings["gpt_cond_latent"].to(device)
        speaker_embedding = embeddings["speaker_embedding"].to(device)

        # Split text into chunks XTTS can handle
        text_chunks = split_text(text)

        # Synthesize each chunk
        audio_chunks = []
        for chunk in text_chunks:
            out = tts.synthesizer.tts_model.inference(
                chunk,
                "ru",
                gpt_cond_latent,
                speaker_embedding,
                temperature=0.7,
                length_penalty=1.0,
                repetition_penalty=2.0,
            )
            # Handle both tensor and numpy array outputs
            wav = out["wav"].squeeze()
            if isinstance(wav, torch.Tensor):
                audio_chunks.append(wav.cpu().numpy())
            else:
                audio_chunks.append(wav)

        # Concatenate the chunks and save at the XTTS output rate (24 kHz)
        full_audio = np.concatenate(audio_chunks)
        output_path = "output.wav"
        sf.write(output_path, full_audio, 24000)
        return output_path
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}")


# Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🐸 XTTS v2 Voice Cloning Demo")

    with gr.Tab("🔊 Voice Embedding Creation"):
        gr.Markdown("Upload a short Russian audio sample (3-10 seconds)")
        with gr.Row():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Input Audio",
                waveform_options={"sample_rate": 24000},
            )
            embedding_output = gr.File(label="Saved Embedding")
        extract_btn = gr.Button("Create Voice Embedding", variant="primary")

    with gr.Tab("📢 Speech Generation"):
        gr.Markdown("Upload embedding and enter Russian text")
        with gr.Row():
            text_input = gr.Textbox(
                label="Text Input",
                placeholder="Enter text to synthesize...",
                lines=4,
                max_lines=10,
            )
            embedding_input = gr.File(label="Upload Embedding File")
        with gr.Row():
            audio_output = gr.Audio(
                label="Generated Speech",
                autoplay=True,
                waveform_options={"sample_rate": 24000},
            )
        synth_btn = gr.Button("Generate Speech", variant="primary")

    # Event handlers
    extract_btn.click(
        extract_speaker_embedding,
        inputs=audio_input,
        outputs=embedding_output,
    )
    synth_btn.click(
        synthesize_speech,
        inputs=[text_input, embedding_input],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
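
# Example programmatic usage without the UI -- a minimal sketch, assuming a
# short Russian reference recording named "sample.wav" exists in the working
# directory (the filename is illustrative, not part of the app):
#
#   emb_path = extract_speaker_embedding("sample.wav")
#   wav_path = synthesize_speech(
#       "Привет! Это тест клонирования голоса.",  # "Hello! This is a voice cloning test."
#       emb_path,
#   )
#   print("Generated audio saved to", wav_path)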