File size: 4,865 Bytes
844d1a1
 
98659fe
65a784c
98659fe
 
4b0bfb1
98659fe
a60e434
98659fe
561919f
98659fe
6418466
561919f
6418466
 
 
 
 
844d1a1
98659fe
844d1a1
dc4db49
65a784c
98659fe
a60e434
 
 
 
 
 
 
 
 
 
 
 
 
98659fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561919f
98659fe
 
 
 
 
 
a60e434
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0529fb1
 
 
 
 
 
 
a60e434
 
 
 
 
 
 
844d1a1
4b0bfb1
a60e434
 
98659fe
a60e434
 
98659fe
a60e434
 
 
 
 
 
 
 
98659fe
a60e434
4b0bfb1
 
a60e434
 
 
 
 
 
 
98659fe
a60e434
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9af3f5d
 
a60e434
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import os
import re
import tempfile
import time

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from pydub import AudioSegment
from TTS.api import TTS

# Security bypass and TOS agreement
# Accept Coqui's terms of service non-interactively so the model download
# does not block on a console prompt.
os.environ["COQUI_TOS_AGREED"] = "1"

# Patch torch.load for embedding loading
# Recent torch versions default to weights_only=True, which refuses the
# pickled latent dicts this app saves and loads. Force full unpickling.
# NOTE(review): this disables torch's safe-load protection for EVERY
# torch.load call in the process — only load embedding files you trust.
original_torch_load = torch.load
def patched_torch_load(*args, **kwargs):
    # Unconditionally overrides any caller-supplied weights_only value.
    kwargs['weights_only'] = False
    return original_torch_load(*args, **kwargs)
torch.load = patched_torch_load

# Initialize XTTS model
# Load the multilingual XTTS v2 checkpoint once at import time, on GPU when
# available. NOTE(review): presumably triggers a model download on first
# run — confirm disk/network expectations for deployment.
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

def extract_speaker_embedding(audio_path):
    """Compute XTTS conditioning latents for a reference audio clip.

    Args:
        audio_path: Filesystem path to the uploaded reference recording.

    Returns:
        Path to a ``.pth`` file containing the ``gpt_cond_latent`` and
        ``speaker_embedding`` tensors (moved to CPU) for later synthesis.

    Raises:
        gr.Error: Wraps any failure so Gradio surfaces it in the UI.
    """
    try:
        # Get conditioning latents using the model's built-in method.
        gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
            audio_path=[audio_path]
        )

        # Save to a unique temp file: the previous fixed filename
        # ("speaker_embedding.pth") was silently clobbered whenever two
        # users hit the shared Gradio app at the same time.
        fd, embedding_path = tempfile.mkstemp(suffix=".pth", prefix="speaker_embedding_")
        os.close(fd)  # torch.save reopens the path itself
        torch.save(
            {
                "gpt_cond_latent": gpt_cond_latent.cpu(),
                "speaker_embedding": speaker_embedding.cpu(),
            },
            embedding_path,
        )
        return embedding_path
    except Exception as e:
        raise gr.Error(f"Error extracting embedding: {str(e)}")

def split_text(text, max_length=182):
    """Split *text* into whitespace-preserving chunks for XTTS synthesis.

    Each chunk is at most ~``max_length`` characters (a single word longer
    than the limit is kept intact in its own chunk) and is terminated with
    '.', '!' or '?' so the TTS model treats it as a full sentence.

    Fixes over the original:
      * empty / whitespace-only input returns [] instead of a spurious
        "." chunk that would be sent to the synthesizer;
      * an over-long first word no longer flushes an empty "" chunk.

    Args:
        text: Input text (may be empty).
        max_length: Soft per-chunk character budget.

    Returns:
        List of non-empty, punctuation-terminated chunk strings.
    """
    chunks = []
    current = []
    current_len = 0

    # Split on whitespace but keep the separators so chunk lengths are
    # measured on the text exactly as written.
    for word in re.split(r'(\s+)', text):
        if current_len + len(word) > max_length and current:
            piece = "".join(current).strip()
            if piece:  # skip whitespace-only accumulations
                chunks.append(piece)
            current = []
            current_len = 0
        current.append(word)
        current_len += len(word)

    if current:
        piece = "".join(current).strip()
        if piece:
            chunks.append(piece)

    # Ensure every chunk ends with sentence punctuation.
    return [c if c.endswith(('.', '!', '?')) else c + '.' for c in chunks]

def synthesize_speech(text, embedding_path, language="ru"):
    """Synthesize speech from *text* using previously saved speaker latents.

    Args:
        text: Text to speak; split into <=182-character chunks first.
        embedding_path: ``.pth`` file produced by extract_speaker_embedding.
        language: XTTS language code (default "ru", matching the UI).

    Returns:
        Path to the generated 24 kHz WAV file.

    Raises:
        gr.Error: On empty/missing inputs or any synthesis failure.
    """
    try:
        # Fail early with clear messages instead of crashing later in
        # np.concatenate([]) with an obscure error.
        if not text or not text.strip():
            raise ValueError("Text input is empty")
        if not embedding_path:
            raise ValueError("No embedding file provided")

        # Load latents (torch.load is patched at module level to permit
        # full pickle loading of this trusted, self-produced file).
        embeddings = torch.load(embedding_path)
        gpt_cond_latent = embeddings["gpt_cond_latent"].to(device)
        speaker_embedding = embeddings["speaker_embedding"].to(device)

        # Synthesize each chunk separately to respect XTTS input limits.
        audio_chunks = []
        for chunk in split_text(text):
            out = tts.synthesizer.tts_model.inference(
                chunk,
                language,
                gpt_cond_latent,
                speaker_embedding,
                temperature=0.7,
                length_penalty=1.0,
                repetition_penalty=2.0,
            )
            # inference() may return a torch tensor or a numpy array.
            wav = out["wav"].squeeze()
            if isinstance(wav, torch.Tensor):
                wav = wav.cpu().numpy()
            audio_chunks.append(wav)

        # Concatenate and write to a unique temp file: the previous fixed
        # "output.wav" was clobbered by concurrent requests.
        full_audio = np.concatenate(audio_chunks)
        fd, output_path = tempfile.mkstemp(suffix=".wav", prefix="xtts_out_")
        os.close(fd)  # sf.write reopens the path itself
        sf.write(output_path, full_audio, 24000)  # XTTS v2 outputs 24 kHz
        return output_path
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}")

# Gradio Interface
# Two-tab UI: tab 1 turns a reference recording into a downloadable
# embedding file; tab 2 takes that file plus text and returns audio.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🐸 XTTS v2 Voice Cloning Demo")
    
    with gr.Tab("🔊 Voice Embedding Creation"):
        gr.Markdown("Upload a short Russian audio sample (3-10 seconds)")
        with gr.Row():
            # type="filepath" hands the handler a path on disk, matching
            # extract_speaker_embedding's expected input.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Input Audio",
                waveform_options={"sample_rate": 24000}
            )
            embedding_output = gr.File(label="Saved Embedding")
        extract_btn = gr.Button("Create Voice Embedding", variant="primary")
    
    with gr.Tab("📢 Speech Generation"):
        gr.Markdown("Upload embedding and enter Russian text")
        with gr.Row():
            text_input = gr.Textbox(
                label="Text Input",
                placeholder="Enter text to synthesize...",
                lines=4,
                max_lines=10
            )
            embedding_input = gr.File(label="Upload Embedding File")
        with gr.Row():
            # 24000 matches the sample rate the synthesis path writes.
            audio_output = gr.Audio(
                label="Generated Speech",
                autoplay=True,
                waveform_options={"sample_rate": 24000}
            )
        synth_btn = gr.Button("Generate Speech", variant="primary")

    # Event handlers
    # Wire each button to its handler; Gradio passes component values as
    # positional arguments in the listed order.
    extract_btn.click(
        extract_speaker_embedding,
        inputs=audio_input,
        outputs=embedding_output
    )
    
    synth_btn.click(
        synthesize_speech,
        inputs=[text_input, embedding_input],
        outputs=audio_output
    )

if __name__ == "__main__":
    # Serve on all interfaces (LAN-reachable) at the standard Gradio port;
    # no public share link; surface handler exceptions in the browser.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )