|
|
import os
import re
import tempfile
import time

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from pydub import AudioSegment
from TTS.api import TTS
|
|
|
|
|
|
|
|
# Accept Coqui's Terms of Service non-interactively, so the XTTS model
# download below does not block on a stdin license prompt.
os.environ["COQUI_TOS_AGREED"] = "1"
|
|
|
|
|
|
|
|
# PyTorch 2.6 switched torch.load's default to weights_only=True, which
# breaks loading XTTS checkpoints (they contain pickled config objects).
# Patch torch.load so the TTS library keeps working.
# NOTE(review): weights_only=False unpickles arbitrary objects — only load
# checkpoint files you trust.
original_torch_load = torch.load


def patched_torch_load(*args, **kwargs):
    """torch.load with ``weights_only`` defaulting to False.

    Uses ``setdefault`` instead of a hard override, so a caller that
    explicitly passes ``weights_only=True`` keeps the safer behavior
    (the original patch silently forced it back to False).
    """
    kwargs.setdefault("weights_only", False)
    return original_torch_load(*args, **kwargs)


torch.load = patched_torch_load
|
|
|
|
|
|
|
|
# Prefer GPU when available; XTTS v2 runs on CPU too, just much slower.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the multilingual XTTS v2 model once at startup (first run downloads
# the checkpoint; COQUI_TOS_AGREED suppresses the interactive license prompt).
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
|
|
|
|
|
def extract_speaker_embedding(audio_path):
    """Compute XTTS conditioning latents for a reference voice and save them.

    Args:
        audio_path: Filesystem path to the reference audio clip
            (Gradio passes the uploaded/recorded file path, or None).

    Returns:
        Path to a ``.pth`` file containing ``"gpt_cond_latent"`` and
        ``"speaker_embedding"`` tensors (stored on CPU).

    Raises:
        gr.Error: If no audio was provided or extraction fails.
    """
    if not audio_path:
        raise gr.Error("Please upload or record an audio sample first.")
    try:
        gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
            audio_path=[audio_path]
        )

        # Unique file per request: the previous fixed "speaker_embedding.pth"
        # name was silently overwritten when several users used the demo.
        fd, embedding_path = tempfile.mkstemp(
            suffix=".pth", prefix="speaker_embedding_"
        )
        os.close(fd)

        # Move tensors to CPU so the file loads on machines without a GPU.
        torch.save({
            "gpt_cond_latent": gpt_cond_latent.cpu(),
            "speaker_embedding": speaker_embedding.cpu()
        }, embedding_path)
        return embedding_path
    except gr.Error:
        raise  # keep our own user-facing message intact
    except Exception as e:
        raise gr.Error(f"Error extracting embedding: {str(e)}")
|
|
|
|
|
def split_text(text, max_length=182):
    """Split ``text`` into chunks of at most ~``max_length`` characters.

    Chunks break only on whitespace boundaries; each chunk is stripped and
    gets a terminal '.' appended if it does not already end in sentence-final
    punctuation (XTTS is more stable on sentence-like input).

    Bug fixed vs. the original: empty/whitespace-only chunks are dropped,
    so empty input now returns ``[]`` instead of the bogus chunk ``"."``
    (previously produced whenever a flush happened with nothing buffered,
    e.g. empty text or an over-long first token).

    Args:
        text: Input text to split.
        max_length: Soft maximum chunk length in characters.

    Returns:
        List of non-empty chunks, each ending in '.', '!' or '?'.
    """
    chunks = []
    current = []
    current_len = 0

    # Keep the whitespace separators (capturing group) so chunk length
    # accounting includes them.
    for token in re.split(r'(\s+)', text):
        if current_len + len(token) > max_length:
            flushed = "".join(current).strip()
            if flushed:  # skip empty flushes (fix)
                chunks.append(flushed)
            current = []
            current_len = 0
        current.append(token)
        current_len += len(token)

    tail = "".join(current).strip()
    if tail:
        chunks.append(tail)

    # Ensure every chunk ends with sentence-final punctuation.
    return [c if c.endswith(('.', '!', '?')) else c + '.' for c in chunks]
|
|
|
|
|
def synthesize_speech(text, embedding_path, language="ru"):
    """Synthesize speech from text using a previously saved voice embedding.

    Args:
        text: Text to synthesize; split into chunks before inference.
        embedding_path: Path to a ``.pth`` file produced by
            ``extract_speaker_embedding``.
        language: XTTS language code (defaults to "ru", matching the
            original hard-coded behavior).

    Returns:
        Path to a 24 kHz WAV file with the concatenated synthesized audio.

    Raises:
        gr.Error: If inputs are missing or synthesis fails.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not embedding_path:
        raise gr.Error("Please upload a speaker embedding file first.")
    try:
        # map_location guards against files saved on a device this host
        # lacks; tensors are moved to the active device afterwards.
        embeddings = torch.load(embedding_path, map_location="cpu")
        gpt_cond_latent = embeddings["gpt_cond_latent"].to(device)
        speaker_embedding = embeddings["speaker_embedding"].to(device)

        audio_chunks = []
        for chunk in split_text(text):
            out = tts.synthesizer.tts_model.inference(
                chunk,
                language,
                gpt_cond_latent,
                speaker_embedding,
                temperature=0.7,
                length_penalty=1.0,
                repetition_penalty=2.0,
            )

            # inference may hand back a tensor or a numpy array.
            wav = out["wav"].squeeze()
            if isinstance(wav, torch.Tensor):
                wav = wav.cpu().numpy()
            audio_chunks.append(wav)

        full_audio = np.concatenate(audio_chunks)

        # Unique file per request: the previous fixed "output.wav" name was
        # clobbered when several users generated speech concurrently.
        fd, output_path = tempfile.mkstemp(suffix=".wav", prefix="xtts_output_")
        os.close(fd)
        sf.write(output_path, full_audio, 24000)  # XTTS v2 outputs 24 kHz
        return output_path
    except gr.Error:
        raise  # keep our own user-facing messages intact
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}")
|
|
|
|
|
|
|
|
# Build the two-tab Gradio UI. Component creation order defines the layout,
# so the structure below is deliberately left untouched.
# NOTE(review): the Markdown/Tab label strings contain mojibake (mis-encoded
# emoji, e.g. "πΈ"); they are runtime strings and are preserved byte-for-byte.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πΈ XTTS v2 Voice Cloning Demo")

    # Tab 1: upload/record a reference clip, download the saved embedding.
    with gr.Tab("π Voice Embedding Creation"):
        gr.Markdown("Upload a short Russian audio sample (3-10 seconds)")
        with gr.Row():
            # type="filepath" makes Gradio pass a path string to the handler.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Input Audio",
                waveform_options={"sample_rate": 24000}
            )
            embedding_output = gr.File(label="Saved Embedding")
        extract_btn = gr.Button("Create Voice Embedding", variant="primary")

    # Tab 2: upload the embedding file, enter text, play generated speech.
    with gr.Tab("π’ Speech Generation"):
        gr.Markdown("Upload embedding and enter Russian text")
        with gr.Row():
            text_input = gr.Textbox(
                label="Text Input",
                placeholder="Enter text to synthesize...",
                lines=4,
                max_lines=10
            )
            embedding_input = gr.File(label="Upload Embedding File")
        with gr.Row():
            audio_output = gr.Audio(
                label="Generated Speech",
                autoplay=True,
                waveform_options={"sample_rate": 24000}
            )
        synth_btn = gr.Button("Generate Speech", variant="primary")

    # Wire buttons to the handlers defined above.
    extract_btn.click(
        extract_speaker_embedding,
        inputs=audio_input,
        outputs=embedding_output
    )

    synth_btn.click(
        synthesize_speech,
        inputs=[text_input, embedding_input],
        outputs=audio_output
    )
|
|
|
|
|
if __name__ == "__main__":
    # Launch configuration: bind every interface (container/LAN friendly),
    # fixed port 7860, no public Gradio share link, surface handler errors
    # in the browser UI.
    launch_config = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
    demo.launch(**launch_config)