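"""Gradio demo: voice cloning with Coqui XTTS v2.

Two-step flow: extract speaker conditioning latents from a short reference
clip, then synthesize Russian speech from text using the saved latents.
"""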
import gradio as gr
import torch
import numpy as np
from TTS.api import TTS
from pydub import AudioSegment
import os
import re
import soundfile as sf
import time
# Accept the Coqui model terms of service non-interactively
os.environ["COQUI_TOS_AGREED"] = "1"

# Patch torch.load so the saved speaker embeddings load on newer PyTorch,
# where weights_only defaults to True and rejects pickled objects
original_torch_load = torch.load

def patched_torch_load(*args, **kwargs):
    kwargs['weights_only'] = False
    return original_torch_load(*args, **kwargs)

torch.load = patched_torch_load

# Initialize the XTTS v2 model
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
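# Note: the first run downloads the XTTS v2 checkpoint into the local TTS
# model cache; subsequent runs reuse it.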

def extract_speaker_embedding(audio_path):
    try:
        # Get conditioning latents using the model's built-in method
        gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
            audio_path=[audio_path]
        )
        # Save both latents for later synthesis
        embedding_path = "speaker_embedding.pth"
        torch.save({
            "gpt_cond_latent": gpt_cond_latent.cpu(),
            "speaker_embedding": speaker_embedding.cpu()
        }, embedding_path)
        return embedding_path
    except Exception as e:
        raise gr.Error(f"Error extracting embedding: {str(e)}")

def split_text(text, max_length=182):
    """Split text into chunks of at most max_length characters on whitespace."""
    sentences = []
    current = []
    current_len = 0
    # Split on whitespace but keep the separators so spacing is preserved
    words = re.split(r'(\s+)', text)
    for word in words:
        if current_len + len(word) > max_length:
            sentences.append("".join(current).strip())
            current = []
            current_len = 0
        current.append(word)
        current_len += len(word)
    if current:
        sentences.append("".join(current).strip())
    # Drop empty chunks and end each one with sentence-final punctuation,
    # which tends to give XTTS more stable prosody
    processed = []
    for s in sentences:
        if not s:
            continue
        if not s.endswith(('.', '!', '?')):
            s += '.'
        processed.append(s)
    return processed
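# For example, split_text("Привет, мир") returns ["Привет, мир."]: the whole
# string fits in one chunk and a final period is appended.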

def synthesize_speech(text, embedding_path):
    try:
        # Load the saved conditioning latents onto the model device
        embeddings = torch.load(embedding_path)
        gpt_cond_latent = embeddings["gpt_cond_latent"].to(device)
        speaker_embedding = embeddings["speaker_embedding"].to(device)
        # Split text into chunks the model can handle
        text_chunks = split_text(text)
        # Synthesize each chunk (language is fixed to Russian, matching the UI)
        audio_chunks = []
        for chunk in text_chunks:
            out = tts.synthesizer.tts_model.inference(
                chunk,
                "ru",
                gpt_cond_latent,
                speaker_embedding,
                temperature=0.7,
                length_penalty=1.0,
                repetition_penalty=2.0,
            )
            # Handle both tensor and numpy array outputs
            wav = out["wav"].squeeze()
            if isinstance(wav, torch.Tensor):
                audio_chunks.append(wav.cpu().numpy())
            else:
                audio_chunks.append(wav)
        # Concatenate the chunks and write a single 24 kHz WAV file
        full_audio = np.concatenate(audio_chunks)
        output_path = "output.wav"
        sf.write(output_path, full_audio, 24000)
        return output_path
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}")
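# Programmatic use (illustrative; "sample.wav" is a placeholder path):
#   emb_path = extract_speaker_embedding("sample.wav")
#   wav_path = synthesize_speech("Привет, мир!", emb_path)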

# Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🐸 XTTS v2 Voice Cloning Demo")

    with gr.Tab("🔊 Voice Embedding Creation"):
        gr.Markdown("Upload a short Russian audio sample (3-10 seconds)")
        with gr.Row():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Input Audio",
                waveform_options={"sample_rate": 24000}
            )
            embedding_output = gr.File(label="Saved Embedding")
        extract_btn = gr.Button("Create Voice Embedding", variant="primary")

    with gr.Tab("📒 Speech Generation"):
        gr.Markdown("Upload embedding and enter Russian text")
        with gr.Row():
            text_input = gr.Textbox(
                label="Text Input",
                placeholder="Enter text to synthesize...",
                lines=4,
                max_lines=10
            )
            embedding_input = gr.File(label="Upload Embedding File")
        with gr.Row():
            audio_output = gr.Audio(
                label="Generated Speech",
                autoplay=True,
                waveform_options={"sample_rate": 24000}
            )
        synth_btn = gr.Button("Generate Speech", variant="primary")

    # Event handlers
    extract_btn.click(
        extract_speaker_embedding,
        inputs=audio_input,
        outputs=embedding_output
    )
    synth_btn.click(
        synthesize_speech,
        inputs=[text_input, embedding_input],
        outputs=audio_output
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )