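"""Gradio demo: voice cloning with Coqui XTTS v2.

Two-step flow: extract speaker conditioning latents from a short reference
clip, then synthesize Russian speech from text using the saved latents.
"""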
import gradio as gr
import torch
import numpy as np
from TTS.api import TTS
from pydub import AudioSegment
import os
import re
import soundfile as sf
import time
# Accept the Coqui model terms of service non-interactively
os.environ["COQUI_TOS_AGREED"] = "1"

# Patch torch.load so the saved speaker embeddings load on newer PyTorch,
# where weights_only defaults to True and rejects pickled objects
original_torch_load = torch.load

def patched_torch_load(*args, **kwargs):
    kwargs['weights_only'] = False
    return original_torch_load(*args, **kwargs)

torch.load = patched_torch_load

# Initialize the XTTS v2 model
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
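# Note: the first run downloads the XTTS v2 checkpoint into the local TTS
# model cache; subsequent runs reuse it.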

def extract_speaker_embedding(audio_path):
    try:
        # Get conditioning latents using the model's built-in method
        gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
            audio_path=[audio_path]
        )
        # Save both latents for later synthesis
        embedding_path = "speaker_embedding.pth"
        torch.save({
            "gpt_cond_latent": gpt_cond_latent.cpu(),
            "speaker_embedding": speaker_embedding.cpu()
        }, embedding_path)
        return embedding_path
    except Exception as e:
        raise gr.Error(f"Error extracting embedding: {str(e)}")

def split_text(text, max_length=182):
    """Split text into chunks of at most max_length characters on whitespace."""
    sentences = []
    current = []
    current_len = 0
    # Split on whitespace but keep the separators so spacing is preserved
    words = re.split(r'(\s+)', text)
    for word in words:
        if current_len + len(word) > max_length:
            sentences.append("".join(current).strip())
            current = []
            current_len = 0
        current.append(word)
        current_len += len(word)
    if current:
        sentences.append("".join(current).strip())
    # Drop empty chunks and end each one with sentence-final punctuation,
    # which tends to give XTTS more stable prosody
    processed = []
    for s in sentences:
        if not s:
            continue
        if not s.endswith(('.', '!', '?')):
            s += '.'
        processed.append(s)
    return processed
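# For example, split_text("Привет, мир") returns ["Привет, мир."]: the whole
# string fits in one chunk and a final period is appended.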

def synthesize_speech(text, embedding_path):
    try:
        # Load the saved conditioning latents onto the model device
        embeddings = torch.load(embedding_path)
        gpt_cond_latent = embeddings["gpt_cond_latent"].to(device)
        speaker_embedding = embeddings["speaker_embedding"].to(device)
        # Split text into chunks the model can handle
        text_chunks = split_text(text)
        # Synthesize each chunk (language is fixed to Russian, matching the UI)
        audio_chunks = []
        for chunk in text_chunks:
            out = tts.synthesizer.tts_model.inference(
                chunk,
                "ru",
                gpt_cond_latent,
                speaker_embedding,
                temperature=0.7,
                length_penalty=1.0,
                repetition_penalty=2.0,
            )
            # Handle both tensor and numpy array outputs
            wav = out["wav"].squeeze()
            if isinstance(wav, torch.Tensor):
                audio_chunks.append(wav.cpu().numpy())
            else:
                audio_chunks.append(wav)
        # Concatenate the chunks and write a single 24 kHz WAV file
        full_audio = np.concatenate(audio_chunks)
        output_path = "output.wav"
        sf.write(output_path, full_audio, 24000)
        return output_path
    except Exception as e:
        raise gr.Error(f"Error generating speech: {str(e)}")
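# Programmatic use (illustrative; "sample.wav" is a placeholder path):
#   emb_path = extract_speaker_embedding("sample.wav")
#   wav_path = synthesize_speech("Привет, мир!", emb_path)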

# Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🐸 XTTS v2 Voice Cloning Demo")

    with gr.Tab("🔊 Voice Embedding Creation"):
        gr.Markdown("Upload a short Russian audio sample (3-10 seconds)")
        with gr.Row():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Input Audio",
                waveform_options={"sample_rate": 24000}
            )
            embedding_output = gr.File(label="Saved Embedding")
        extract_btn = gr.Button("Create Voice Embedding", variant="primary")

    with gr.Tab("📒 Speech Generation"):
        gr.Markdown("Upload embedding and enter Russian text")
        with gr.Row():
            text_input = gr.Textbox(
                label="Text Input",
                placeholder="Enter text to synthesize...",
                lines=4,
                max_lines=10
            )
            embedding_input = gr.File(label="Upload Embedding File")
        with gr.Row():
            audio_output = gr.Audio(
                label="Generated Speech",
                autoplay=True,
                waveform_options={"sample_rate": 24000}
            )
        synth_btn = gr.Button("Generate Speech", variant="primary")

    # Event handlers
    extract_btn.click(
        extract_speaker_embedding,
        inputs=audio_input,
        outputs=embedding_output
    )
    synth_btn.click(
        synthesize_speech,
        inputs=[text_input, embedding_input],
        outputs=audio_output
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )