Spaces:

NeuralFalcon
/

Pocket-TTS

Running

App Files Files Community

Pocket-TTS / app.py

NeuralFalcon

Update app.py

22ff792 verified 1 day ago

raw

history blame contribute delete

6.75 kB


	import gradio as gr
	import os
	import re
	import uuid
	import scipy.io.wavfile
	import torch
	from pocket_tts import TTSModel
	#for voice clone
	from huggingface_hub import login
	# Log in to the Hugging Face Hub only when a token is present in the environment.
	hf_token = os.environ.get("HF_TOKEN")
	if hf_token:
	    login(token=hf_token)

	print("Loading TTS Model...")
	# Bind the name before loading so that, if loading fails, later code sees
	# None instead of raising NameError on an undefined global.
	tts_model = None
	try:
	    tts_model = TTSModel.load_model()
	except Exception as e:
	    # Best effort: report the failure and keep the module importable.
	    print(f"Error loading model: {e}")
	else:
	    print("Model loaded successfully.")

	def get_tts_file_name(text, language="en"):
	    """Build a unique ``.wav`` path under ``./ai_tts_voice/`` for a synthesis result.

	    The output directory is created if it does not exist. The file name is
	    ``<slug>_<language>_<uid>.wav`` where ``slug`` is derived from ``text``
	    and ``uid`` is eight uppercase hex characters from a fresh UUID4.

	    Args:
	        text: Source text; only ASCII letters contribute to the slug. ``None``
	            or text without letters yields the slug ``"audio"``.
	        language: Language tag embedded in the name; ``None`` or an empty
	            value falls back to ``"en"``.

	    Returns:
	        The path of the (not yet written) output file.
	    """
	    temp_audio_dir = "./ai_tts_voice/"
	    os.makedirs(temp_audio_dir, exist_ok=True)

	    # Drop everything except ASCII letters and whitespace.
	    clean = re.sub(r'[^a-zA-Z\s]', '', text or "")
	    # Collapse every whitespace run (spaces, tabs, newlines) into a single
	    # underscore so multi-line input never puts control characters in the name.
	    clean = re.sub(r'\s+', '_', clean.lower().strip())
	    # Truncation can cut right after a separator; trim it to avoid "__".
	    clean = clean[:20].rstrip("_") or "audio"

	    uid = uuid.uuid4().hex[:8].upper()
	    language = (language or "en").lower().strip()

	    return os.path.join(
	        temp_audio_dir,
	        f"{clean}_{language}_{uid}.wav"
	    )

	# Preset voice names offered in the voice dropdown.
	DEFAULT_VOICES = [
	    "alba",
	    "marius",
	    "javert",
	    "jean",
	    "fantine",
	    "cosette",
	    "eponine",
	    "azelma",
	]

	def generate_speech(text, mode, preset_voice, clone_audio_path):
	    """Synthesize ``text`` to a WAV file and return the file's path.

	    Args:
	        text: Text to speak; must contain at least one non-whitespace character.
	        mode: ``"Default Voices"`` selects ``preset_voice``; any other value
	            uses ``clone_audio_path`` as the reference audio.
	        preset_voice: Preset voice name handed to the model as the audio prompt.
	        clone_audio_path: Reference audio handed to the model when cloning.

	    Returns:
	        Path of the WAV file written by this call.

	    Raises:
	        gr.Error: If the model is unavailable, an input is missing, the voice
	            prompt cannot be loaded, or generation/writing fails.
	    """
	    # Look the model up defensively: if loading failed at startup the global
	    # may be unbound or None, and the user should get a readable message.
	    model = globals().get("tts_model")
	    if model is None:
	        raise gr.Error("TTS model is not loaded. Check the server logs and restart the app.")

	    # Whitespace-only input is treated the same as empty input.
	    if not text or not text.strip():
	        raise gr.Error("Please enter text to generate speech.")

	    if mode == "Default Voices":
	        # The dropdown can be cleared (e.g. by the cloning example row).
	        if not preset_voice:
	            raise gr.Error("Please select a voice.")

	        print(f"Using preset voice: {preset_voice}")
	        try:
	            state = model.get_state_for_audio_prompt(preset_voice)
	        except Exception as e:
	            error_msg = f"Error loading preset voice '{preset_voice}': {str(e)}"
	            print(error_msg)
	            raise gr.Error(error_msg) from e
	    else:
	        if not clone_audio_path:
	            raise gr.Error("Please upload a reference audio file for cloning.")

	        print(f"Cloning voice from: {clone_audio_path}")
	        try:
	            state = model.get_state_for_audio_prompt(clone_audio_path)
	        except Exception as e:
	            error_msg = f"Error loading reference audio: {str(e)}. Please upload a valid WAV file."
	            print(error_msg)
	            raise gr.Error(error_msg) from e

	    try:
	        audio_tensor = model.generate_audio(state, text)

	        output_filename = get_tts_file_name(text)
	        scipy.io.wavfile.write(output_filename, model.sample_rate, audio_tensor.numpy())
	    except Exception as e:
	        # Keep the original exception as the cause for server-side debugging.
	        raise gr.Error(f"Generation failed: {str(e)}") from e

	    return output_filename

	def toggle_inputs(mode):
	    """Return two visibility updates driven by the selected TTS mode.

	    The first update is visible only when ``mode`` is ``"Default Voices"``;
	    the second is visible for every other mode.
	    """
	    use_presets = mode == "Default Voices"
	    return gr.update(visible=use_presets), gr.update(visible=not use_presets)


	# Stylesheet passed to gr.Blocks via its css argument: sets the app font and
	# styles the header container, the logo image, and the header links.
	CUSTOM_CSS = """
	.gradio-container {
	font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, sans-serif;
	}
	.header-container {
	text-align: center;
	margin-bottom: 20px;
	}
	.logo-img {
	margin: 0 auto;
	display: block;
	max-width: 100%;
	transition: transform 0.2s;
	}
	.logo-img:hover {
	transform: scale(1.02);
	opacity: 0.9;
	}
	.links-container a {
	text-decoration: none;
	color: #4a90e2;
	font-weight: 500;
	}
	.links-container a:hover {
	text-decoration: underline;
	}
	"""

	# Header markup rendered at the top of the app: project logo, links to the
	# GitHub repository, model card, and Colab notebook, plus an unofficial-demo note.
	# The link separators are plain "|" characters; a backslash before them would be
	# an invalid string escape and would show up literally in the rendered page.
	HEADER_HTML = """
	<div class="header-container" style="text-align:center;">

	<a href="https://kyutai.org/tts" target="_blank" title="Visit Kyutai TTS">
	<img src="https://raw.githubusercontent.com/kyutai-labs/pocket-tts/refs/heads/main/docs/logo.png"
	class="logo-img" width="200">
	</a>

	<div class="links-container"
	style="
	margin-top: 18px;
	display: flex;
	justify-content: center;
	align-items: center;
	gap: 14px;
	flex-wrap: wrap;
	">

	<a href="https://github.com/kyutai-labs/pocket-tts"
	target="_blank"
	style="text-decoration:none;">
	🐱 GitHub Repository
	</a>

	<span style="color: gray;">|</span>

	<a href="https://huggingface.co/kyutai/pocket-tts"
	target="_blank"
	style="text-decoration:none;">
	🤗 Hugging Face Model Card
	</a>

	<span style="color: gray;">|</span>

	<a href="https://colab.research.google.com/github/NeuralFalconYT/Voice-Clone/blob/main/Pocket_TTS_Colab.ipynb"
	target="_blank"
	style="
	display: inline-flex;
	align-items: center;
	">
	<img src="https://colab.research.google.com/assets/colab-badge.svg"
	alt="Open in Colab"
	height="26">
	</a>

	</div>

	<p style="font-size: 0.8em; color: gray; margin-top: 10px;">
	<i>Note: This is not an official demo from Kyutai Labs</i>
	</p>

	</div>
	"""



	# Assemble the Gradio UI: header HTML, text/mode/voice inputs, a generate
	# button, an output audio player, clickable example rows, and event wiring.
	# NOTE(review): the nesting of the with-blocks below cannot be recovered from
	# this copy (indentation was lost) — confirm the layout against the running app.
	with gr.Blocks(theme='JohnSmith9982/small_and_pretty', css=CUSTOM_CSS) as demo:
	gr.HTML(HEADER_HTML)

	with gr.Row():
	with gr.Column():
	text_input = gr.Textbox(
	label="Text Input",
	placeholder="Hi, how are you?",
	lines=3,
	value="Hi, how are you?"
	)

	mode_radio = gr.Radio(
	choices=["Default Voices", "Voice Clone"],
	value="Default Voices",
	label="TTS Mode"
	)

	with gr.Group():
	# Preset-voice picker; starts visible because the initial mode is "Default Voices".
	dropdown_input = gr.Dropdown(
	choices=DEFAULT_VOICES,
	value="alba",
	label="Select Voice",
	visible=True
	)

	# Reference-audio input for cloning; starts hidden and is configured with type="filepath".
	audio_upload = gr.Audio(
	label="Upload Reference Audio (WAV recommended)",
	type="filepath",
	visible=False
	)

	generate_btn = gr.Button("Generate Audio", variant="primary")

	# Remote WAV used as the reference audio in the "Voice Clone" example row.
	example_audio_url = "https://huggingface.co/kyutai/tts-voices/resolve/main/alba-mackenna/casual.wav"


	with gr.Column():
	output_audio = gr.Audio(label="Generated Speech", type="filepath")

	# Each example row supplies, in order: text, mode, preset voice, reference audio.
	gr.Examples(
	examples=[
	["Hello, I am Fantine. Nice to meet you.", "Default Voices", "fantine", None],
	["I am Cosette, and the weather is lovely.", "Default Voices", "cosette", None],
	["Hey there, Eponine here.", "Default Voices", "eponine", None],
	["Greetings from Azelma.", "Default Voices", "azelma", None],
	["This is a voice cloning test using the uploaded reference audio.", "Voice Clone", None, example_audio_url],
	],
	inputs=[text_input, mode_radio, dropdown_input, audio_upload],
	label="Click on an Example to Try"
	)
	# When the mode changes, toggle_inputs updates the dropdown and the upload input.
	mode_radio.change(
	fn=toggle_inputs,
	inputs=[mode_radio],
	outputs=[dropdown_input, audio_upload]
	)

	# On click, run generate_speech and send its return value to the output player.
	generate_btn.click(
	fn=generate_speech,
	inputs=[text_input, mode_radio, dropdown_input, audio_upload],
	outputs=[output_audio]
	)

	if __name__ == "__main__":
	    # Script entry point: enable the queue, then launch with sharing and debug off.
	    demo.queue().launch(
	        share=False,
	        debug=False,
	    )