Spaces:

herimor
/

voxtream2

Running on Zero

App Files Files Community

voxtream2 / app.py

herimor

Update package and model

a488840 1 day ago

raw

history blame contribute delete

12.5 kB

	import argparse
	import json
	import os
	import uuid
	from pathlib import Path

	# Disable PyTorch dynamo/inductor globally
	os.environ["TORCHDYNAMO_DISABLE"] = "1"
	os.environ["TORCHINDUCTOR_DISABLE"] = "1"
	import torch._dynamo as dynamo

	dynamo.config.suppress_errors = True

	import gradio as gr
	import numpy as np
	import soundfile as sf
	import spaces
	import torch

	from voxtream.config import SpeechGeneratorConfig
	from voxtream.generator import SpeechGenerator
	from voxtream.utils.generator import (
	DTYPE_MAP,
	existing_file,
	interpolate_speaking_rate_params,
	text_generator,
	)

	# Minimum amount of audio, in seconds, that is buffered before a chunk is
	# streamed to the player; main() converts it to samples as CHUNK_SIZE.
	MIN_CHUNK_SEC = 0.01
	# Length, in seconds, of the linear fade-out applied to the tail of the audio.
	FADE_OUT_SEC = 0.10
	# Stylesheet handed to gr.Blocks(css=...) in demo_app(); the selectors refer
	# to the elem_id values assigned there ("cols", "left-col", "right-col",
	# "submit", "clear").
	CUSTOM_CSS = """
	/* overall width */
	.gradio-container {max-width: 1100px !important}
	/* stack labels tighter and even heights */
	#cols .wrap > .form {gap: 10px}
	#left-col, #right-col {gap: 14px}
	/* make submit centered + bigger */
	#submit {width: 260px; margin: 10px auto 0 auto;}
	/* make clear align left and look secondary */
	#clear {width: 120px;}
	/* give audio a little breathing room */
	audio {outline: none;}
	"""


	def float32_to_int16(audio_float32: np.ndarray) -> np.ndarray:
	    """
	    Turn normalized float32 audio into 16-bit signed PCM.

	    Parameters:
	        audio_float32 (np.ndarray): Samples stored as float32, nominally
	            within [-1.0, 1.0]; anything outside that range is saturated.

	    Returns:
	        np.ndarray: The samples as int16, scaled by 32767.

	    Raises:
	        ValueError: If the array's dtype is anything other than float32.
	    """
	    if audio_float32.dtype == np.float32:
	        # Saturate before scaling so the int16 cast cannot wrap around.
	        pcm = np.clip(audio_float32, -1.0, 1.0) * 32767
	        return pcm.astype(np.int16)

	    raise ValueError("Input must be a float32 numpy array")


	def _clear_outputs():
	    """
	    Build the pair of updates that reset the synthesis outputs.

	    Returns:
	        tuple: An update that empties the first output's value, and an
	        update that empties and hides the second output. The example-click
	        handler wires these to the audio player and the download button.
	    """
	    player_reset = gr.update(value=None)
	    download_reset = gr.update(value=None, visible=False)
	    return player_reset, download_reset


	def demo_app(config: SpeechGeneratorConfig, demo_examples, synthesize_fn):
	"""
	Build the VoXtream2 Gradio Blocks interface and launch it.

	Parameters:
	config (SpeechGeneratorConfig): Supplies `max_prompt_sec` and
	`max_phone_tokens`, which appear in the input labels;
	`max_phone_tokens` is also used as the text box `max_length`.
	demo_examples: Rows handed to `gr.Examples`. Each row is presumably
	ordered like the seven input components listed there - confirm
	against the examples config.
	synthesize_fn: Callback run after Submit is clicked (and registered with
	`gr.Examples`). It receives the seven input values and its outputs
	are routed to the audio player and the download button.

	Returns:
	None. The function ends by calling `demo.launch()`.
	"""
	with gr.Blocks(css=CUSTOM_CSS, title="VoXtream2") as demo:
	gr.Markdown("# VoXtream2 TTS demo")
	gr.Markdown(
	"⚠️ The initial latency can be high due to deployment on ZeroGPU. For faster inference, please try local deployment. For more details, please visit [VoXtream GitHub repo](https://github.com/herimor/voxtream)"
	)

	with gr.Row(equal_height=True, elem_id="cols"):
	with gr.Column(scale=1, elem_id="left-col"):
	# Prompt audio is delivered to the callback as a file path.
	prompt_audio = gr.Audio(
	sources=["microphone", "upload"],
	type="filepath",
	label=f"Prompt audio (3-10 sec of target voice. Max {config.max_prompt_sec} sec)",
	)
	with gr.Accordion("Advanced options", open=False):
	prompt_enhancement = gr.Checkbox(
	label="Prompt enhancement", value=True
	)
	voice_activity_detection = gr.Checkbox(
	label="Voice activity detection", value=True
	)
	streaming_input = gr.Checkbox(label="Streaming input", value=False)

	with gr.Column(scale=1, elem_id="right-col"):
	target_text = gr.Textbox(
	lines=3,
	max_length=config.max_phone_tokens,
	label=f"Target text (Required, max {config.max_phone_tokens} chars)",
	placeholder="What you want the model to say",
	)
	speaking_rate_control = gr.Slider(
	minimum=1,
	maximum=7,
	step=0.1,
	value=4,
	label="Speaking rate (syllables per second)",
	)
	enable_speaking_rate = gr.Checkbox(
	label="Use speaking rate control", value=True
	)
	# The slider is only editable while speaking-rate control is enabled.
	enable_speaking_rate.change(
	fn=lambda enabled: gr.update(interactive=enabled),
	inputs=enable_speaking_rate,
	outputs=speaking_rate_control,
	)
	# Starts hidden; the first step of the Submit handler below makes it visible.
	output_audio = gr.Audio(
	label="Synthesized audio",
	interactive=False,
	streaming=True,
	autoplay=True,
	show_download_button=False,
	show_share_button=False,
	visible=False,
	)

	# appears only when file is ready
	download_btn = gr.DownloadButton(
	"Download audio",
	visible=False,
	)

	with gr.Row():
	clear_btn = gr.Button("Clear", elem_id="clear", variant="secondary")
	# Submit starts disabled; validate_inputs enables it once both inputs are present.
	submit_btn = gr.Button(
	"Submit", elem_id="submit", variant="primary", interactive=False
	)

	# Message box for validation errors
	validation_msg = gr.Markdown("", visible=False)

	# --- Validation logic ---
	def validate_inputs(audio, ttext):
	"""
	Return updates for (validation_msg, submit_btn).

	Shows a warning and disables Submit when the prompt audio is missing
	or the target text is empty/whitespace; otherwise hides the message
	and enables Submit.
	"""
	if not audio:
	return gr.update(
	visible=True, value="⚠️ Please provide a prompt audio."
	), gr.update(interactive=False)
	# NOTE(review): `ttext.strip()` raises AttributeError if ttext is None;
	# assumes the Textbox always delivers a string - TODO confirm.
	if not ttext.strip():
	return gr.update(
	visible=True, value="⚠️ Please provide target text."
	), gr.update(interactive=False)
	return gr.update(visible=False, value=""), gr.update(interactive=True)

	# Live validation whenever inputs change
	for inp in [prompt_audio, target_text]:
	inp.change(
	fn=validate_inputs,
	inputs=[prompt_audio, target_text],
	outputs=[validation_msg, submit_btn],
	)

	# clear outputs before streaming
	# Step 1 ignores its two inputs and only resets/reveals the player and hides
	# the download button; step 2 (.then) runs the synthesis callback.
	submit_btn.click(
	fn=lambda a, t: (
	gr.update(value=None, visible=True),
	gr.update(value=None, visible=False),
	),
	inputs=[prompt_audio, target_text],
	outputs=[output_audio, download_btn],
	show_progress="hidden",
	).then(
	fn=synthesize_fn,
	inputs=[
	prompt_audio,
	target_text,
	prompt_enhancement,
	voice_activity_detection,
	streaming_input,
	speaking_rate_control,
	enable_speaking_rate,
	],
	outputs=[output_audio, download_btn],
	)

	# Clear resets both inputs and both outputs, hides the validation message and
	# disables Submit; the tuple order must match the `outputs` list below.
	clear_btn.click(
	fn=lambda: (
	gr.update(value=None), # prompt_audio
	gr.update(value=""), # target_text
	gr.update(value=None, visible=False), # output_audio
	gr.update(value=None, visible=False), # download_btn
	gr.update(visible=False, value=""), # validation_msg
	gr.update(interactive=False), # submit_btn
	),
	inputs=[],
	outputs=[
	prompt_audio,
	target_text,
	output_audio,
	download_btn,
	validation_msg,
	submit_btn,
	],
	)

	# --- Add Examples ---
	gr.Markdown("### Examples")
	ex = gr.Examples(
	examples=demo_examples,
	inputs=[
	prompt_audio,
	target_text,
	prompt_enhancement,
	voice_activity_detection,
	streaming_input,
	speaking_rate_control,
	enable_speaking_rate,
	],
	outputs=[output_audio, download_btn],
	fn=synthesize_fn,
	cache_examples=False,
	)

	# Clicking an example first resets the outputs, then re-runs validation
	# against the current input values to update the message and Submit button.
	ex.dataset.click(
	fn=_clear_outputs,
	inputs=[],
	outputs=[output_audio, download_btn],
	queue=False,
	).then(
	fn=validate_inputs,
	inputs=[prompt_audio, target_text],
	outputs=[validation_msg, submit_btn],
	queue=False,
	)

	demo.launch()


	def _fade_out_tail(audio, sample_rate):
	    """
	    Apply a linear fade-out to the end of `audio`, modifying it in place.

	    Parameters:
	        audio (np.ndarray): Samples to fade; the last
	            `min(int(sample_rate * FADE_OUT_SEC), len(audio))` samples are
	            multiplied by a ramp from 1.0 down to 0.0.
	        sample_rate: Sampling rate used to convert FADE_OUT_SEC to samples.

	    Returns:
	        np.ndarray: The same array object, for call chaining.
	    """
	    nfade = min(int(sample_rate * FADE_OUT_SEC), audio.shape[0])
	    if nfade > 0:
	        audio[-nfade:] *= np.linspace(1.0, 0.0, nfade, dtype=np.float32)
	    return audio


	def main():
	    """
	    Parse CLI options, load configuration and models, and start the demo.

	    Reads three JSON files (generator config, speaking-rate config and
	    examples), builds a `SpeechGenerator`, defines the streaming synthesis
	    callback and hands everything to `demo_app`.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "-c",
	        "--config",
	        type=existing_file,
	        help="Path to the config file",
	        default="configs/generator.json",
	    )
	    parser.add_argument(
	        "--spk-rate-config",
	        type=existing_file,
	        help="Path to the speaking rate config file",
	        default="configs/speaking_rate.json",
	    )
	    parser.add_argument(
	        "--examples-config",
	        type=existing_file,
	        help="Path to the examples config file",
	        default="assets/examples.json",
	    )
	    args = parser.parse_args()

	    # JSON is read as UTF-8 explicitly so parsing does not depend on the
	    # platform's default locale encoding.
	    with open(args.config, encoding="utf-8") as f:
	        config = SpeechGeneratorConfig(**json.load(f))
	    config.hf_token = os.environ.get("TOKEN")

	    # Loading speaker encoder. The returned model is discarded here;
	    # presumably this call only pre-populates the torch.hub cache - confirm.
	    torch.hub.load(
	        config.spk_enc_repo,
	        config.spk_enc_model,
	        model_name=config.spk_enc_model_name,
	        train_type=config.spk_enc_train_type,
	        dataset=config.spk_enc_dataset,
	        trust_repo=True,
	        verbose=False,
	    )

	    with open(args.spk_rate_config, encoding="utf-8") as f:
	        speaking_rate_config = json.load(f)

	    with open(args.examples_config, encoding="utf-8") as f:
	        examples_config = json.load(f)
	    demo_examples = examples_config.get("examples", [])

	    speech_generator = SpeechGenerator(config)
	    # Minimum number of buffered samples before a chunk is streamed out.
	    CHUNK_SIZE = int(config.mimi_sr * MIN_CHUNK_SEC)

	    @spaces.GPU
	    def synthesize_fn(
	        prompt_audio_path,
	        target_text,
	        prompt_enhancement,
	        voice_activity_detection,
	        streaming_input,
	        speaking_rate_control,
	        enable_speaking_rate,
	    ):
	        """
	        Stream synthesized speech to the Gradio UI.

	        This is a generator. Each yielded item is a pair for the two wired
	        outputs (audio player, download button):

	        * `((sample_rate, int16_samples), None)` for every audio chunk;
	        * finally `(None, update)`, where `update` reveals the download
	          button with the saved WAV path, or hides it when no audio was
	          produced or the inputs were missing.
	        """
	        # The models start on CPU; move them to CUDA the first time the
	        # callback runs and switch the generation context accordingly.
	        if next(speech_generator.model.parameters()).device.type == "cpu":
	            speech_generator.model.to("cuda")
	            speech_generator.mimi.to("cuda")
	            speech_generator.ctx.mimi_prompt.to("cuda")
	            speech_generator.ctx.spk_enc.to("cuda")
	            speech_generator.ctx.device = "cuda"
	            speech_generator.ctx.dtype = DTYPE_MAP["cuda"]

	        if not prompt_audio_path or not target_text:
	            # Inside a generator a `return <value>` is swallowed (it only
	            # becomes StopIteration.value), so the "nothing to play" state
	            # has to be yielded for the UI to receive it.
	            yield None, gr.update(value=None, visible=False)
	            return

	        if enable_speaking_rate:
	            duration_state, weight, cfg_gamma = interpolate_speaking_rate_params(
	                speaking_rate_config, speaking_rate_control
	            )
	        else:
	            duration_state, weight, cfg_gamma = None, None, None

	        stream = speech_generator.generate_stream(
	            prompt_audio_path=Path(prompt_audio_path),
	            text=text_generator(target_text) if streaming_input else target_text,
	            target_spk_rate_cnt=duration_state,
	            spk_rate_weight=weight,
	            cfg_gamma=cfg_gamma,
	            enhance_prompt=prompt_enhancement,
	            apply_vad=voice_activity_detection,
	        )

	        buffer = []  # frames waiting to be streamed as the next chunk
	        buffer_len = 0  # number of samples currently held in `buffer`
	        total_buffer = []  # every frame, kept for the downloadable file

	        for frame, _ in stream:
	            buffer.append(frame)
	            total_buffer.append(frame)
	            buffer_len += frame.shape[0]

	            if buffer_len >= CHUNK_SIZE:
	                audio = np.concatenate(buffer)
	                yield (config.mimi_sr, float32_to_int16(audio)), None

	                # Reset buffer and length
	                buffer = []
	                buffer_len = 0

	        # Handle any remaining audio in the buffer. np.concatenate returns a
	        # fresh array, so fading it does not touch the frames in total_buffer.
	        if buffer_len > 0:
	            final = _fade_out_tail(np.concatenate(buffer), config.mimi_sr)
	            yield (config.mimi_sr, float32_to_int16(final)), None

	        # Save the full audio to a file for download
	        if total_buffer:
	            full_audio = _fade_out_tail(
	                np.concatenate(total_buffer), config.mimi_sr
	            )

	            # NOTE(review): nothing in this function removes the file again,
	            # so one WAV is left in /tmp per synthesis.
	            file_path = f"/tmp/voxtream_{uuid.uuid4().hex}.wav"
	            sf.write(file_path, float32_to_int16(full_audio), config.mimi_sr)

	            yield None, gr.update(value=file_path, visible=True)
	        else:
	            yield None, gr.update(value=None, visible=False)

	    demo_app(config, demo_examples, synthesize_fn)


	# Start the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	main()