# NOTE: Captured from the Hugging Face Spaces page, whose status banner
# read "Runtime error" at the time this source was taken.
import gradio as gr
import numpy as np
import spaces
import torch

from synthesize import synthesize

# Fixed model configuration used for the import-time warm-up synthesis.
_WARMUP_KWARGS = dict(
    text="Hello",
    duration_model_config="./train_duration_dit_s.yaml",
    acoustic_model_config="./train_acoustic_dit_b.yaml",
    duration_model_checkpoint="./duration_model_0120000.pt",
    acoustic_model_checkpoint="./acoustic_model_0140000.pt",
    speaker_id=0,
    cfg_scale=4.0,
    num_sampling_steps=100,
)

# Run one synthesis at import time so the models are loaded before the
# first user request hits the Gradio interface.
audio, sample_rate = synthesize(**_WARMUP_KWARGS)
def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
    """Synthesize speech from ``text`` and return it for a ``gr.Audio`` output.

    Parameters
    ----------
    text : str
        Input text to synthesize.
    speaker_id : str | int
        Speaker index. The UI dropdown's choices are *strings* ("0".."99"),
        so this usually arrives as ``str`` and is coerced to ``int`` below.
    cfg_scale : float
        Classifier-free-guidance scale from the slider (0-10).
    num_sampling_steps : int
        Number of diffusion sampling steps.

    Returns
    -------
    tuple
        ``(sample_rate, audio)`` — the format ``gr.Audio`` expects.
    """
    audio, sample_rate = synthesize(
        text=text,
        duration_model_config="./train_duration_dit_s.yaml",
        acoustic_model_config="./train_acoustic_dit_b.yaml",
        duration_model_checkpoint="./duration_model_0120000.pt",
        acoustic_model_checkpoint="./acoustic_model_0140000.pt",
        # BUG FIX: the Speaker ID dropdown yields a string, but synthesize
        # expects an int (the warm-up call passes speaker_id=0). Coerce the
        # numeric inputs defensively.
        speaker_id=int(speaker_id),
        cfg_scale=float(cfg_scale),
        num_sampling_steps=int(num_sampling_steps),
    )
    return (sample_rate, audio)
# Dropdown option lists for the UI: 100 speaker labels ("0".."99") and the
# supported diffusion sampling-step counts.
speaker_ids = list(map(str, range(100)))
sampling_steps = [100, 250, 500, 1000]
# Build the Gradio UI. Inputs map one-to-one onto text_to_speech's
# parameters (text, speaker_id, cfg_scale, num_sampling_steps).
_input_widgets = [
    gr.Textbox(label="Text", value="Text to Speech with Diffusion Transformer"),
    gr.Dropdown(choices=speaker_ids, label="Speaker ID", value="0"),
    gr.Slider(minimum=0, maximum=10, value=4.0, label="CFG Scale"),
    gr.Dropdown(choices=sampling_steps, label="Sampling Steps", value=100),
]

demo = gr.Interface(
    fn=text_to_speech,
    inputs=_input_widgets,
    outputs=gr.Audio(label="Generated Speech"),
    title="Text to Speech with Diffusion Transformer",
    description="Enter text, select a speaker ID (0-99), and adjust the CFG scale to generate speech.",
    flagging_options=None,
)

demo.launch()