# mikuTTS / app.py
# Source: Hugging Face Space "mikuTTS" (NeoPy, commit 6a2fdd3) — the lines
# above were residue scraped from the HF file-viewer page, kept here as comments.
import spaces # ZeroGPU support on HF Spaces; in a Windows/local env, delete everything related to "spaces"
# Dummy @spaces.GPU-decorated function so the Space registers a GPU handler
# at startup (ZeroGPU requires at least one decorated callable).
@spaces.GPU
def gpu():
    pass
import asyncio
import datetime
import logging
import os
import time
import traceback
import edge_tts
import gradio as gr
import librosa
import torch
from huggingface_hub import snapshot_download
# Silence chatty third-party loggers.
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

# True when running inside a Hugging Face Space; enables the text/audio length caps below.
limitation = os.getenv("SYSTEM") == "spaces"

# Edge TTS
edge_output_filename = "edge_output.mp3"
# asyncio.run() creates and closes its own event loop. The previous
# asyncio.get_event_loop().run_until_complete(...) is deprecated when no loop
# is running and fails on Python 3.12+; tts() below already uses asyncio.run.
tts_voice_list = asyncio.run(edge_tts.list_voices())
# UI choices are "<ShortName>-<Gender>", e.g. "ja-JP-NanamiNeural-Female".
tts_voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]

# RVC models: every subdirectory of the downloaded snapshot is one model.
model_root = snapshot_download(repo_id="NoCrypt/miku_RVC", token=os.getenv("TOKEN", None))
models = sorted(d for d in os.listdir(model_root) if os.path.isdir(f"{model_root}/{d}"))

initial_md = """
![banner that says mikutts](https://huggingface.co/spaces/NoCrypt/mikuTTS/resolve/main/imgs/banner_mikutts.webp)
"""
from main.inference import run_inference_script
def tts(
    model_name,
    speed,
    tts_text,
    tts_voice,
    f0_up_key,
    f0_method,
    index_rate,
    protect,
    filter_radius=3,
    resample_sr=0,
):
    """Synthesize `tts_text` with edge-tts, then run RVC voice conversion.

    Parameters:
        model_name: RVC model directory name (one of `models`).
        speed: speech speed offset in percent, -100..100.
        tts_text: text to synthesize.
        tts_voice: entry from `tts_voices`, format "<ShortName>-<Gender>".
        f0_up_key: pitch transpose in semitones.
        f0_method: pitch extraction method ("pm" or "rmvpe").
        index_rate: feature index mix rate, 0..1.
        protect: consonant/breath protection, 0..0.5.
        filter_radius: median filter radius applied to the f0 curve.
        resample_sr: output resample rate; 0 keeps the model's rate.

    Returns:
        (info_message, edge_tts_audio_path_or_None, (sample_rate, audio) or None)
        — on failure the info message carries the error and audio slots are None.
    """
    print("------------------")
    print(datetime.datetime.now())
    print("tts_text:")
    print(tts_text)
    print(f"tts_voice: {tts_voice}, speed: {speed}")
    print(f"Model name: {model_name}")
    print(f"F0: {f0_method}, Key: {f0_up_key}, Index: {index_rate}, Protect: {protect}")
    try:
        if limitation and len(tts_text) > 1000:
            print("Error: Text too long")
            return (
                f"Text characters should be at most 1000 in this huggingface space, but got {len(tts_text)} characters.",
                None,
                None,
            )
        t0 = time.time()
        # edge-tts expects a signed percentage string, e.g. "+10%" / "-20%".
        if speed >= 0:
            speed_str = f"+{speed}%"
        else:
            speed_str = f"{speed}%"
        # The UI value is "<ShortName>-<Gender>" (see tts_voices); strip only the
        # trailing "-<Gender>" to recover the full ShortName, e.g.
        # "ja-JP-NanamiNeural-Female" -> "ja-JP-NanamiNeural".
        # (split("-")[0] would yield just "ja", which edge-tts rejects.)
        voice_name = tts_voice.rsplit("-", 1)[0]
        asyncio.run(
            edge_tts.Communicate(
                tts_text, voice_name, rate=speed_str
            ).save(edge_output_filename)
        )
        t1 = time.time()
        edge_time = t1 - t0

        # Load mono 16 kHz only to measure duration for the Space limit check.
        audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
        duration = len(audio) / sr
        print(f"Audio duration: {duration}s")
        if limitation and duration >= 200:
            print("Error: Audio too long")
            return (
                f"Audio should be less than 200 seconds in this huggingface space, but got {duration}s.",
                edge_output_filename,
                None,
            )

        # RVC voice conversion on the edge-tts output.
        audio_opt, times, tgt_sr = run_inference_script(
            model_name=model_name,
            input_path=edge_output_filename,
            pitch=f0_up_key,
            f0_method=f0_method,
            index_rate=index_rate,
            protect=protect,
            filter_radius=filter_radius,
            resample_sr=resample_sr,
        )
        # Chained comparison: tgt_sr differs from resample_sr AND resample_sr >= 16000,
        # i.e. report the requested resample rate when one was actually applied.
        if tgt_sr != resample_sr >= 16000:
            tgt_sr = resample_sr
        info = f"Success. Time: edge-tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
        print(info)
        return (
            info,
            edge_output_filename,
            (tgt_sr, audio_opt),
        )
    except EOFError:
        info = (
            "It seems that the edge-tts output is not valid. "
            "This may occur when the input text and the speaker do not match. "
            "For example, maybe you entered Japanese (without alphabets) text but chose non-Japanese speaker?"
        )
        print(info)
        return info, None, None
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate;
        # report the traceback to the UI instead of crashing the app.
        info = traceback.format_exc()
        print(info)
        return info, None, None
# --- Gradio UI ---
# NOTE(review): the original passed theme="NeoPy=Soft" to app.launch();
# Blocks.launch() has no `theme` parameter (it belongs to gr.Blocks), and HF
# theme ids use the "owner/name" form — presumably "NeoPy/Soft". TODO confirm.
with gr.Blocks(theme="NeoPy/Soft") as app:
    gr.Markdown(initial_md)
    gr.Markdown("# MikuTTS V3")
    gr.Markdown("# Modern - Stylish")
    # Row 1: model selection + conversion parameters.
    with gr.Row():
        with gr.Column():
            model_name = gr.Dropdown(
                label="Model",
                choices=models,
                value=models[0],
            )
            f0_key_up = gr.Number(
                label="Tune",
                value=6,
            )
        with gr.Column():
            f0_method = gr.Radio(
                label="Pitch extraction method (pm: very fast, low quality, rmvpe: a little slow, high quality)",
                choices=["pm", "rmvpe"],  # harvest and crepe is too slow
                value="rmvpe",
                interactive=True,
            )
            index_rate = gr.Slider(
                minimum=0,
                maximum=1,
                label="Index rate",
                value=1,
                interactive=True,
            )
            protect0 = gr.Slider(
                minimum=0,
                maximum=0.5,
                label="Protect",
                value=0.33,
                step=0.01,
                interactive=True,
            )
    # Row 2: TTS inputs on the left, outputs on the right.
    with gr.Row():
        with gr.Column():
            tts_voice = gr.Dropdown(
                label="Edge-tts speaker (format: language-Country-Name-Gender), make sure the gender matches the model",
                choices=tts_voices,
                allow_custom_value=False,
                value="ja-JP-NanamiNeural-Female",
            )
            speed = gr.Slider(
                minimum=-100,
                maximum=100,
                label="Speech speed (%)",
                value=0,
                step=10,
                interactive=True,
            )
            tts_text = gr.Textbox(label="Input Text", value="こんにちは、私の名前は初音ミクです!")
        with gr.Column():
            but0 = gr.Button("Convert", variant="primary")
            info_text = gr.Textbox(label="Output info", scale=4)
        with gr.Column():
            with gr.Accordion("Edge Voice", open=False):
                edge_tts_output = gr.Audio(label="Edge Voice", type="filepath")
            tts_output = gr.Audio(label="Result")
    # Wire the button to tts(); input order must match the tts() signature.
    but0.click(
        tts,
        [
            model_name,
            speed,
            tts_text,
            tts_voice,
            f0_key_up,
            f0_method,
            index_rate,
            protect0,
        ],
        [info_text, edge_tts_output, tts_output],
    )
    with gr.Row():
        examples = gr.Examples(
            examples_per_page=100,
            examples=[
                ["こんにちは、私の名前は初音ミクです!", "ja-JP-NanamiNeural-Female", 6],
                ["Hello there. My name is Hatsune Miku!", "en-CA-ClaraNeural-Female", 6],
                ["Halo. Nama saya Hatsune Miku!", "id-ID-GadisNeural-Female", 4],
                ["Halo. Jenengku Hatsune Miku!", "jv-ID-SitiNeural-Female", 10],
            ],
            inputs=[tts_text, tts_voice, f0_key_up],
        )

app.launch(ssr_mode=False)