# xunyi's picture
# Update app.py
# 14a76c6 verified
import os
import sys
# Add local code directories to path so the vendored CosyVoice / Matcha-TTS
# packages take priority over any installed versions.
CODE_ROOT = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(CODE_ROOT, 'third_party', 'Matcha-TTS'))
sys.path.insert(0, os.path.join(CODE_ROOT, 'cosyvoice'))
import gradio as gr
import numpy as np
import torch
import torchaudio
import librosa
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav, logging
from cosyvoice.utils.common import set_all_random_seed
from huggingface_hub import snapshot_download
# Model repository and location of weights within that repo
MODEL_REPO = "xunyi/SMIIP-NV_finetune_CosyVoice2"
SUBFOLDER = "pretrained_models/CosyVoice2-0.5B"
# Download model repo to cache and locate subfolder (runs at import time;
# first launch blocks here while the weights download)
repo_local = snapshot_download(repo_id=MODEL_REPO, repo_type='model')
model_dir = os.path.join(repo_local, SUBFOLDER)
# Global settings
max_val = 0.8        # peak-amplitude ceiling used by postprocess()
cosyvoice = None     # TTS model instance; assigned at the bottom of this file
prompt_sr = 16000    # minimum/target sample rate for prompt audio (Hz)
default_data = None  # fallback waveform (silence) returned when inference fails
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Clean up a prompt waveform before feeding it to the model.

    Trims leading/trailing silence, peak-normalises against the global
    ``max_val`` ceiling, and appends 0.2 s of trailing silence.

    Args:
        speech: torch tensor of audio samples, 1-D or (channels, samples).
        top_db / hop_length / win_length: parameters forwarded to
            ``librosa.effects.trim``.

    Returns:
        A ``(1, n_samples)`` float tensor on the same device as the input.
    """
    samples = speech.cpu().numpy()
    # librosa.effects.trim wants a mono 1-D array; keep the first channel
    if samples.ndim > 1:
        samples = samples[0]
    trimmed, _ = librosa.effects.trim(
        samples, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    out = torch.from_numpy(trimmed).to(speech.device).float()
    # Normalise shape to (1, n_samples)
    if out.dim() == 1:
        out = out.unsqueeze(0)
    # Scale down only when the peak exceeds the configured ceiling
    peak = out.abs().max()
    if peak > max_val:
        out = out / peak * max_val
    # Trailing 0.2 s of silence so playback does not end abruptly;
    # new_zeros matches the tensor's device and dtype
    silence = out.new_zeros(1, int(cosyvoice.sample_rate * 0.2))
    return torch.cat((out, silence), dim=1)
def generate_audio(tts_text, prompt_upload, prompt_record, prompt_text):
    """Synthesise ``tts_text`` in the voice of the supplied prompt audio.

    Args:
        tts_text: text to synthesise.
        prompt_upload: path to an uploaded prompt wav (takes priority).
        prompt_record: path to a microphone-recorded prompt wav.
        prompt_text: transcript of the prompt audio.

    Returns:
        ``(sample_rate, waveform)`` on success; ``None`` when a
        precondition fails (missing model/prompt/text, unreadable audio).
        On inference failure, returns silence (``default_data``) instead.
    """
    global cosyvoice, default_data
    # Uploaded file wins over the microphone recording
    wav_path = prompt_upload or prompt_record
    # Guard clauses: model loaded, prompt audio present
    if cosyvoice is None:
        return None
    if wav_path is None:
        gr.Info('Prompt audio not provided.')
        return None
    # Reject prompts recorded below the expected sample rate
    try:
        meta = torchaudio.info(wav_path)
        if meta.sample_rate < prompt_sr:
            gr.Info(f"Sampling rate too low:{meta.sample_rate} < {prompt_sr}")
            return None
    except Exception as e:
        gr.Info(f"Failed to read audio:{e}")
        return None
    if not prompt_text:
        gr.Info('Prompt text not provided.')
        return None
    # Load prompt at prompt_sr, then trim/normalise/pad it
    try:
        prompt_tensor = postprocess(load_wav(wav_path, prompt_sr))
    except Exception as e:
        gr.Info(f"Error while processing audio:{e}")
        return None
    # Fixed seed for reproducible synthesis
    set_all_random_seed(0)
    logging.info("Inference Start")
    try:
        # Non-streaming: take the single chunk the generator yields
        result = next(cosyvoice.inference_zero_shot(
            tts_text,
            prompt_text,
            prompt_tensor,
            stream=False,
            speed=1.0,
        ))
        return cosyvoice.sample_rate, result['tts_speech'].numpy().flatten()
    except Exception as e:
        gr.Info(f"Inference fails:{e}")
        # Fall back to silence so the audio player still receives output
        return cosyvoice.sample_rate, default_data
# Bundled prompt wav shipped with the app; used as the default upload value
DEFAULT_PROMPT_PATH = os.path.join(CODE_ROOT, 'asset', 'default_prompt.wav')
# Build Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("### SMIIP-NV CosyVoice2 (zero-shot) ")
    gr.Markdown("#### Using Huggingface CPU resources for computation ")
    # NOTE(review): the default texts embed <crying> tags — presumably emotion
    # markers understood by the fine-tuned model; confirm against training data.
    tts_input = gr.Textbox(label="Text to be synthesised", lines=1,
                           value="昨天翻出我们的旧照片<crying>,心里涌起无尽的感伤<crying>,回忆太过刺痛。")
    # Two prompt sources; generate_audio prefers the upload over the recording
    with gr.Row():
        upload = gr.Audio(sources=['upload'], type='filepath', label='Upload Prompt', value=DEFAULT_PROMPT_PATH)
        record = gr.Audio(sources=['microphone'], type='filepath', label='Record Prompt')
    # Transcript of the prompt audio (required by zero-shot inference)
    text_prompt = gr.Textbox(label="Prompt Text", lines=1, value="在这个孤独的夜晚<crying>,窗外的雨声让我想起了你,<crying>我真的好想你。")
    btn = gr.Button('Generate Audio')
    out_audio = gr.Audio(label='Generate results', autoplay=True)
    btn.click(generate_audio, inputs=[tts_input, upload, record, text_prompt], outputs=[out_audio])
# Initialize model: try the CosyVoice loader first and fall back to CosyVoice2
# if it raises (broad except — any failure in the first loader, not just an
# incompatible checkpoint, triggers the fallback).
try:
    cosyvoice = CosyVoice(model_dir)
    print("Loaded CosyVoice")
except Exception:
    cosyvoice = CosyVoice2(model_dir)
    print("Loaded CosyVoice2")
def init_silence(seconds=1.0):
    """Return ``seconds`` of silence at the model's output sample rate.

    Generalised from a hard-coded one-second buffer; the default preserves
    the original behaviour. Used as the fallback waveform handed to the
    Gradio audio player when inference fails.

    Args:
        seconds: duration of silence to generate (default 1.0).

    Returns:
        A float32 NumPy array of zeros with
        ``int(cosyvoice.sample_rate * seconds)`` samples.
    """
    return np.zeros(int(cosyvoice.sample_rate * seconds), dtype=np.float32)
# Precompute the silence fallback now that the model (and its sample rate) exists
default_data = init_silence()
if __name__ == '__main__':
    demo.launch()