# xunyi's picture
# Update app.py
# 14a76c6 verified
import os
import sys
# Add local code directories to path so the vendored CosyVoice / Matcha-TTS
# packages take priority over any installed versions.
CODE_ROOT = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(CODE_ROOT, 'third_party', 'Matcha-TTS'))
sys.path.insert(0, os.path.join(CODE_ROOT, 'cosyvoice'))
import gradio as gr
import numpy as np
import torch
import torchaudio
import librosa
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav, logging
from cosyvoice.utils.common import set_all_random_seed
from huggingface_hub import snapshot_download
# Model repository and location of weights within that repo
MODEL_REPO = "xunyi/SMIIP-NV_finetune_CosyVoice2"
SUBFOLDER = "pretrained_models/CosyVoice2-0.5B"
# Download model repo to cache and locate subfolder (runs at import time;
# first launch blocks here while the weights download)
repo_local = snapshot_download(repo_id=MODEL_REPO, repo_type='model')
model_dir = os.path.join(repo_local, SUBFOLDER)
# Global settings
max_val = 0.8        # peak-amplitude ceiling used by postprocess()
cosyvoice = None     # TTS model instance; assigned at the bottom of this file
prompt_sr = 16000    # minimum/target sample rate for prompt audio (Hz)
default_data = None  # fallback waveform (silence) returned when inference fails
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Clean up a prompt waveform before feeding it to the model.

    Trims leading/trailing silence, peak-normalises against the global
    ``max_val`` ceiling, and appends 0.2 s of trailing silence.

    Args:
        speech: torch tensor of audio samples, 1-D or (channels, samples).
        top_db / hop_length / win_length: parameters forwarded to
            ``librosa.effects.trim``.

    Returns:
        A ``(1, n_samples)`` float tensor on the same device as the input.
    """
    samples = speech.cpu().numpy()
    # librosa.effects.trim wants a mono 1-D array; keep the first channel
    if samples.ndim > 1:
        samples = samples[0]
    trimmed, _ = librosa.effects.trim(
        samples, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    out = torch.from_numpy(trimmed).to(speech.device).float()
    # Normalise shape to (1, n_samples)
    if out.dim() == 1:
        out = out.unsqueeze(0)
    # Scale down only when the peak exceeds the configured ceiling
    peak = out.abs().max()
    if peak > max_val:
        out = out / peak * max_val
    # Trailing 0.2 s of silence so playback does not end abruptly;
    # new_zeros matches the tensor's device and dtype
    silence = out.new_zeros(1, int(cosyvoice.sample_rate * 0.2))
    return torch.cat((out, silence), dim=1)
def generate_audio(tts_text, prompt_upload, prompt_record, prompt_text):
    """Synthesise ``tts_text`` in the voice of the supplied prompt audio.

    Args:
        tts_text: text to synthesise.
        prompt_upload: path to an uploaded prompt wav (takes priority).
        prompt_record: path to a microphone-recorded prompt wav.
        prompt_text: transcript of the prompt audio.

    Returns:
        ``(sample_rate, waveform)`` on success; ``None`` when a
        precondition fails (missing model/prompt/text, unreadable audio).
        On inference failure, returns silence (``default_data``) instead.
    """
    global cosyvoice, default_data
    # Uploaded file wins over the microphone recording
    wav_path = prompt_upload or prompt_record
    # Guard clauses: model loaded, prompt audio present
    if cosyvoice is None:
        return None
    if wav_path is None:
        gr.Info('Prompt audio not provided.')
        return None
    # Reject prompts recorded below the expected sample rate
    try:
        meta = torchaudio.info(wav_path)
        if meta.sample_rate < prompt_sr:
            gr.Info(f"Sampling rate too low:{meta.sample_rate} < {prompt_sr}")
            return None
    except Exception as e:
        gr.Info(f"Failed to read audio:{e}")
        return None
    if not prompt_text:
        gr.Info('Prompt text not provided.')
        return None
    # Load prompt at prompt_sr, then trim/normalise/pad it
    try:
        prompt_tensor = postprocess(load_wav(wav_path, prompt_sr))
    except Exception as e:
        gr.Info(f"Error while processing audio:{e}")
        return None
    # Fixed seed for reproducible synthesis
    set_all_random_seed(0)
    logging.info("Inference Start")
    try:
        # Non-streaming: take the single chunk the generator yields
        result = next(cosyvoice.inference_zero_shot(
            tts_text,
            prompt_text,
            prompt_tensor,
            stream=False,
            speed=1.0,
        ))
        return cosyvoice.sample_rate, result['tts_speech'].numpy().flatten()
    except Exception as e:
        gr.Info(f"Inference fails:{e}")
        # Fall back to silence so the audio player still receives output
        return cosyvoice.sample_rate, default_data
# Bundled prompt wav shipped with the app; used as the default upload value
DEFAULT_PROMPT_PATH = os.path.join(CODE_ROOT, 'asset', 'default_prompt.wav')
# Build Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("### SMIIP-NV CosyVoice2 (zero-shot) ")
    gr.Markdown("#### Using Huggingface CPU resources for computation ")
    # NOTE(review): the default texts embed <crying> tags — presumably emotion
    # markers understood by the fine-tuned model; confirm against training data.
    tts_input = gr.Textbox(label="Text to be synthesised", lines=1,
                           value="昨天翻出我们的旧照片<crying>,心里涌起无尽的感伤<crying>,回忆太过刺痛。")
    # Two prompt sources; generate_audio prefers the upload over the recording
    with gr.Row():
        upload = gr.Audio(sources=['upload'], type='filepath', label='Upload Prompt', value=DEFAULT_PROMPT_PATH)
        record = gr.Audio(sources=['microphone'], type='filepath', label='Record Prompt')
    # Transcript of the prompt audio (required by zero-shot inference)
    text_prompt = gr.Textbox(label="Prompt Text", lines=1, value="在这个孤独的夜晚<crying>,窗外的雨声让我想起了你,<crying>我真的好想你。")
    btn = gr.Button('Generate Audio')
    out_audio = gr.Audio(label='Generate results', autoplay=True)
    btn.click(generate_audio, inputs=[tts_input, upload, record, text_prompt], outputs=[out_audio])
# Initialize model: try the CosyVoice loader first and fall back to CosyVoice2
# if it raises (broad except — any failure in the first loader, not just an
# incompatible checkpoint, triggers the fallback).
try:
    cosyvoice = CosyVoice(model_dir)
    print("Loaded CosyVoice")
except Exception:
    cosyvoice = CosyVoice2(model_dir)
    print("Loaded CosyVoice2")
def init_silence(seconds=1.0):
    """Return ``seconds`` of silence at the model's output sample rate.

    Generalised from a hard-coded one-second buffer; the default preserves
    the original behaviour. Used as the fallback waveform handed to the
    Gradio audio player when inference fails.

    Args:
        seconds: duration of silence to generate (default 1.0).

    Returns:
        A float32 NumPy array of zeros with
        ``int(cosyvoice.sample_rate * seconds)`` samples.
    """
    return np.zeros(int(cosyvoice.sample_rate * seconds), dtype=np.float32)
# Precompute the silence fallback now that the model (and its sample rate) exists
default_data = init_silence()
if __name__ == '__main__':
    demo.launch()