# Hugging Face Space: zero-shot TTS demo for the SMIIP-NV fine-tuned CosyVoice2 model.
import os
import sys

# Make the vendored Matcha-TTS and cosyvoice packages importable before any
# cosyvoice import below is attempted.
CODE_ROOT = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, os.path.join(CODE_ROOT, 'third_party', 'Matcha-TTS'))
sys.path.insert(0, os.path.join(CODE_ROOT, 'cosyvoice'))

import gradio as gr
import librosa
import numpy as np
import torch
import torchaudio
from huggingface_hub import snapshot_download

from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.common import set_all_random_seed
from cosyvoice.utils.file_utils import load_wav, logging

# Hugging Face repo holding the fine-tuned weights, and the subdirectory
# inside that repo that contains the actual CosyVoice2 checkpoint.
MODEL_REPO = "xunyi/SMIIP-NV_finetune_CosyVoice2"
SUBFOLDER = "pretrained_models/CosyVoice2-0.5B"

# Fetch the repo snapshot (reusing the local HF cache when present) and
# resolve the checkpoint directory inside it.
repo_local = snapshot_download(repo_id=MODEL_REPO, repo_type='model')
model_dir = os.path.join(repo_local, SUBFOLDER)

# Global settings.
max_val = 0.8          # peak-normalization ceiling applied in postprocess()
cosyvoice = None       # assigned once the model is loaded further below
prompt_sr = 16000      # sample rate the prompt audio is loaded at
default_data = None    # silent fallback clip, built after the model loads
def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim surrounding silence, peak-normalize, and append trailing padding.

    Args:
        speech: audio tensor; 1-D, or multi-channel with channels first
            (only the first channel is kept).
        top_db: silence threshold forwarded to ``librosa.effects.trim``.
        hop_length: hop size forwarded to ``librosa.effects.trim``.
        win_length: analysis frame length forwarded to ``librosa.effects.trim``.

    Returns:
        A ``(1, num_samples)`` float tensor on the same device as ``speech``,
        peak-limited to the module-level ``max_val`` and followed by 0.2 s of
        silence.
    """
    # Work on a mono NumPy copy for librosa.
    mono = speech.cpu().numpy()
    if mono.ndim > 1:
        mono = mono[0]

    # Strip leading/trailing silence.
    trimmed, _ = librosa.effects.trim(
        mono,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )

    # Back to a 2-D float tensor on the original device.
    out = torch.from_numpy(trimmed).to(speech.device).float()
    if out.dim() == 1:
        out = out.unsqueeze(0)

    # Rescale only when the signal exceeds the configured headroom.
    amp = out.abs().max()
    if amp > max_val:
        out = out / amp * max_val

    # Append 0.2 s of silence.
    # NOTE(review): the pad length uses cosyvoice.sample_rate while the prompt
    # itself is loaded at prompt_sr — presumably intentional; confirm upstream.
    silence = torch.zeros(
        1,
        int(cosyvoice.sample_rate * 0.2),
        device=out.device,
        dtype=out.dtype,
    )
    return torch.cat([out, silence], dim=1)
def generate_audio(tts_text, prompt_upload, prompt_record, prompt_text):
    """Run zero-shot TTS conditioned on a reference prompt recording.

    Args:
        tts_text: text to synthesise.
        prompt_upload: filepath of an uploaded prompt wav, or None.
        prompt_record: filepath of a microphone-recorded prompt, or None.
        prompt_text: transcript of the prompt audio.

    Returns:
        ``(sample_rate, waveform)`` on success; on inference failure a silent
        fallback clip at the model sample rate; ``None`` when the inputs are
        invalid (a ``gr.Info`` message is shown in that case).
    """
    global cosyvoice, default_data
    # Uploaded file takes precedence over a microphone recording.
    wav_path = prompt_upload or prompt_record
    if cosyvoice is None:
        # Fix: previously this path returned None silently, unlike every
        # other failure path which notifies the user via gr.Info.
        gr.Info('Model is not loaded yet.')
        return None
    if wav_path is None:
        gr.Info('Prompt audio not provided.')
        return None
    # Reject prompts recorded below the sample rate the model expects.
    try:
        info = torchaudio.info(wav_path)
        if info.sample_rate < prompt_sr:
            gr.Info(f"Sampling rate too low:{info.sample_rate} < {prompt_sr}")
            return None
    except Exception as e:
        gr.Info(f"Failed to read audio:{e}")
        return None
    if not prompt_text:
        gr.Info('Prompt text not provided.')
        return None
    # Load the prompt at prompt_sr, then trim/normalize it.
    try:
        wav_tensor = load_wav(wav_path, prompt_sr)
        prompt_tensor = postprocess(wav_tensor)
    except Exception as e:
        gr.Info(f"Error while processing audio:{e}")
        return None
    # Fixed seed so identical inputs reproduce identical audio.
    set_all_random_seed(0)
    logging.info("Inference Start")
    try:
        out = next(cosyvoice.inference_zero_shot(
            tts_text,
            prompt_text,
            prompt_tensor,
            stream=False,
            speed=1.0,
        ))
        audio = out['tts_speech'].numpy().flatten()
        return cosyvoice.sample_rate, audio
    except Exception as e:
        gr.Info(f"Inference fails:{e}")
        # Return a silent clip so the audio widget still updates.
        return cosyvoice.sample_rate, default_data
DEFAULT_PROMPT_PATH = os.path.join(CODE_ROOT, 'asset', 'default_prompt.wav')

# Build the Gradio interface. `demo` is launched in the __main__ guard.
with gr.Blocks() as demo:
    gr.Markdown("### SMIIP-NV CosyVoice2 (zero-shot) ")
    gr.Markdown("#### Using Huggingface CPU resources for computation ")
    # Default texts contain <crying> tags — presumably emotion markers the
    # fine-tuned model understands; confirm against the model card.
    synth_text = gr.Textbox(
        label="Text to be synthesised",
        lines=1,
        value="昨天翻出我们的旧照片<crying>,心里涌起无尽的感伤<crying>,回忆太过刺痛。",
    )
    with gr.Row():
        prompt_upload = gr.Audio(
            sources=['upload'],
            type='filepath',
            label='Upload Prompt',
            value=DEFAULT_PROMPT_PATH,
        )
        prompt_record = gr.Audio(
            sources=['microphone'],
            type='filepath',
            label='Record Prompt',
        )
    prompt_transcript = gr.Textbox(
        label="Prompt Text",
        lines=1,
        value="在这个孤独的夜晚<crying>,窗外的雨声让我想起了你,<crying>我真的好想你。",
    )
    generate_btn = gr.Button('Generate Audio')
    result_audio = gr.Audio(label='Generate results', autoplay=True)
    generate_btn.click(
        generate_audio,
        inputs=[synth_text, prompt_upload, prompt_record, prompt_transcript],
        outputs=[result_audio],
    )
# Initialize the model: try the CosyVoice loader first and fall back to
# CosyVoice2 — which loader matches the checkpoint is only known at load time.
try:
    cosyvoice = CosyVoice(model_dir)
    print("Loaded CosyVoice")
except Exception as e:
    # Fix: record why the first loader was rejected instead of silently
    # discarding the exception before falling back.
    logging.info("CosyVoice loader failed (%s); falling back to CosyVoice2", e)
    cosyvoice = CosyVoice2(model_dir)
    print("Loaded CosyVoice2")


def init_silence():
    """Return a one-second silent clip at the model's output sample rate."""
    return np.zeros(cosyvoice.sample_rate, dtype=np.float32)


# Fallback audio returned by generate_audio() when inference fails.
default_data = init_silence()

if __name__ == '__main__':
    demo.launch()