"""Gradio demo application for the CosyVoice speech model.

The module tries to load CosyVoice at import time and exposes two callbacks
to the UI: ``process_audio`` (reports basic facts about an uploaded clip) and
``text_to_speech`` (synthesizes speech, or a placeholder tone when no model
could be loaded).
"""

import os
import sys
import tempfile
import traceback
from collections.abc import Iterator
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torchaudio

# Title and description shown at the top of the page.
title = "CosyVoice - 语音处理模型"
description = """
CosyVoice是一个先进的语音处理模型,支持语音识别、语音合成等功能。
上传音频文件或使用麦克风录制语音,体验CosyVoice的强大功能。
"""

# Hugging Face repository the loader pulls from.
MODEL_REPO_ID = "FunAudioLLM/CosyVoice-300M"
# Sample rate assumed whenever the loaded model does not report its own.
DEFAULT_SAMPLE_RATE = 22050
# Prompt transcript used for zero-shot synthesis when the user supplies none.
DEFAULT_PROMPT_TEXT = (
    'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。'
)
# Locations probed for a bundled zero-shot prompt clip, in priority order.
PROMPT_AUDIO_CANDIDATES = (
    './asset/zero_shot_prompt.wav',
    './CosyVoice/asset/zero_shot_prompt.wav',
    './zero_shot_prompt.wav',
)
_BANNER = "=" * 60

# Markdown shown inside the zero-shot accordion. Kept at column 0 so the
# bullet list is never rendered as an indented code block.
ZERO_SHOT_HELP_MD = """
**使用说明:**
- 上传一段3-10秒的参考音频
- 输入音频对应的文字内容
- 格式:`You are a helpful assistant.<|endofprompt|>音频对应的文字`
- 系统将克隆该音频的音色来合成新文本
"""

# Markdown shown on the "about" tab.
ABOUT_MD = """
## CosyVoice 模型

CosyVoice是一个先进的语音处理模型,具有以下特点:

- 高质量的语音识别
- 自然的语音合成
- 多语言支持
- 实时处理能力

### 使用方法

1. 在"语音识别"标签页上传音频文件进行识别
2. 在"文本转语音"标签页输入文本生成语音
3. 支持麦克风实时录制

### 技术特性

- 基于Transformer架构
- 支持多种音频格式
- 高精度识别和合成
"""

# Module-level cache for the loaded model and a flag recording that a load
# attempt (successful or not) has already happened.
cosyvoice_model = None
model_loaded = False


def load_cosyvoice_model():
    """Load the CosyVoice model once and cache the result.

    Three strategies are attempted in order:

    1. the official ``cosyvoice`` package (used when it can be imported),
    2. ``transformers.AutoModel`` with ``trust_remote_code=True``,
    3. downloading the raw model files only (no inference engine).

    Returns:
        A dict with the keys ``model``, ``type``, ``has_inference`` and
        ``sample_rate`` when an inference-capable model was loaded, otherwise
        ``None`` (the app then runs in demo mode). Every outcome sets
        ``model_loaded`` so the expensive attempt is not repeated.
    """
    global cosyvoice_model, model_loaded

    if model_loaded:
        return cosyvoice_model

    print("\n" + _BANNER)
    print("正在加载CosyVoice模型...")
    print(_BANNER)

    try:
        # Strategy 1: the official CosyVoice package.
        print("\n尝试使用官方 CosyVoice 包...")
        try:
            # Make the vendored Matcha-TTS dependency importable if present.
            third_party_path = (
                Path(__file__).resolve().parent / 'third_party' / 'Matcha-TTS'
            )
            if third_party_path.exists():
                sys.path.insert(0, str(third_party_path))

            from cosyvoice.cli.cosyvoice import CosyVoice
            from huggingface_hub import snapshot_download

            print(f"从 {MODEL_REPO_ID} 加载...")
            # Download the model snapshot into a local cache directory.
            model_dir = snapshot_download(
                repo_id=MODEL_REPO_ID, cache_dir="./models"
            )
            cosyvoice = CosyVoice(model_dir=model_dir)

            cosyvoice_model = {
                'model': cosyvoice,
                'type': 'cosyvoice_official',
                'has_inference': True,
                'sample_rate': getattr(
                    cosyvoice, 'sample_rate', DEFAULT_SAMPLE_RATE
                ),
            }
            model_loaded = True
            print("✓ 成功使用官方 CosyVoice 包加载模型")
            print(_BANNER + "\n")
            return cosyvoice_model
        except ImportError as ie:
            # Only a missing package falls through to the next strategy; any
            # other failure is handled by the outer ``except`` (demo mode).
            print(f"⚠ 官方 CosyVoice 包不可用: {ie}")
            print(" 尝试其他加载方式...")

        # Strategy 2: transformers AutoModel (requires trust_remote_code).
        print("\n尝试使用 transformers AutoModel...")
        try:
            from transformers import AutoModel

            print(f"从 {MODEL_REPO_ID} 加载...")
            model = AutoModel.from_pretrained(
                MODEL_REPO_ID,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True,
            )
            model.eval()

            # Report which inference entry points the loaded object exposes.
            has_inference_sft = hasattr(model, 'inference_sft')
            has_inference_zero_shot = hasattr(model, 'inference_zero_shot')
            has_inference_cross_lingual = hasattr(
                model, 'inference_cross_lingual'
            )
            print(f"模型类型: {type(model).__name__}")
            print(f"推理方法:")
            print(f" - inference_sft: {has_inference_sft}")
            print(f" - inference_zero_shot: {has_inference_zero_shot}")
            print(f" - inference_cross_lingual: {has_inference_cross_lingual}")

            if not (has_inference_sft or has_inference_zero_shot):
                print("⚠ 模型缺少必要的推理方法")
                raise ValueError("Model missing inference methods")

            cosyvoice_model = {
                'model': model,
                'type': 'transformers',
                'has_inference': True,
                'sample_rate': getattr(
                    model, 'sample_rate', DEFAULT_SAMPLE_RATE
                ),
            }
            model_loaded = True
            print("✓ 成功使用 transformers 加载模型")
            print(_BANNER + "\n")
            return cosyvoice_model
        except Exception as te:
            print(f"⚠ transformers 加载失败: {te}")
            traceback.print_exc()

        # Strategy 3: download the model files only; the app stays in demo
        # mode because no inference engine is available.
        print("\n尝试下载模型文件...")
        from huggingface_hub import snapshot_download

        model_dir = snapshot_download(
            repo_id=MODEL_REPO_ID,
            allow_patterns=[
                "*.pt", "*.pth", "*.bin", "*.json",
                "*.yaml", "*.txt", "*.safetensors",
            ],
            cache_dir="./models",
        )
        print(f"✓ 模型文件已下载到: {model_dir}")
        print("\n⚠ 注意: 模型文件已下载,但无法加载推理引擎")
        print(" 建议:")
        print(" 1. 安装完整的 CosyVoice 包: pip install cosyvoice")
        print(" 2. 或在 Hugging Face Space 中使用演示模式")
        print(_BANNER + "\n")
        cosyvoice_model = None
        model_loaded = True
        return None
    except Exception as e:
        print(f"✗ 模型加载失败: {e}")
        print(f"详细错误:\n{traceback.format_exc()}")
        print("\n⚠ 使用演示模式")
        print("提示: 要使用完整功能,请:")
        print(" 1. 确保网络连接正常")
        print(" 2. 确保有足够的磁盘空间(约2GB)")
        print(" 3. 安装 CosyVoice: pip install cosyvoice")
        print(_BANNER + "\n")
        cosyvoice_model = None
        model_loaded = True
        return None


def process_audio(audio_file):
    """Describe an uploaded or recorded audio clip.

    Despite the tab label, no speech recognition is performed: the function
    only reports sample rate, duration and array details, plus whether the
    CosyVoice model is loaded.

    Args:
        audio_file: Either a ``(sample_rate, data)`` tuple, or a path that
            ``soundfile.read`` can open. ``None`` means nothing was provided.

    Returns:
        A human-readable status string (never raises).
    """
    if audio_file is None:
        return "请上传音频文件"

    try:
        model = load_cosyvoice_model()

        if isinstance(audio_file, tuple):
            sample_rate, audio_data = audio_file
        else:
            # Anything that is not a tuple is treated as a file path.
            audio_data, sample_rate = sf.read(audio_file)

        duration = len(audio_data) / sample_rate

        if model is not None:
            lines = [
                "",
                "✓ 音频处理成功",
                "",
                "音频信息:",
                f"- 采样率: {sample_rate} Hz",
                f"- 时长: {duration:.2f} 秒",
                f"- 数据形状: {audio_data.shape}",
                f"- 数据类型: {audio_data.dtype}",
                "",
                "模型状态: CosyVoice模型已加载",
                "注意: CosyVoice主要用于语音合成(TTS),如需语音识别请使用ASR模型",
                "",
            ]
        else:
            # Demo mode: the model could not be loaded.
            lines = [
                "",
                "音频信息:",
                f"- 采样率: {sample_rate} Hz",
                f"- 时长: {duration:.2f} 秒",
                f"- 数据点数: {len(audio_data)}",
                "",
                "⚠ 演示模式(模型未加载)",
                "提示: 请确保安装CosyVoice模型以使用完整功能",
                "",
            ]
        return "\n".join(lines)
    except Exception as e:
        return f"处理失败: {str(e)}"


def _to_mono_float32(audio):
    """Convert a tensor or array-like waveform to a flat float32 array."""
    if torch.is_tensor(audio):
        audio = audio.detach().cpu().numpy()
    audio = np.asarray(audio)
    if audio.ndim > 1:
        # Collapse e.g. a (1, N) channel-first result into N samples.
        audio = audio.flatten()
    return audio.astype(np.float32)


def _collect_stream(chunks):
    """Concatenate the chunks produced by an inference generator.

    Each chunk may be a dict carrying the waveform under ``'tts_speech'`` or
    the waveform itself. Returns ``None`` when nothing was produced.
    """
    pieces = [
        chunk['tts_speech']
        if isinstance(chunk, dict) and 'tts_speech' in chunk
        else chunk
        for chunk in chunks
    ]
    if not pieces:
        return None
    if torch.is_tensor(pieces[0]):
        merged = torch.cat(pieces, dim=-1)
    else:
        merged = np.concatenate(pieces, axis=-1)
    return _to_mono_float32(merged)


def _extract_waveform(outputs):
    """Pull a waveform out of whatever an inference call returned.

    Handles dicts (keys ``tts_speech``, ``audio``, ``waveform``, else the
    first value), lists/tuples (first element), generators (all chunks are
    concatenated) and bare tensors/arrays. Returns ``None`` for empty output.
    """
    if isinstance(outputs, dict):
        for key in ('tts_speech', 'audio', 'waveform'):
            if key in outputs:
                outputs = outputs[key]
                break
        else:
            if not outputs:
                return None
            outputs = next(iter(outputs.values()))
    elif isinstance(outputs, (list, tuple)):
        if not outputs:
            return None
        outputs = outputs[0]
    elif isinstance(outputs, Iterator):
        return _collect_stream(outputs)
    return _to_mono_float32(outputs)


def _result_or_demo(audio, sample_rate, status, text, speaker):
    """Return ``((sample_rate, audio), status)`` or fall back to demo audio.

    The fallback guarantees the UI always receives an ``(audio, status)``
    pair, even when the model produced no samples.
    """
    if audio is None or audio.size == 0:
        return generate_demo_audio(text, speaker, error="模型未返回任何音频")
    return (sample_rate, audio), status


def _synthesize_official(model, text, speaker, prompt_audio, prompt_text):
    """Synthesize with the official CosyVoice API (zero-shot or SFT)."""
    cosyvoice = model['model']
    sample_rate = model.get('sample_rate', DEFAULT_SAMPLE_RATE)
    print(f"使用官方CosyVoice API: text={text[:50]}...")

    try:
        # An empty Textbox arrives as "", not None, so test truthiness.
        if not prompt_text or not prompt_text.strip():
            prompt_text = DEFAULT_PROMPT_TEXT

        if not prompt_audio:
            # Fall back to a sample prompt clip shipped with the project.
            for path in PROMPT_AUDIO_CANDIDATES:
                if os.path.exists(path):
                    prompt_audio = path
                    print(f"使用提示音频: {path}")
                    break

        if prompt_audio and os.path.exists(prompt_audio):
            # Zero-shot voice cloning from the prompt clip.
            print(
                f"使用 inference_zero_shot: text={text[:30]}, "
                f"prompt={prompt_text[:50]}"
            )
            audio = _collect_stream(
                cosyvoice.inference_zero_shot(
                    text, prompt_text, prompt_audio, stream=False
                )
            )
            status = (
                f"✓ 语音合成成功\n文本: {text}\n模式: Zero-shot\n"
                f"模型: CosyVoice (官方API)"
            )
        else:
            # No prompt clip available: use a built-in speaker.
            # NOTE(review): the speaker IDs accepted depend on the checkpoint
            # that was loaded - confirm against the model in use.
            print(f"使用 inference_sft: text={text[:30]}, speaker={speaker}")
            audio = _collect_stream(
                cosyvoice.inference_sft(text, speaker, stream=False)
            )
            status = (
                f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n"
                f"模型: CosyVoice (官方API - SFT)"
            )
    except Exception as e:
        print(f"CosyVoice API 调用失败: {e}")
        traceback.print_exc()
        # Return demo audio and surface the error in the status box.
        return generate_demo_audio(
            text, speaker, error=f"API调用失败: {str(e)}"
        )

    return _result_or_demo(audio, sample_rate, status, text, speaker)


def _synthesize_transformers(model, text, speaker):
    """Synthesize with a model loaded through ``transformers.AutoModel``."""
    tts_model = model['model']
    sample_rate = model.get('sample_rate', DEFAULT_SAMPLE_RATE)

    with torch.no_grad():
        # Try the known entry points from most to least specific.
        if hasattr(tts_model, 'inference_sft'):
            print(f"使用inference_sft方法: text={text}, speaker={speaker}")
            outputs = tts_model.inference_sft(text, speaker)
        elif hasattr(tts_model, 'inference'):
            print(f"使用inference方法: text={text}, speaker={speaker}")
            outputs = tts_model.inference(text, speaker)
        elif hasattr(tts_model, 'generate'):
            print(f"使用generate方法: text={text}")
            outputs = tts_model.generate(text=text, speaker=speaker)
        else:
            print(f"模型没有可用的推理方法")
            public_names = [
                m for m in dir(tts_model) if not m.startswith('_')
            ][:20]
            print(f"可用方法: {public_names}")
            return generate_demo_audio(
                text,
                speaker,
                error="模型缺少推理方法 (inference_sft/inference/generate)",
            )
        audio = _extract_waveform(outputs)

    status = (
        f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n"
        f"模型: CosyVoice (transformers)"
    )
    return _result_or_demo(audio, sample_rate, status, text, speaker)


def _synthesize_pytorch(model, text, speaker):
    """Synthesize with a plain PyTorch model entry (``type == 'pytorch'``)."""
    pytorch_model = model['model']
    try:
        if hasattr(pytorch_model, 'inference_sft'):
            output = pytorch_model.inference_sft(text, speaker)
        elif hasattr(pytorch_model, 'inference'):
            output = pytorch_model.inference(text, speaker)
        else:
            return generate_demo_audio(
                text, speaker, error="PyTorch模型缺少推理方法"
            )
        audio = _extract_waveform(output)
    except Exception as e:
        return generate_demo_audio(
            text, speaker, error=f"PyTorch推理失败: {str(e)}"
        )

    status = f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: PyTorch"
    return _result_or_demo(audio, DEFAULT_SAMPLE_RATE, status, text, speaker)


def _run_inference(model, text, speaker, prompt_audio, prompt_text):
    """Dispatch to the synthesis routine that matches the loaded model."""
    if isinstance(model, dict):
        model_type = model.get('type', 'unknown')
        if model_type == 'cosyvoice_official':
            return _synthesize_official(
                model, text, speaker, prompt_audio, prompt_text
            )
        if model_type == 'transformers':
            return _synthesize_transformers(model, text, speaker)
        if model_type == 'pytorch':
            return _synthesize_pytorch(model, text, speaker)
        # 'downloaded' (files only) and unknown types cannot run inference.
        return generate_demo_audio(text, speaker)

    # A bare model object rather than the dict produced by the loader.
    if hasattr(model, 'inference_sft'):
        audio = _extract_waveform(model.inference_sft(text, speaker))
        status = (
            f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n"
            f"模型: CosyVoice官方"
        )
    elif hasattr(model, 'inference'):
        audio = _extract_waveform(model.inference(text, speaker))
        status = f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}"
    else:
        return generate_demo_audio(text, speaker)
    return _result_or_demo(audio, DEFAULT_SAMPLE_RATE, status, text, speaker)


def text_to_speech(text, speaker="中文女", prompt_audio=None, prompt_text=None):
    """Convert text to speech with CosyVoice.

    Args:
        text: Text to synthesize.
        speaker: Speaker name (used in SFT mode).
        prompt_audio: Path to a prompt clip (used in zero-shot mode).
        prompt_text: Transcript of the prompt clip (used in zero-shot mode);
            an empty value selects the built-in default prompt.

    Returns:
        A ``(audio, status)`` pair. ``audio`` is ``(sample_rate, samples)``
        with float32 samples, or ``None`` when the input text is empty or an
        unexpected error occurred. When the model is unavailable or inference
        fails, a placeholder tone is returned and ``status`` says so.
    """
    if not text or text.strip() == "":
        return None, "请输入要转换的文本"

    try:
        model = load_cosyvoice_model()
        if model is None:
            # Demo mode.
            return generate_demo_audio(text, speaker)

        try:
            return _run_inference(
                model, text, speaker, prompt_audio, prompt_text
            )
        except Exception as e:
            print(f"模型推理错误: {str(e)}")
            # Fall back to demo audio so the UI still gets a playable clip.
            return generate_demo_audio(text, speaker, error=str(e))
    except Exception as e:
        return None, f"语音合成失败: {str(e)}"


def generate_demo_audio(text, speaker, error=None):
    """Generate a placeholder tone for use when the model is unavailable.

    Args:
        text: Requested text; its length sets the clip duration
            (0.2 s per character, capped at 5 s).
        speaker: Requested speaker, echoed in the status message.
        error: Optional error description appended to the status message.

    Returns:
        ``((sample_rate, samples), status_message)`` where ``samples`` is a
        float32 array containing a 440 Hz tone mixed with a 660 Hz tone.
    """
    sample_rate = DEFAULT_SAMPLE_RATE
    duration = min(len(text or "") * 0.2, 5.0)
    t = np.linspace(0, duration, int(sample_rate * duration), False)

    frequency = 440
    audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)
    audio_data += 0.2 * np.sin(2 * np.pi * frequency * 1.5 * t)

    # 100 ms fade in/out, clamped so the two ramps never exceed the clip
    # (an empty clip would otherwise fail with a shape mismatch).
    fade_samples = min(int(sample_rate * 0.1), len(audio_data) // 2)
    if fade_samples > 0:
        audio_data[:fade_samples] *= np.linspace(0, 1, fade_samples)
        audio_data[-fade_samples:] *= np.linspace(1, 0, fade_samples)

    audio_tuple = (sample_rate, audio_data.astype(np.float32))

    status_msg = f"⚠ 演示模式\n文本: {text}\n说话人: {speaker}\n"
    if error:
        status_msg += f"错误: {error}\n"
    status_msg += "提示: 这是演示音频,不是真实的语音合成结果。请确保模型正确加载。"
    return audio_tuple, status_msg


# Load the model at start-up so the UI can show its status.
load_cosyvoice_model()

# Summary of the load result.
if cosyvoice_model is not None:
    model_status_msg = "✓ CosyVoice模型已成功加载"
    model_status_color = "green"
else:
    model_status_msg = "⚠ 演示模式(模型未加载)"
    model_status_color = "orange"

# Build the Gradio interface. Theme creation is best-effort.
try:
    theme = gr.themes.Soft()
except Exception:
    theme = None

with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    # Model status banner.
    if cosyvoice_model is not None:
        status_emoji = "✅"
        status_text = "CosyVoice模型已成功加载并可用"
        status_style = (
            "background-color: #d4edda; padding: 10px; border-radius: 5px; "
            "border-left: 4px solid #28a745;"
        )
    else:
        status_emoji = "⚠️"
        status_text = "演示模式 - 模型未加载。要使用完整功能,请安装CosyVoice模型。"
        status_style = (
            "background-color: #fff3cd; padding: 10px; border-radius: 5px; "
            "border-left: 4px solid #ffc107;"
        )
    gr.HTML(
        f'<div style="{status_style}">'
        f'{status_emoji} 模型状态: {status_text}'
        f'</div>'
    )

    with gr.Tab("语音识别"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="上传音频文件或录制语音",
                    type="numpy",
                    sources=["upload", "microphone"],
                )
                process_btn = gr.Button("处理音频", variant="primary")
            with gr.Column():
                output_text = gr.Textbox(
                    label="识别结果",
                    lines=5,
                    placeholder="识别结果将显示在这里...",
                )
        process_btn.click(
            fn=process_audio,
            inputs=audio_input,
            outputs=output_text,
        )

    with gr.Tab("文本转语音"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="输入文本",
                    placeholder="请输入要转换为语音的文本...",
                    lines=3,
                )
                speaker_input = gr.Dropdown(
                    label="选择说话人(SFT模式)",
                    choices=[
                        "中文女", "中文男", "英文女", "英文男",
                        "粤语女", "粤语男", "日语男", "韩语女",
                    ],
                    value="中文女",
                )
                # Zero-shot voice-cloning options.
                with gr.Accordion("高级选项 - Zero-shot 声音克隆", open=False):
                    prompt_audio_input = gr.Audio(
                        label="上传提示音频(3-10秒)",
                        type="filepath",
                        sources=["upload"],
                    )
                    prompt_text_input = gr.Textbox(
                        label="提示文本(音频对应的文字)",
                        placeholder=DEFAULT_PROMPT_TEXT,
                        lines=2,
                    )
                    gr.Markdown(ZERO_SHOT_HELP_MD)
                tts_btn = gr.Button("生成语音", variant="primary")
            with gr.Column():
                audio_output = gr.Audio(label="生成的语音")
                tts_status = gr.Textbox(label="状态")
        tts_btn.click(
            fn=text_to_speech,
            inputs=[
                text_input,
                speaker_input,
                prompt_audio_input,
                prompt_text_input,
            ],
            outputs=[audio_output, tts_status],
        )

    with gr.Tab("关于"):
        gr.Markdown(ABOUT_MD)

if __name__ == "__main__":
    # NOTE(review): the theme is passed to launch() exactly as the original
    # did; Gradio releases before 6 expect it on gr.Blocks(theme=...) instead
    # - confirm against the pinned Gradio version.
    demo.launch(theme=theme)