# CosyVoice / app.py
# jerrybwang
# 33
# 2bc8444
import gradio as gr
import torch
import torchaudio
import numpy as np
import os
import sys
from pathlib import Path
import tempfile
import soundfile as sf
# Title and description shown at the top of the Gradio UI.
title = "CosyVoice - 语音处理模型"
description = """
CosyVoice是一个先进的语音处理模型,支持语音识别、语音合成等功能。
上传音频文件或使用麦克风录制语音,体验CosyVoice的强大功能。
"""
# Module-level cache for the loaded model (populated by load_cosyvoice_model).
cosyvoice_model = None
# True once a load attempt has finished (even if it failed) -- prevents retries.
model_loaded = False
# Initialize the CosyVoice model.
def load_cosyvoice_model():
    """Load the CosyVoice model, trying several strategies in order.

    Strategy 1: the official ``cosyvoice`` package (preferred).
    Strategy 2: ``transformers.AutoModel`` with ``trust_remote_code``.
    Strategy 3: download the raw weights only (demo mode, no inference).

    Caches the result in the module globals ``cosyvoice_model`` /
    ``model_loaded`` so repeated calls are cheap.

    Returns:
        A dict describing the loaded model (keys: 'model', 'type',
        'has_inference', 'sample_rate'), or None when only demo mode is
        available.
    """
    global cosyvoice_model, model_loaded
    # Return the cached result if a load attempt already completed
    # (note: this also caches a failed attempt as None).
    if model_loaded:
        return cosyvoice_model
    print("\n" + "="*60)
    print("正在加载CosyVoice模型...")
    print("="*60)
    try:
        # Strategy 1: try the official CosyVoice package.
        print("\n尝试使用官方 CosyVoice 包...")
        try:
            # Add the third_party path (if present) so the Matcha-TTS
            # vendored dependency resolves.
            third_party_path = os.path.join(os.path.dirname(__file__), 'third_party', 'Matcha-TTS')
            if os.path.exists(third_party_path):
                sys.path.insert(0, third_party_path)
            from cosyvoice.cli.cosyvoice import CosyVoice
            # Load from the Hugging Face Hub.
            model_name = "FunAudioLLM/CosyVoice-300M"
            print(f"从 {model_name} 加载...")
            # Download the checkpoint to a local cache directory.
            from huggingface_hub import snapshot_download
            model_dir = snapshot_download(repo_id=model_name, cache_dir="./models")
            # Instantiate the official CosyVoice engine.
            cosyvoice = CosyVoice(model_dir=model_dir)
            cosyvoice_model = {
                'model': cosyvoice,
                'type': 'cosyvoice_official',
                'has_inference': True,
                # 22050 Hz fallback if the engine does not expose sample_rate.
                'sample_rate': getattr(cosyvoice, 'sample_rate', 22050)
            }
            model_loaded = True
            print("✓ 成功使用官方 CosyVoice 包加载模型")
            print("="*60 + "\n")
            return cosyvoice_model
        except ImportError as ie:
            # NOTE(review): only ImportError falls through to strategy 2; a
            # runtime failure inside CosyVoice(...) jumps to the outer
            # handler and skips the remaining strategies -- confirm intended.
            print(f"⚠ 官方 CosyVoice 包不可用: {ie}")
            print(" 尝试其他加载方式...")
        # Strategy 2: transformers AutoModel (needs trust_remote_code).
        print("\n尝试使用 transformers AutoModel...")
        try:
            from transformers import AutoModel
            model_name = "FunAudioLLM/CosyVoice-300M"
            print(f"从 {model_name} 加载...")
            # trust_remote_code=True lets the Hub repo supply the model class.
            model = AutoModel.from_pretrained(
                model_name,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True
            )
            model.eval()
            # Probe which CosyVoice-style inference entry points exist.
            has_inference_sft = hasattr(model, 'inference_sft')
            has_inference_zero_shot = hasattr(model, 'inference_zero_shot')
            has_inference_cross_lingual = hasattr(model, 'inference_cross_lingual')
            print(f"模型类型: {type(model).__name__}")
            print(f"推理方法:")
            print(f" - inference_sft: {has_inference_sft}")
            print(f" - inference_zero_shot: {has_inference_zero_shot}")
            print(f" - inference_cross_lingual: {has_inference_cross_lingual}")
            if has_inference_sft or has_inference_zero_shot:
                cosyvoice_model = {
                    'model': model,
                    'type': 'transformers',
                    'has_inference': True,
                    'sample_rate': getattr(model, 'sample_rate', 22050)
                }
                model_loaded = True
                print("✓ 成功使用 transformers 加载模型")
                print("="*60 + "\n")
                return cosyvoice_model
            else:
                # Raise so this falls into the handler below and proceeds
                # to strategy 3.
                print("⚠ 模型缺少必要的推理方法")
                raise ValueError("Model missing inference methods")
        except Exception as te:
            print(f"⚠ transformers 加载失败: {te}")
            import traceback
            traceback.print_exc()
        # Strategy 3: download the model files only (demo mode).
        print("\n尝试下载模型文件...")
        from huggingface_hub import snapshot_download
        model_name = "FunAudioLLM/CosyVoice-300M"
        model_dir = snapshot_download(
            repo_id=model_name,
            allow_patterns=["*.pt", "*.pth", "*.bin", "*.json", "*.yaml", "*.txt", "*.safetensors"],
            cache_dir="./models"
        )
        print(f"✓ 模型文件已下载到: {model_dir}")
        print("\n⚠ 注意: 模型文件已下载,但无法加载推理引擎")
        print(" 建议:")
        print(" 1. 安装完整的 CosyVoice 包: pip install cosyvoice")
        print(" 2. 或在 Hugging Face Space 中使用演示模式")
        print("="*60 + "\n")
        # Weights exist on disk but no engine: mark as demo mode.
        cosyvoice_model = None
        model_loaded = True
        return None
    except Exception as e:
        # Any unrecoverable failure (network, disk, etc.): demo mode.
        print(f"✗ 模型加载失败: {e}")
        import traceback
        print(f"详细错误:\n{traceback.format_exc()}")
        print("\n⚠ 使用演示模式")
        print("提示: 要使用完整功能,请:")
        print(" 1. 确保网络连接正常")
        print(" 2. 确保有足够的磁盘空间(约2GB)")
        print(" 3. 安装 CosyVoice: pip install cosyvoice")
        print("="*60 + "\n")
        cosyvoice_model = None
        model_loaded = True
        return None
def process_audio(audio_file):
    """Inspect an uploaded/recorded audio clip and report its properties.

    Note: CosyVoice is a TTS model, so this tab only reports audio metadata
    and the model's load status; it does not perform real speech recognition.

    Args:
        audio_file: Either a (sample_rate, np.ndarray) tuple (Gradio "numpy"
            audio type) or a file path string; None when nothing was provided.

    Returns:
        A human-readable result/status string.
    """
    if audio_file is None:
        return "请上传音频文件"
    try:
        # Ensure the (cached) model is loaded so we can report its status.
        model = load_cosyvoice_model()
        # Gradio's "numpy" audio type yields (sample_rate, data); otherwise
        # we were given a file path -- read it with the module-level
        # soundfile import (the original re-imported soundfile here, which
        # shadowed the top-level import for no benefit).
        if isinstance(audio_file, tuple):
            sample_rate, audio_data = audio_file
        else:
            audio_data, sample_rate = sf.read(audio_file)
        duration = len(audio_data) / sample_rate
        # With a loaded model, report full metadata.
        if model is not None:
            try:
                # CosyVoice is primarily a TTS model; this demonstrates
                # audio handling rather than actual recognition.
                result = f"""
✓ 音频处理成功
音频信息:
- 采样率: {sample_rate} Hz
- 时长: {duration:.2f}
- 数据形状: {audio_data.shape}
- 数据类型: {audio_data.dtype}
模型状态: CosyVoice模型已加载
注意: CosyVoice主要用于语音合成(TTS),如需语音识别请使用ASR模型
"""
                return result
            except Exception as e:
                return f"模型推理失败: {str(e)}"
        else:
            # Demo mode: model unavailable, report basic info only.
            result = f"""
音频信息:
- 采样率: {sample_rate} Hz
- 时长: {duration:.2f}
- 数据点数: {len(audio_data)}
⚠ 演示模式(模型未加载)
提示: 请确保安装CosyVoice模型以使用完整功能
"""
            return result
    except Exception as e:
        return f"处理失败: {str(e)}"
def text_to_speech(text, speaker="中文女", prompt_audio=None, prompt_text=None):
    """Synthesize speech from text with CosyVoice, degrading to demo audio.

    Dispatches on how the model was loaded: the official CosyVoice API
    (zero-shot cloning when a prompt audio is available, otherwise SFT with
    a pre-trained speaker), a transformers-loaded model, or a raw PyTorch
    model. Any failure falls back to generate_demo_audio().

    Args:
        text: Text to synthesize.
        speaker: Pre-trained speaker label (used in SFT mode).
        prompt_audio: Prompt audio file path (enables zero-shot mode).
        prompt_text: Transcript of the prompt audio (zero-shot mode).

    Returns:
        Tuple of ((sample_rate, waveform) or None, status message string).
    """
    if not text or text.strip() == "":
        return None, "请输入要转换的文本"
    try:
        # Load (or retrieve the cached) model.
        model = load_cosyvoice_model()
        # Use real inference when a model is available.
        if model is not None:
            try:
                # The loader wraps loaded models in a dict tagged with 'type'.
                if isinstance(model, dict):
                    model_type = model.get('type', 'unknown')
                    # Path 1: official CosyVoice package API.
                    if model_type == 'cosyvoice_official':
                        cosyvoice = model['model']
                        sample_rate = model.get('sample_rate', 22050)
                        print(f"使用官方CosyVoice API: text={text[:50]}...")
                        # Prefer inference_zero_shot (voice cloning) when a
                        # prompt audio can be found; otherwise inference_sft.
                        try:
                            # Default prompt text -- presumably the official
                            # "<|endofprompt|>" format; confirm against the
                            # CosyVoice documentation.
                            if prompt_text is None:
                                prompt_text = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。'
                            # Without an uploaded prompt, look for a bundled
                            # sample prompt audio in known locations.
                            if prompt_audio is None:
                                possible_prompt_paths = [
                                    './asset/zero_shot_prompt.wav',
                                    './CosyVoice/asset/zero_shot_prompt.wav',
                                    './zero_shot_prompt.wav'
                                ]
                                for path in possible_prompt_paths:
                                    if os.path.exists(path):
                                        prompt_audio = path
                                        print(f"使用提示音频: {path}")
                                        break
                            # Zero-shot cloning path.
                            if prompt_audio and os.path.exists(prompt_audio):
                                print(f"使用 inference_zero_shot: text={text[:30]}, prompt={prompt_text[:50]}")
                                audio_chunks = []
                                # The generator yields either dicts with a
                                # 'tts_speech' tensor or raw chunks.
                                for i, output in enumerate(cosyvoice.inference_zero_shot(
                                    text,
                                    prompt_text,
                                    prompt_audio,
                                    stream=False
                                )):
                                    if isinstance(output, dict) and 'tts_speech' in output:
                                        audio_chunks.append(output['tts_speech'])
                                    else:
                                        audio_chunks.append(output)
                                if audio_chunks:
                                    # Concatenate chunks along the time axis.
                                    if torch.is_tensor(audio_chunks[0]):
                                        audio_data = torch.cat(audio_chunks, dim=-1).cpu().numpy()
                                    else:
                                        audio_data = np.concatenate(audio_chunks, axis=-1)
                                    if audio_data.ndim > 1:
                                        audio_data = audio_data.flatten()
                                    audio_tuple = (sample_rate, audio_data.astype(np.float32))
                                    return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n模式: Zero-shot\n模型: CosyVoice (官方API)"
                                # NOTE(review): if no chunks were produced,
                                # control falls out of this branch and the
                                # function implicitly returns None -- likely
                                # unintended; Gradio would receive no status.
                            # SFT path with a pre-trained speaker.
                            else:
                                print(f"使用 inference_sft: text={text[:30]}, speaker={speaker}")
                                # Speaker IDs supported by CosyVoice-300M may
                                # differ per checkpoint; adjust as needed.
                                audio_chunks = []
                                for i, output in enumerate(cosyvoice.inference_sft(text, speaker, stream=False)):
                                    if isinstance(output, dict) and 'tts_speech' in output:
                                        audio_chunks.append(output['tts_speech'])
                                    else:
                                        audio_chunks.append(output)
                                if audio_chunks:
                                    if torch.is_tensor(audio_chunks[0]):
                                        audio_data = torch.cat(audio_chunks, dim=-1).cpu().numpy()
                                    else:
                                        audio_data = np.concatenate(audio_chunks, axis=-1)
                                    if audio_data.ndim > 1:
                                        audio_data = audio_data.flatten()
                                    audio_tuple = (sample_rate, audio_data.astype(np.float32))
                                    return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (官方API - SFT)"
                        except Exception as e:
                            print(f"CosyVoice API 调用失败: {e}")
                            import traceback
                            traceback.print_exc()
                            # Degrade to demo audio with the error attached.
                            return generate_demo_audio(text, speaker, error=f"API调用失败: {str(e)}")
                    elif model_type == 'transformers':
                        # Path 2: transformers AutoModel; probe the known
                        # inference entry points in order of preference.
                        tts_model = model['model']
                        with torch.no_grad():
                            # Option 1: standard CosyVoice inference_sft.
                            if hasattr(tts_model, 'inference_sft'):
                                print(f"使用inference_sft方法: text={text}, speaker={speaker}")
                                outputs = tts_model.inference_sft(text, speaker)
                                # Normalize the output into a waveform array.
                                if isinstance(outputs, dict):
                                    if 'tts_speech' in outputs:
                                        audio_data = outputs['tts_speech']
                                    elif 'audio' in outputs:
                                        audio_data = outputs['audio']
                                    else:
                                        # Fall back to the dict's first value.
                                        audio_data = next(iter(outputs.values()))
                                elif isinstance(outputs, (list, tuple)):
                                    audio_data = outputs[0]
                                else:
                                    audio_data = outputs
                                # Convert tensors to numpy.
                                if torch.is_tensor(audio_data):
                                    audio_data = audio_data.cpu().numpy()
                                # Flatten to 1-D for Gradio playback.
                                if audio_data.ndim > 1:
                                    audio_data = audio_data.flatten()
                                sample_rate = 22050
                                audio_tuple = (sample_rate, audio_data.astype(np.float32))
                                return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"
                            # Option 2: a generic inference method.
                            elif hasattr(tts_model, 'inference'):
                                print(f"使用inference方法: text={text}, speaker={speaker}")
                                outputs = tts_model.inference(text, speaker)
                                if torch.is_tensor(outputs):
                                    audio_data = outputs.cpu().numpy()
                                else:
                                    audio_data = outputs
                                if audio_data.ndim > 1:
                                    audio_data = audio_data.flatten()
                                sample_rate = 22050
                                audio_tuple = (sample_rate, audio_data.astype(np.float32))
                                return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"
                            # Option 3: a generate method.
                            elif hasattr(tts_model, 'generate'):
                                print(f"使用generate方法: text={text}")
                                # Build keyword inputs for generate().
                                inputs = {"text": text, "speaker": speaker}
                                outputs = tts_model.generate(**inputs)
                                if torch.is_tensor(outputs):
                                    audio_data = outputs.cpu().numpy()
                                elif isinstance(outputs, dict):
                                    audio_data = outputs.get('audio', outputs.get('waveform', next(iter(outputs.values()))))
                                    if torch.is_tensor(audio_data):
                                        audio_data = audio_data.cpu().numpy()
                                else:
                                    audio_data = outputs
                                if audio_data.ndim > 1:
                                    audio_data = audio_data.flatten()
                                sample_rate = 22050
                                audio_tuple = (sample_rate, audio_data.astype(np.float32))
                                return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"
                            else:
                                # No usable inference entry point found.
                                print(f"模型没有可用的推理方法")
                                print(f"可用方法: {[m for m in dir(tts_model) if not m.startswith('_')][:20]}")
                                return generate_demo_audio(text, speaker, error="模型缺少推理方法 (inference_sft/inference/generate)")
                    elif model_type == 'pytorch':
                        # Path 3: raw PyTorch model.
                        # NOTE(review): the loader in this file only ever sets
                        # type 'cosyvoice_official' or 'transformers'; the
                        # 'pytorch' and 'downloaded' branches look unreachable
                        # as written -- confirm.
                        pytorch_model = model['model']
                        # Attempt inference with whichever method exists.
                        try:
                            if hasattr(pytorch_model, 'inference_sft'):
                                output = pytorch_model.inference_sft(text, speaker)
                            elif hasattr(pytorch_model, 'inference'):
                                output = pytorch_model.inference(text, speaker)
                            else:
                                # No inference entry point: demo mode.
                                return generate_demo_audio(text, speaker, error="PyTorch模型缺少推理方法")
                            # Normalize the output into a numpy waveform.
                            if isinstance(output, dict) and 'tts_speech' in output:
                                audio_data = output['tts_speech']
                                if torch.is_tensor(audio_data):
                                    audio_data = audio_data.cpu().numpy()
                            elif torch.is_tensor(output):
                                audio_data = output.cpu().numpy()
                            else:
                                audio_data = output
                            sample_rate = 22050
                            audio_tuple = (sample_rate, audio_data.astype(np.float32))
                            return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: PyTorch"
                        except Exception as e:
                            return generate_demo_audio(text, speaker, error=f"PyTorch推理失败: {str(e)}")
                    elif model_type == 'downloaded':
                        # Weights downloaded but no engine loaded: demo mode.
                        return generate_demo_audio(text, speaker)
                    else:
                        return generate_demo_audio(text, speaker)
                elif hasattr(model, 'inference_sft'):
                    # Bare (non-dict) model object exposing the official API.
                    output = model.inference_sft(text, speaker)
                    # Normalize the output format.
                    if isinstance(output, dict) and 'tts_speech' in output:
                        audio_data = output['tts_speech'].cpu().numpy()
                        sample_rate = 22050
                    else:
                        audio_data = output
                        sample_rate = 22050
                    audio_tuple = (sample_rate, audio_data.astype(np.float32))
                    return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice官方"
                else:
                    # Last-resort probe for a generic inference method.
                    if hasattr(model, 'inference'):
                        output = model.inference(text, speaker)
                        audio_data = output if isinstance(output, np.ndarray) else output.cpu().numpy()
                        sample_rate = 22050
                        audio_tuple = (sample_rate, audio_data.astype(np.float32))
                        return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}"
                    else:
                        # Demo mode.
                        return generate_demo_audio(text, speaker)
            except Exception as e:
                print(f"模型推理错误: {str(e)}")
                # Any inference error degrades to demo audio.
                return generate_demo_audio(text, speaker, error=str(e))
        else:
            # No model available: demo mode.
            return generate_demo_audio(text, speaker)
    except Exception as e:
        return None, f"语音合成失败: {str(e)}"
def generate_demo_audio(text, speaker, error=None):
    """Generate a placeholder sine-wave clip (used when the model is unavailable).

    Args:
        text: Text the user asked to synthesize; only its length shapes the
            clip (0.2 s per character, capped at 5 s).
        speaker: Speaker label, echoed into the status message.
        error: Optional error string to include in the status message.

    Returns:
        ((sample_rate, np.float32 waveform), status message) tuple matching
        the text_to_speech output contract.
    """
    sample_rate = 22050
    # Clip length scales with the text length, capped at 5 seconds.
    duration = min(len(text) * 0.2, 5.0)
    t = np.linspace(0, duration, int(sample_rate * duration), False)
    frequency = 440
    # Base tone plus a softer 1.5x harmonic for a less harsh beep.
    audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)
    audio_data += 0.2 * np.sin(2 * np.pi * frequency * 1.5 * t)
    # Linear fade-in/out over 0.1 s. Guard against signals shorter than the
    # fade window (e.g. empty text), which previously raised a NumPy
    # broadcasting error when multiplying an empty slice by the ramp.
    fade_samples = int(sample_rate * 0.1)
    if 0 < fade_samples <= len(audio_data):
        audio_data[:fade_samples] *= np.linspace(0, 1, fade_samples)
        audio_data[-fade_samples:] *= np.linspace(1, 0, fade_samples)
    audio_tuple = (sample_rate, audio_data.astype(np.float32))
    status_msg = f"⚠ 演示模式\n文本: {text}\n说话人: {speaker}\n"
    if error:
        status_msg += f"错误: {error}\n"
    status_msg += "提示: 这是演示音频,不是真实的语音合成结果。请确保模型正确加载。"
    return audio_tuple, status_msg
# Eagerly attempt the model load at import/startup time so the UI can
# report its status immediately.
load_cosyvoice_model()
# Derive a status message from the load result.
# NOTE(review): model_status_msg / model_status_color are not referenced by
# the UI code below (it builds its own status banner) -- possibly dead.
if cosyvoice_model is not None:
    model_status_msg = "✓ CosyVoice模型已成功加载"
    model_status_color = "green"
else:
    model_status_msg = "⚠ 演示模式(模型未加载)"
    model_status_color = "orange"
# Build the Gradio theme; fall back to the default when the themes module
# is unavailable (older Gradio versions).
try:
    theme = gr.themes.Soft()
except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
    # not silently swallowed at import time.
    theme = None
# Build the Gradio UI. The theme must be applied here at construction time
# (the original built `theme` but never passed it to gr.Blocks).
with gr.Blocks(theme=theme) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    # Model status banner (green = loaded, yellow = demo mode).
    if cosyvoice_model is not None:
        status_emoji = "✅"
        status_text = "CosyVoice模型已成功加载并可用"
        status_style = "background-color: #d4edda; padding: 10px; border-radius: 5px; border-left: 4px solid #28a745;"
    else:
        status_emoji = "⚠️"
        status_text = "演示模式 - 模型未加载。要使用完整功能,请安装CosyVoice模型。"
        status_style = "background-color: #fff3cd; padding: 10px; border-radius: 5px; border-left: 4px solid #ffc107;"
    gr.HTML(f'<div style="{status_style}"><strong>{status_emoji} 模型状态:</strong> {status_text}</div>')
    # Tab 1: audio inspection ("speech recognition").
    with gr.Tab("语音识别"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="上传音频文件或录制语音",
                    type="numpy",
                    sources=["upload", "microphone"]
                )
                process_btn = gr.Button("处理音频", variant="primary")
            with gr.Column():
                output_text = gr.Textbox(
                    label="识别结果",
                    lines=5,
                    placeholder="识别结果将显示在这里..."
                )
        process_btn.click(
            fn=process_audio,
            inputs=audio_input,
            outputs=output_text
        )
    # Tab 2: text-to-speech (SFT speaker or zero-shot cloning).
    with gr.Tab("文本转语音"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="输入文本",
                    placeholder="请输入要转换为语音的文本...",
                    lines=3
                )
                speaker_input = gr.Dropdown(
                    label="选择说话人(SFT模式)",
                    choices=["中文女", "中文男", "英文女", "英文男", "粤语女", "粤语男", "日语男", "韩语女"],
                    value="中文女"
                )
                # Zero-shot cloning options (optional prompt audio + text).
                with gr.Accordion("高级选项 - Zero-shot 声音克隆", open=False):
                    prompt_audio_input = gr.Audio(
                        label="上传提示音频(3-10秒)",
                        type="filepath",
                        sources=["upload"]
                    )
                    prompt_text_input = gr.Textbox(
                        label="提示文本(音频对应的文字)",
                        placeholder="You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。",
                        lines=2
                    )
                    gr.Markdown("""
**使用说明:**
- 上传一段3-10秒的参考音频
- 输入音频对应的文字内容
- 格式:`You are a helpful assistant.<|endofprompt|>音频对应的文字`
- 系统将克隆该音频的音色来合成新文本
""")
                tts_btn = gr.Button("生成语音", variant="primary")
            with gr.Column():
                audio_output = gr.Audio(label="生成的语音")
                tts_status = gr.Textbox(label="状态")
        tts_btn.click(
            fn=text_to_speech,
            inputs=[text_input, speaker_input, prompt_audio_input, prompt_text_input],
            outputs=[audio_output, tts_status]
        )
    # Tab 3: static "about" page.
    with gr.Tab("关于"):
        gr.Markdown("""
## CosyVoice 模型
CosyVoice是一个先进的语音处理模型,具有以下特点:
- 高质量的语音识别
- 自然的语音合成
- 多语言支持
- 实时处理能力
### 使用方法
1. 在"语音识别"标签页上传音频文件进行识别
2. 在"文本转语音"标签页输入文本生成语音
3. 支持麦克风实时录制
### 技术特性
- 基于Transformer架构
- 支持多种音频格式
- 高精度识别和合成
""")
if __name__ == "__main__":
    # Blocks.launch() does not accept a `theme` keyword -- passing it raises
    # TypeError at startup. The theme belongs to gr.Blocks(...) at
    # construction time, so launch takes no arguments here.
    demo.launch()