|
|
import gradio as gr |
|
|
import torch |
|
|
import torchaudio |
|
|
import numpy as np |
|
|
import os |
|
|
import sys |
|
|
from pathlib import Path |
|
|
import tempfile |
|
|
import soundfile as sf |
|
|
|
|
|
|
|
|
# User-facing UI copy (kept in Chinese for end users of the demo).
title = "CosyVoice - 语音处理模型"
description = """
CosyVoice是一个先进的语音处理模型,支持语音识别、语音合成等功能。

上传音频文件或使用麦克风录制语音,体验CosyVoice的强大功能。
"""

# Lazily-populated model handle; see load_cosyvoice_model().
cosyvoice_model = None
# True once a load has been *attempted* (even if it failed), so the
# expensive download/initialization is never retried per request.
model_loaded = False
|
|
|
|
|
|
|
|
def load_cosyvoice_model():
    """Load the CosyVoice model once, trying several backends in order.

    Order of attempts:
      1. The official ``cosyvoice`` package (with its vendored Matcha-TTS).
      2. ``transformers`` AutoModel with ``trust_remote_code``.
      3. Weights-only download from the Hub (no inference engine -> demo mode).

    Returns:
        dict with keys 'model', 'type', 'has_inference', 'sample_rate' on
        success, or None when only demo mode is available. The result is
        memoized in the module globals.
    """
    global cosyvoice_model, model_loaded

    # Loading is attempted at most once; model_loaded is set even on
    # failure so later calls return immediately.
    if model_loaded:
        return cosyvoice_model

    print("\n" + "="*60)
    print("正在加载CosyVoice模型...")
    print("="*60)

    try:
        # --- Attempt 1: the official CosyVoice package --------------------
        print("\n尝试使用官方 CosyVoice 包...")
        try:
            # CosyVoice vendors Matcha-TTS; make it importable if present.
            third_party_path = os.path.join(os.path.dirname(__file__), 'third_party', 'Matcha-TTS')
            if os.path.exists(third_party_path):
                sys.path.insert(0, third_party_path)

            from cosyvoice.cli.cosyvoice import CosyVoice

            model_name = "FunAudioLLM/CosyVoice-300M"
            print(f"从 {model_name} 加载...")

            # Fetch (or reuse from cache) the checkpoint from the HF Hub.
            from huggingface_hub import snapshot_download
            model_dir = snapshot_download(repo_id=model_name, cache_dir="./models")

            cosyvoice = CosyVoice(model_dir=model_dir)

            cosyvoice_model = {
                'model': cosyvoice,
                'type': 'cosyvoice_official',
                'has_inference': True,
                # Fall back to 22.05 kHz if the model does not expose its rate.
                'sample_rate': getattr(cosyvoice, 'sample_rate', 22050)
            }
            model_loaded = True
            print("✓ 成功使用官方 CosyVoice 包加载模型")
            print("="*60 + "\n")
            return cosyvoice_model

        except ImportError as ie:
            # Package missing -> fall through to the transformers strategy.
            print(f"⚠ 官方 CosyVoice 包不可用: {ie}")
            print(" 尝试其他加载方式...")

        # --- Attempt 2: transformers AutoModel with remote code -----------
        print("\n尝试使用 transformers AutoModel...")
        try:
            from transformers import AutoModel

            model_name = "FunAudioLLM/CosyVoice-300M"
            print(f"从 {model_name} 加载...")

            # NOTE(review): trust_remote_code executes repository-provided
            # code; tolerated here only because the repo id is hard-coded.
            model = AutoModel.from_pretrained(
                model_name,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True
            )
            model.eval()

            # Probe for the CosyVoice-style inference entry points.
            has_inference_sft = hasattr(model, 'inference_sft')
            has_inference_zero_shot = hasattr(model, 'inference_zero_shot')
            has_inference_cross_lingual = hasattr(model, 'inference_cross_lingual')

            print(f"模型类型: {type(model).__name__}")
            print(f"推理方法:")
            print(f" - inference_sft: {has_inference_sft}")
            print(f" - inference_zero_shot: {has_inference_zero_shot}")
            print(f" - inference_cross_lingual: {has_inference_cross_lingual}")

            if has_inference_sft or has_inference_zero_shot:
                cosyvoice_model = {
                    'model': model,
                    'type': 'transformers',
                    'has_inference': True,
                    'sample_rate': getattr(model, 'sample_rate', 22050)
                }
                model_loaded = True
                print("✓ 成功使用 transformers 加载模型")
                print("="*60 + "\n")
                return cosyvoice_model
            else:
                # Weights loaded but unusable for TTS -> treat as failure so
                # we fall through to the download-only path below.
                print("⚠ 模型缺少必要的推理方法")
                raise ValueError("Model missing inference methods")

        except Exception as te:
            print(f"⚠ transformers 加载失败: {te}")
            import traceback
            traceback.print_exc()

        # --- Attempt 3: download raw files only (no inference engine) -----
        print("\n尝试下载模型文件...")
        from huggingface_hub import snapshot_download

        model_name = "FunAudioLLM/CosyVoice-300M"
        model_dir = snapshot_download(
            repo_id=model_name,
            allow_patterns=["*.pt", "*.pth", "*.bin", "*.json", "*.yaml", "*.txt", "*.safetensors"],
            cache_dir="./models"
        )

        print(f"✓ 模型文件已下载到: {model_dir}")
        print("\n⚠ 注意: 模型文件已下载,但无法加载推理引擎")
        print(" 建议:")
        print(" 1. 安装完整的 CosyVoice 包: pip install cosyvoice")
        print(" 2. 或在 Hugging Face Space 中使用演示模式")
        print("="*60 + "\n")

        # Demo mode; mark loading as done so we don't re-download per call.
        cosyvoice_model = None
        model_loaded = True
        return None

    except Exception as e:
        # Any unexpected failure (network, disk, ...) also ends in demo mode.
        print(f"✗ 模型加载失败: {e}")
        import traceback
        print(f"详细错误:\n{traceback.format_exc()}")

        print("\n⚠ 使用演示模式")
        print("提示: 要使用完整功能,请:")
        print(" 1. 确保网络连接正常")
        print(" 2. 确保有足够的磁盘空间(约2GB)")
        print(" 3. 安装 CosyVoice: pip install cosyvoice")
        print("="*60 + "\n")

        cosyvoice_model = None
        model_loaded = True
        return None
|
|
|
|
|
def process_audio(audio_file):
    """Inspect an uploaded/recorded audio clip and report its properties.

    CosyVoice is a TTS model, so this tab only validates and describes the
    audio; it does not perform real speech recognition.

    Args:
        audio_file: Either a ``(sample_rate, np.ndarray)`` tuple (Gradio
            ``type="numpy"``) or a filesystem path to an audio file.

    Returns:
        A human-readable status string (Chinese, user-facing).
    """
    if audio_file is None:
        return "请上传音频文件"

    try:
        # Ensure the model has been loaded (or demo mode engaged).
        model = load_cosyvoice_model()

        if isinstance(audio_file, tuple):
            # Gradio numpy input: (sample_rate, samples).
            sample_rate, audio_data = audio_file
        else:
            # Path input: decode with the module-level soundfile import
            # (the previous function-local `import soundfile as sf`
            # redundantly shadowed it).
            audio_data, sample_rate = sf.read(audio_file)

        duration = len(audio_data) / sample_rate

        if model is not None:
            try:
                result = f"""
✓ 音频处理成功

音频信息:
- 采样率: {sample_rate} Hz
- 时长: {duration:.2f} 秒
- 数据形状: {audio_data.shape}
- 数据类型: {audio_data.dtype}

模型状态: CosyVoice模型已加载
注意: CosyVoice主要用于语音合成(TTS),如需语音识别请使用ASR模型
"""
                return result
            except Exception as e:
                return f"模型推理失败: {str(e)}"
        else:
            result = f"""
音频信息:
- 采样率: {sample_rate} Hz
- 时长: {duration:.2f} 秒
- 数据点数: {len(audio_data)}

⚠ 演示模式(模型未加载)
提示: 请确保安装CosyVoice模型以使用完整功能
"""
            return result
    except Exception as e:
        return f"处理失败: {str(e)}"
|
|
|
|
|
def text_to_speech(text, speaker="中文女", prompt_audio=None, prompt_text=None):
    """Synthesize speech from text with CosyVoice.

    Args:
        text: Text to synthesize.
        speaker: Speaker preset name (used in SFT mode).
        prompt_audio: Path to a reference clip (enables zero-shot cloning).
        prompt_text: Transcript of the reference clip (zero-shot mode).

    Returns:
        Tuple of (audio, status): ``audio`` is a ``(sample_rate, np.ndarray)``
        pair or ``None`` on hard failure; ``status`` is a user-facing message.
        Falls back to :func:`generate_demo_audio` when no backend works.
    """
    if not text or text.strip() == "":
        return None, "请输入要转换的文本"

    try:
        model = load_cosyvoice_model()

        if model is not None:
            try:
                if isinstance(model, dict):
                    model_type = model.get('type', 'unknown')

                    if model_type == 'cosyvoice_official':
                        # Official API path: prefer zero-shot cloning when a
                        # prompt clip is available, else speaker-preset SFT.
                        cosyvoice = model['model']
                        sample_rate = model.get('sample_rate', 22050)

                        print(f"使用官方CosyVoice API: text={text[:50]}...")

                        try:
                            if prompt_text is None:
                                # Default prompt transcript from the CosyVoice demos.
                                prompt_text = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。'

                            if prompt_audio is None:
                                # Look for a bundled demo prompt clip on disk.
                                possible_prompt_paths = [
                                    './asset/zero_shot_prompt.wav',
                                    './CosyVoice/asset/zero_shot_prompt.wav',
                                    './zero_shot_prompt.wav'
                                ]
                                for path in possible_prompt_paths:
                                    if os.path.exists(path):
                                        prompt_audio = path
                                        print(f"使用提示音频: {path}")
                                        break

                            if prompt_audio and os.path.exists(prompt_audio):
                                # Zero-shot: clone the prompt clip's voice.
                                print(f"使用 inference_zero_shot: text={text[:30]}, prompt={prompt_text[:50]}")
                                audio_chunks = []
                                for i, output in enumerate(cosyvoice.inference_zero_shot(
                                    text,
                                    prompt_text,
                                    prompt_audio,
                                    stream=False
                                )):
                                    # Chunks may be dicts ({'tts_speech': tensor}) or raw tensors.
                                    if isinstance(output, dict) and 'tts_speech' in output:
                                        audio_chunks.append(output['tts_speech'])
                                    else:
                                        audio_chunks.append(output)

                                if audio_chunks:
                                    # Concatenate along time, then flatten to 1-D mono.
                                    if torch.is_tensor(audio_chunks[0]):
                                        audio_data = torch.cat(audio_chunks, dim=-1).cpu().numpy()
                                    else:
                                        audio_data = np.concatenate(audio_chunks, axis=-1)

                                    if audio_data.ndim > 1:
                                        audio_data = audio_data.flatten()

                                    audio_tuple = (sample_rate, audio_data.astype(np.float32))
                                    return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n模式: Zero-shot\n模型: CosyVoice (官方API)"

                            else:
                                # SFT: use the selected built-in speaker preset.
                                print(f"使用 inference_sft: text={text[:30]}, speaker={speaker}")

                                audio_chunks = []
                                for i, output in enumerate(cosyvoice.inference_sft(text, speaker, stream=False)):
                                    if isinstance(output, dict) and 'tts_speech' in output:
                                        audio_chunks.append(output['tts_speech'])
                                    else:
                                        audio_chunks.append(output)

                                if audio_chunks:
                                    if torch.is_tensor(audio_chunks[0]):
                                        audio_data = torch.cat(audio_chunks, dim=-1).cpu().numpy()
                                    else:
                                        audio_data = np.concatenate(audio_chunks, axis=-1)

                                    if audio_data.ndim > 1:
                                        audio_data = audio_data.flatten()

                                    audio_tuple = (sample_rate, audio_data.astype(np.float32))
                                    return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (官方API - SFT)"

                        except Exception as e:
                            print(f"CosyVoice API 调用失败: {e}")
                            import traceback
                            traceback.print_exc()

                            return generate_demo_audio(text, speaker, error=f"API调用失败: {str(e)}")

                    elif model_type == 'transformers':
                        # transformers path: probe whichever inference method
                        # the remote-code model exposes, in preference order.
                        tts_model = model['model']

                        with torch.no_grad():
                            if hasattr(tts_model, 'inference_sft'):
                                print(f"使用inference_sft方法: text={text}, speaker={speaker}")
                                outputs = tts_model.inference_sft(text, speaker)

                                # Normalize the many possible return shapes to an array.
                                if isinstance(outputs, dict):
                                    if 'tts_speech' in outputs:
                                        audio_data = outputs['tts_speech']
                                    elif 'audio' in outputs:
                                        audio_data = outputs['audio']
                                    else:
                                        # Unknown dict key: take the first value.
                                        audio_data = next(iter(outputs.values()))
                                elif isinstance(outputs, (list, tuple)):
                                    audio_data = outputs[0]
                                else:
                                    audio_data = outputs

                                if torch.is_tensor(audio_data):
                                    audio_data = audio_data.cpu().numpy()

                                if audio_data.ndim > 1:
                                    audio_data = audio_data.flatten()

                                # assumes 22.05 kHz output — TODO confirm per model
                                sample_rate = 22050
                                audio_tuple = (sample_rate, audio_data.astype(np.float32))
                                return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"

                            elif hasattr(tts_model, 'inference'):
                                print(f"使用inference方法: text={text}, speaker={speaker}")
                                outputs = tts_model.inference(text, speaker)

                                if torch.is_tensor(outputs):
                                    audio_data = outputs.cpu().numpy()
                                else:
                                    audio_data = outputs

                                if audio_data.ndim > 1:
                                    audio_data = audio_data.flatten()

                                sample_rate = 22050
                                audio_tuple = (sample_rate, audio_data.astype(np.float32))
                                return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"

                            elif hasattr(tts_model, 'generate'):
                                print(f"使用generate方法: text={text}")

                                inputs = {"text": text, "speaker": speaker}
                                outputs = tts_model.generate(**inputs)

                                if torch.is_tensor(outputs):
                                    audio_data = outputs.cpu().numpy()
                                elif isinstance(outputs, dict):
                                    audio_data = outputs.get('audio', outputs.get('waveform', next(iter(outputs.values()))))
                                    if torch.is_tensor(audio_data):
                                        audio_data = audio_data.cpu().numpy()
                                else:
                                    audio_data = outputs

                                if audio_data.ndim > 1:
                                    audio_data = audio_data.flatten()

                                sample_rate = 22050
                                audio_tuple = (sample_rate, audio_data.astype(np.float32))
                                return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"

                            else:
                                # No usable entry point: fall back to demo audio.
                                print(f"模型没有可用的推理方法")
                                print(f"可用方法: {[m for m in dir(tts_model) if not m.startswith('_')][:20]}")
                                return generate_demo_audio(text, speaker, error="模型缺少推理方法 (inference_sft/inference/generate)")

                    elif model_type == 'pytorch':
                        # Raw PyTorch checkpoint path.
                        pytorch_model = model['model']

                        try:
                            if hasattr(pytorch_model, 'inference_sft'):
                                output = pytorch_model.inference_sft(text, speaker)
                            elif hasattr(pytorch_model, 'inference'):
                                output = pytorch_model.inference(text, speaker)
                            else:
                                return generate_demo_audio(text, speaker, error="PyTorch模型缺少推理方法")

                            if isinstance(output, dict) and 'tts_speech' in output:
                                audio_data = output['tts_speech']
                                if torch.is_tensor(audio_data):
                                    audio_data = audio_data.cpu().numpy()
                            elif torch.is_tensor(output):
                                audio_data = output.cpu().numpy()
                            else:
                                audio_data = output

                            sample_rate = 22050
                            audio_tuple = (sample_rate, audio_data.astype(np.float32))
                            return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: PyTorch"
                        except Exception as e:
                            return generate_demo_audio(text, speaker, error=f"PyTorch推理失败: {str(e)}")

                    elif model_type == 'downloaded':
                        # Weights on disk but no engine: demo audio only.
                        return generate_demo_audio(text, speaker)

                    else:
                        return generate_demo_audio(text, speaker)

                elif hasattr(model, 'inference_sft'):
                    # Non-dict model object exposing the official API directly.
                    output = model.inference_sft(text, speaker)

                    if isinstance(output, dict) and 'tts_speech' in output:
                        audio_data = output['tts_speech'].cpu().numpy()
                        sample_rate = 22050
                    else:
                        audio_data = output
                        sample_rate = 22050

                    audio_tuple = (sample_rate, audio_data.astype(np.float32))
                    return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice官方"

                else:
                    if hasattr(model, 'inference'):
                        output = model.inference(text, speaker)
                        audio_data = output if isinstance(output, np.ndarray) else output.cpu().numpy()
                        sample_rate = 22050
                        audio_tuple = (sample_rate, audio_data.astype(np.float32))
                        return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}"
                    else:
                        return generate_demo_audio(text, speaker)

            except Exception as e:
                print(f"模型推理错误: {str(e)}")

                return generate_demo_audio(text, speaker, error=str(e))
        else:
            # Model never loaded: always demo mode.
            return generate_demo_audio(text, speaker)

    except Exception as e:
        return None, f"语音合成失败: {str(e)}"
|
|
|
|
|
def generate_demo_audio(text, speaker, error=None):
    """Synthesize a placeholder sine-wave clip when the real model is unavailable.

    Args:
        text: Requested text; only its length drives the clip duration.
        speaker: Speaker label, echoed into the status message.
        error: Optional error description to surface in the status message.

    Returns:
        ``((sample_rate, np.ndarray), status_message)`` matching the shape of
        a real TTS result so callers can return it unchanged.
    """
    sample_rate = 22050
    # 0.2 s per character, clamped to [0.2, 5.0] s so even empty text
    # yields a non-empty, fade-able clip.
    duration = min(max(len(text), 1) * 0.2, 5.0)
    t = np.linspace(0, duration, int(sample_rate * duration), False)

    # Two-harmonic tone (A4 plus its fifth) — clearly synthetic on purpose.
    frequency = 440
    audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)
    audio_data += 0.2 * np.sin(2 * np.pi * frequency * 1.5 * t)

    # 100 ms linear fade in/out, clamped so the two fades never overlap
    # (and so slicing stays valid for very short clips).
    fade_samples = min(int(sample_rate * 0.1), len(audio_data) // 2)
    if fade_samples > 0:
        audio_data[:fade_samples] *= np.linspace(0, 1, fade_samples)
        audio_data[-fade_samples:] *= np.linspace(1, 0, fade_samples)

    audio_tuple = (sample_rate, audio_data.astype(np.float32))

    status_msg = f"⚠ 演示模式\n文本: {text}\n说话人: {speaker}\n"
    if error:
        status_msg += f"错误: {error}\n"
    status_msg += "提示: 这是演示音频,不是真实的语音合成结果。请确保模型正确加载。"

    return audio_tuple, status_msg
|
|
|
|
|
|
|
|
# Attempt the (memoized) model load once at startup so the UI can show
# an accurate status banner.
load_cosyvoice_model()

# Status banner text; model_status_color is currently informational only.
if cosyvoice_model is not None:
    model_status_msg = "✓ CosyVoice模型已成功加载"
    model_status_color = "green"
else:
    model_status_msg = "⚠ 演示模式(模型未加载)"
    model_status_color = "orange"

# Use the Soft theme when this Gradio version provides gr.themes.
try:
    theme = gr.themes.Soft()
except Exception:  # narrowed from bare `except:` so Ctrl-C/SystemExit propagate
    theme = None
|
|
|
|
|
# Build the Gradio UI. BUG FIX: `theme` must be passed to gr.Blocks();
# it is not a valid argument of Blocks.launch() (the old
# `demo.launch(theme=theme)` call fails on current Gradio versions).
with gr.Blocks(theme=theme) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    # Startup status banner (green = model loaded, yellow = demo mode).
    if cosyvoice_model is not None:
        status_emoji = "✅"
        status_text = "CosyVoice模型已成功加载并可用"
        status_style = "background-color: #d4edda; padding: 10px; border-radius: 5px; border-left: 4px solid #28a745;"
    else:
        status_emoji = "⚠️"
        status_text = "演示模式 - 模型未加载。要使用完整功能,请安装CosyVoice模型。"
        status_style = "background-color: #fff3cd; padding: 10px; border-radius: 5px; border-left: 4px solid #ffc107;"

    gr.HTML(f'<div style="{status_style}"><strong>{status_emoji} 模型状态:</strong> {status_text}</div>')

    # --- Tab 1: audio inspection ("speech recognition") -------------------
    with gr.Tab("语音识别"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="上传音频文件或录制语音",
                    type="numpy",
                    sources=["upload", "microphone"]
                )
                process_btn = gr.Button("处理音频", variant="primary")

            with gr.Column():
                output_text = gr.Textbox(
                    label="识别结果",
                    lines=5,
                    placeholder="识别结果将显示在这里..."
                )

        process_btn.click(
            fn=process_audio,
            inputs=audio_input,
            outputs=output_text
        )

    # --- Tab 2: text-to-speech --------------------------------------------
    with gr.Tab("文本转语音"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="输入文本",
                    placeholder="请输入要转换为语音的文本...",
                    lines=3
                )
                speaker_input = gr.Dropdown(
                    label="选择说话人(SFT模式)",
                    choices=["中文女", "中文男", "英文女", "英文男", "粤语女", "粤语男", "日语男", "韩语女"],
                    value="中文女"
                )

                with gr.Accordion("高级选项 - Zero-shot 声音克隆", open=False):
                    prompt_audio_input = gr.Audio(
                        label="上传提示音频(3-10秒)",
                        type="filepath",
                        sources=["upload"]
                    )
                    prompt_text_input = gr.Textbox(
                        label="提示文本(音频对应的文字)",
                        placeholder="You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。",
                        lines=2
                    )
                    gr.Markdown("""
                    **使用说明:**
                    - 上传一段3-10秒的参考音频
                    - 输入音频对应的文字内容
                    - 格式:`You are a helpful assistant.<|endofprompt|>音频对应的文字`
                    - 系统将克隆该音频的音色来合成新文本
                    """)

                tts_btn = gr.Button("生成语音", variant="primary")

            with gr.Column():
                audio_output = gr.Audio(label="生成的语音")
                tts_status = gr.Textbox(label="状态")

        tts_btn.click(
            fn=text_to_speech,
            inputs=[text_input, speaker_input, prompt_audio_input, prompt_text_input],
            outputs=[audio_output, tts_status]
        )

    # --- Tab 3: about -------------------------------------------------------
    with gr.Tab("关于"):
        gr.Markdown("""
        ## CosyVoice 模型

        CosyVoice是一个先进的语音处理模型,具有以下特点:

        - 高质量的语音识别
        - 自然的语音合成
        - 多语言支持
        - 实时处理能力

        ### 使用方法
        1. 在"语音识别"标签页上传音频文件进行识别
        2. 在"文本转语音"标签页输入文本生成语音
        3. 支持麦克风实时录制

        ### 技术特性
        - 基于Transformer架构
        - 支持多种音频格式
        - 高精度识别和合成
        """)


if __name__ == "__main__":
    # Theme is applied at Blocks construction above; launch() takes no theme.
    demo.launch()