jerrybwang committed on
Commit ·
6baad6c
1
Parent(s): 228e994
--other 更新代码
Browse files
app.py
CHANGED
|
@@ -36,14 +36,13 @@ def load_cosyvoice_model():
|
|
| 36 |
# 方法1: 使用transformers加载(推荐用于Hugging Face Space)
|
| 37 |
try:
|
| 38 |
print("\n[方法1] 尝试使用transformers加载...")
|
| 39 |
-
from transformers import AutoModel
|
| 40 |
import torch
|
| 41 |
|
| 42 |
model_name = "FunAudioLLM/CosyVoice-300M"
|
| 43 |
print(f" 从 {model_name} 加载...")
|
| 44 |
|
| 45 |
-
# CosyVoice需要trust_remote_code=True
|
| 46 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 47 |
model = AutoModel.from_pretrained(
|
| 48 |
model_name,
|
| 49 |
trust_remote_code=True,
|
|
@@ -53,10 +52,14 @@ def load_cosyvoice_model():
|
|
| 53 |
# 设置为评估模式
|
| 54 |
model.eval()
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
cosyvoice_model = {
|
| 57 |
-
'tokenizer': tokenizer,
|
| 58 |
'model': model,
|
| 59 |
-
'type': 'transformers'
|
|
|
|
| 60 |
}
|
| 61 |
model_loaded = True
|
| 62 |
print(" ✓ 成功通过transformers加载CosyVoice模型")
|
|
@@ -287,56 +290,86 @@ def text_to_speech(text, speaker="中文女"):
|
|
| 287 |
|
| 288 |
if model_type == 'transformers':
|
| 289 |
# 使用transformers接口
|
| 290 |
-
tokenizer = model['tokenizer']
|
| 291 |
tts_model = model['model']
|
| 292 |
|
| 293 |
-
#
|
| 294 |
-
inputs = tokenizer(text, return_tensors="pt", padding=True)
|
| 295 |
-
|
| 296 |
-
# 生成语音
|
| 297 |
with torch.no_grad():
|
| 298 |
-
#
|
| 299 |
if hasattr(tts_model, 'inference_sft'):
|
|
|
|
| 300 |
outputs = tts_model.inference_sft(text, speaker)
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
|
|
|
| 316 |
if torch.is_tensor(audio_data):
|
| 317 |
audio_data = audio_data.cpu().numpy()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
else:
|
| 319 |
-
#
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
break
|
| 324 |
-
elif torch.is_tensor(outputs):
|
| 325 |
-
audio_data = outputs.cpu().numpy()
|
| 326 |
-
if audio_data.ndim > 1:
|
| 327 |
-
audio_data = audio_data[0] # 取第一个batch
|
| 328 |
-
elif isinstance(outputs, np.ndarray):
|
| 329 |
-
audio_data = outputs
|
| 330 |
-
else:
|
| 331 |
-
raise TypeError(f"不支持的输出类型: {type(outputs)}")
|
| 332 |
-
|
| 333 |
-
# 确保音频数据是1D数组
|
| 334 |
-
if audio_data.ndim > 1:
|
| 335 |
-
audio_data = audio_data.flatten()
|
| 336 |
-
|
| 337 |
-
sample_rate = 22050
|
| 338 |
-
audio_tuple = (sample_rate, audio_data.astype(np.float32))
|
| 339 |
-
return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: transformers"
|
| 340 |
|
| 341 |
elif model_type == 'pytorch':
|
| 342 |
# 使用PyTorch模型
|
|
|
|
| 36 |
# 方法1: 使用transformers加载(推荐用于Hugging Face Space)
|
| 37 |
try:
|
| 38 |
print("\n[方法1] 尝试使用transformers加载...")
|
| 39 |
+
from transformers import AutoModel
|
| 40 |
import torch
|
| 41 |
|
| 42 |
model_name = "FunAudioLLM/CosyVoice-300M"
|
| 43 |
print(f" 从 {model_name} 加载...")
|
| 44 |
|
| 45 |
+
# CosyVoice需要trust_remote_code=True来加载自定义模型代码
|
|
|
|
| 46 |
model = AutoModel.from_pretrained(
|
| 47 |
model_name,
|
| 48 |
trust_remote_code=True,
|
|
|
|
| 52 |
# 设置为评估模式
|
| 53 |
model.eval()
|
| 54 |
|
| 55 |
+
# 检查模型是否有推理方法
|
| 56 |
+
has_inference = hasattr(model, 'inference_sft') or hasattr(model, 'inference') or hasattr(model, 'generate')
|
| 57 |
+
print(f" 模型推理方法检查: inference_sft={hasattr(model, 'inference_sft')}, inference={hasattr(model, 'inference')}, generate={hasattr(model, 'generate')}")
|
| 58 |
+
|
| 59 |
cosyvoice_model = {
|
|
|
|
| 60 |
'model': model,
|
| 61 |
+
'type': 'transformers',
|
| 62 |
+
'has_inference': has_inference
|
| 63 |
}
|
| 64 |
model_loaded = True
|
| 65 |
print(" ✓ 成功通过transformers加载CosyVoice模型")
|
|
|
|
| 290 |
|
| 291 |
if model_type == 'transformers':
|
| 292 |
# 使用transformers接口
|
|
|
|
| 293 |
tts_model = model['model']
|
| 294 |
|
| 295 |
+
# 生成语音 - 尝试不同的推理方法
|
|
|
|
|
|
|
|
|
|
| 296 |
with torch.no_grad():
|
| 297 |
+
# 方法1: 尝试inference_sft(CosyVoice标准接口)
|
| 298 |
if hasattr(tts_model, 'inference_sft'):
|
| 299 |
+
print(f"使用inference_sft方法: text={text}, speaker={speaker}")
|
| 300 |
outputs = tts_model.inference_sft(text, speaker)
|
| 301 |
+
|
| 302 |
+
# 处理输出
|
| 303 |
+
if isinstance(outputs, dict):
|
| 304 |
+
if 'tts_speech' in outputs:
|
| 305 |
+
audio_data = outputs['tts_speech']
|
| 306 |
+
elif 'audio' in outputs:
|
| 307 |
+
audio_data = outputs['audio']
|
| 308 |
+
else:
|
| 309 |
+
# 取第一个tensor值
|
| 310 |
+
audio_data = next(iter(outputs.values()))
|
| 311 |
+
elif isinstance(outputs, (list, tuple)):
|
| 312 |
+
audio_data = outputs[0]
|
| 313 |
+
else:
|
| 314 |
+
audio_data = outputs
|
| 315 |
+
|
| 316 |
+
# 转换为numpy
|
| 317 |
if torch.is_tensor(audio_data):
|
| 318 |
audio_data = audio_data.cpu().numpy()
|
| 319 |
+
|
| 320 |
+
# 确保是1D数组
|
| 321 |
+
if audio_data.ndim > 1:
|
| 322 |
+
audio_data = audio_data.flatten()
|
| 323 |
+
|
| 324 |
+
sample_rate = 22050
|
| 325 |
+
audio_tuple = (sample_rate, audio_data.astype(np.float32))
|
| 326 |
+
return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"
|
| 327 |
+
|
| 328 |
+
# 方法2: 尝试inference方法
|
| 329 |
+
elif hasattr(tts_model, 'inference'):
|
| 330 |
+
print(f"使用inference方法: text={text}, speaker={speaker}")
|
| 331 |
+
outputs = tts_model.inference(text, speaker)
|
| 332 |
+
|
| 333 |
+
if torch.is_tensor(outputs):
|
| 334 |
+
audio_data = outputs.cpu().numpy()
|
| 335 |
+
else:
|
| 336 |
+
audio_data = outputs
|
| 337 |
+
|
| 338 |
+
if audio_data.ndim > 1:
|
| 339 |
+
audio_data = audio_data.flatten()
|
| 340 |
+
|
| 341 |
+
sample_rate = 22050
|
| 342 |
+
audio_tuple = (sample_rate, audio_data.astype(np.float32))
|
| 343 |
+
return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"
|
| 344 |
+
|
| 345 |
+
# 方法3: 尝试generate方法
|
| 346 |
+
elif hasattr(tts_model, 'generate'):
|
| 347 |
+
print(f"使用generate方法: text={text}")
|
| 348 |
+
# 准备输入
|
| 349 |
+
inputs = {"text": text, "speaker": speaker}
|
| 350 |
+
outputs = tts_model.generate(**inputs)
|
| 351 |
+
|
| 352 |
+
if torch.is_tensor(outputs):
|
| 353 |
+
audio_data = outputs.cpu().numpy()
|
| 354 |
+
elif isinstance(outputs, dict):
|
| 355 |
+
audio_data = outputs.get('audio', outputs.get('waveform', next(iter(outputs.values()))))
|
| 356 |
+
if torch.is_tensor(audio_data):
|
| 357 |
+
audio_data = audio_data.cpu().numpy()
|
| 358 |
+
else:
|
| 359 |
+
audio_data = outputs
|
| 360 |
+
|
| 361 |
+
if audio_data.ndim > 1:
|
| 362 |
+
audio_data = audio_data.flatten()
|
| 363 |
+
|
| 364 |
+
sample_rate = 22050
|
| 365 |
+
audio_tuple = (sample_rate, audio_data.astype(np.float32))
|
| 366 |
+
return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"
|
| 367 |
+
|
| 368 |
else:
|
| 369 |
+
# 没有可用的推理方法
|
| 370 |
+
print(f"模型没有可用的推理方法")
|
| 371 |
+
print(f"可用方法: {[m for m in dir(tts_model) if not m.startswith('_')][:20]}")
|
| 372 |
+
return generate_demo_audio(text, speaker, error="模型缺少推理方法 (inference_sft/inference/generate)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
|
| 374 |
elif model_type == 'pytorch':
|
| 375 |
# 使用PyTorch模型
|