jerrybwang commited on
Commit
6baad6c
·
1 Parent(s): 228e994

--other 更新代码

Browse files
Files changed (1) hide show
  1. app.py +80 -47
app.py CHANGED
@@ -36,14 +36,13 @@ def load_cosyvoice_model():
36
  # 方法1: 使用transformers加载(推荐用于Hugging Face Space)
37
  try:
38
  print("\n[方法1] 尝试使用transformers加载...")
39
- from transformers import AutoModel, AutoTokenizer
40
  import torch
41
 
42
  model_name = "FunAudioLLM/CosyVoice-300M"
43
  print(f" 从 {model_name} 加载...")
44
 
45
- # CosyVoice需要trust_remote_code=True
46
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
47
  model = AutoModel.from_pretrained(
48
  model_name,
49
  trust_remote_code=True,
@@ -53,10 +52,14 @@ def load_cosyvoice_model():
53
  # 设置为评估模式
54
  model.eval()
55
 
 
 
 
 
56
  cosyvoice_model = {
57
- 'tokenizer': tokenizer,
58
  'model': model,
59
- 'type': 'transformers'
 
60
  }
61
  model_loaded = True
62
  print(" ✓ 成功通过transformers加载CosyVoice模型")
@@ -287,56 +290,86 @@ def text_to_speech(text, speaker="中文女"):
287
 
288
  if model_type == 'transformers':
289
  # 使用transformers接口
290
- tokenizer = model['tokenizer']
291
  tts_model = model['model']
292
 
293
- # 准备输入
294
- inputs = tokenizer(text, return_tensors="pt", padding=True)
295
-
296
- # 生成语音
297
  with torch.no_grad():
298
- # 尝试不同的生成方法
299
  if hasattr(tts_model, 'inference_sft'):
 
300
  outputs = tts_model.inference_sft(text, speaker)
301
- elif hasattr(tts_model, 'generate'):
302
- outputs = tts_model.generate(**inputs)
303
- elif hasattr(tts_model, 'forward'):
304
- outputs = tts_model(**inputs)
305
- else:
306
- raise AttributeError("模型没有可用的推理方法")
307
-
308
- # 处理输出
309
- if isinstance(outputs, dict):
310
- if 'tts_speech' in outputs:
311
- audio_data = outputs['tts_speech']
312
- if torch.is_tensor(audio_data):
313
- audio_data = audio_data.cpu().numpy()
314
- elif 'audio' in outputs:
315
- audio_data = outputs['audio']
 
316
  if torch.is_tensor(audio_data):
317
  audio_data = audio_data.cpu().numpy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  else:
319
- # 使第一个tensor输出
320
- for v in outputs.values():
321
- if torch.is_tensor(v):
322
- audio_data = v.cpu().numpy()
323
- break
324
- elif torch.is_tensor(outputs):
325
- audio_data = outputs.cpu().numpy()
326
- if audio_data.ndim > 1:
327
- audio_data = audio_data[0] # 取第一个batch
328
- elif isinstance(outputs, np.ndarray):
329
- audio_data = outputs
330
- else:
331
- raise TypeError(f"不支持的输出类型: {type(outputs)}")
332
-
333
- # 确保音频数据是1D数组
334
- if audio_data.ndim > 1:
335
- audio_data = audio_data.flatten()
336
-
337
- sample_rate = 22050
338
- audio_tuple = (sample_rate, audio_data.astype(np.float32))
339
- return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: transformers"
340
 
341
  elif model_type == 'pytorch':
342
  # 使用PyTorch模型
 
36
  # 方法1: 使用transformers加载(推荐用于Hugging Face Space)
37
  try:
38
  print("\n[方法1] 尝试使用transformers加载...")
39
+ from transformers import AutoModel
40
  import torch
41
 
42
  model_name = "FunAudioLLM/CosyVoice-300M"
43
  print(f" 从 {model_name} 加载...")
44
 
45
+ # CosyVoice需要trust_remote_code=True来加载自定义模型代码
 
46
  model = AutoModel.from_pretrained(
47
  model_name,
48
  trust_remote_code=True,
 
52
  # 设置为评估模式
53
  model.eval()
54
 
55
+ # 检查模型是否有推理方法
56
+ has_inference = hasattr(model, 'inference_sft') or hasattr(model, 'inference') or hasattr(model, 'generate')
57
+ print(f" 模型推理方法检查: inference_sft={hasattr(model, 'inference_sft')}, inference={hasattr(model, 'inference')}, generate={hasattr(model, 'generate')}")
58
+
59
  cosyvoice_model = {
 
60
  'model': model,
61
+ 'type': 'transformers',
62
+ 'has_inference': has_inference
63
  }
64
  model_loaded = True
65
  print(" ✓ 成功通过transformers加载CosyVoice模型")
 
290
 
291
  if model_type == 'transformers':
292
  # 使用transformers接口
 
293
  tts_model = model['model']
294
 
295
+ # 生成语音 - 尝试不同的推理方法
 
 
 
296
  with torch.no_grad():
297
+ # 方法1: 尝试inference_sft(CosyVoice标准接口)
298
  if hasattr(tts_model, 'inference_sft'):
299
+ print(f"使用inference_sft方法: text={text}, speaker={speaker}")
300
  outputs = tts_model.inference_sft(text, speaker)
301
+
302
+ # 处理输出
303
+ if isinstance(outputs, dict):
304
+ if 'tts_speech' in outputs:
305
+ audio_data = outputs['tts_speech']
306
+ elif 'audio' in outputs:
307
+ audio_data = outputs['audio']
308
+ else:
309
+ # 取第一个tensor值
310
+ audio_data = next(iter(outputs.values()))
311
+ elif isinstance(outputs, (list, tuple)):
312
+ audio_data = outputs[0]
313
+ else:
314
+ audio_data = outputs
315
+
316
+ # 转换为numpy
317
  if torch.is_tensor(audio_data):
318
  audio_data = audio_data.cpu().numpy()
319
+
320
+ # 确保是1D数组
321
+ if audio_data.ndim > 1:
322
+ audio_data = audio_data.flatten()
323
+
324
+ sample_rate = 22050
325
+ audio_tuple = (sample_rate, audio_data.astype(np.float32))
326
+ return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"
327
+
328
+ # 方法2: 尝试inference方法
329
+ elif hasattr(tts_model, 'inference'):
330
+ print(f"使用inference方法: text={text}, speaker={speaker}")
331
+ outputs = tts_model.inference(text, speaker)
332
+
333
+ if torch.is_tensor(outputs):
334
+ audio_data = outputs.cpu().numpy()
335
+ else:
336
+ audio_data = outputs
337
+
338
+ if audio_data.ndim > 1:
339
+ audio_data = audio_data.flatten()
340
+
341
+ sample_rate = 22050
342
+ audio_tuple = (sample_rate, audio_data.astype(np.float32))
343
+ return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"
344
+
345
+ # 方法3: 尝试generate方法
346
+ elif hasattr(tts_model, 'generate'):
347
+ print(f"使用generate方法: text={text}")
348
+ # 准备输入
349
+ inputs = {"text": text, "speaker": speaker}
350
+ outputs = tts_model.generate(**inputs)
351
+
352
+ if torch.is_tensor(outputs):
353
+ audio_data = outputs.cpu().numpy()
354
+ elif isinstance(outputs, dict):
355
+ audio_data = outputs.get('audio', outputs.get('waveform', next(iter(outputs.values()))))
356
+ if torch.is_tensor(audio_data):
357
+ audio_data = audio_data.cpu().numpy()
358
+ else:
359
+ audio_data = outputs
360
+
361
+ if audio_data.ndim > 1:
362
+ audio_data = audio_data.flatten()
363
+
364
+ sample_rate = 22050
365
+ audio_tuple = (sample_rate, audio_data.astype(np.float32))
366
+ return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (transformers)"
367
+
368
  else:
369
+ # 没有可用的推理方法
370
+ print(f"模型没有可用的推理方法")
371
+ print(f"可用方法: {[m for m in dir(tts_model) if not m.startswith('_')][:20]}")
372
+ return generate_demo_audio(text, speaker, error="模型缺少推理方法 (inference_sft/inference/generate)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
  elif model_type == 'pytorch':
375
  # 使用PyTorch模型