jerrybwang committed on
Commit
bd60378
·
1 Parent(s): 43a7cd5
Files changed (1) hide show
  1. app.py +98 -57
app.py CHANGED
@@ -264,65 +264,85 @@ def text_to_speech(text, speaker="中文女", prompt_audio=None, prompt_text=Non
264
 
265
  print(f"使用官方CosyVoice API: text={text[:50]}...")
266
 
267
- # 使用 inference_sft 方法(预训练说话人
268
- # 注意:CosyVoice-300M 支持的说话人需要查看模型文档
269
  try:
270
- # 尝试使用 inference_sft
271
- audio_chunks = []
272
- for i, output in enumerate(cosyvoice.inference_sft(text, speaker, stream=False)):
273
- if isinstance(output, dict) and 'tts_speech' in output:
274
- audio_chunks.append(output['tts_speech'])
275
- else:
276
- audio_chunks.append(output)
277
 
278
- # 合并音频
279
- if audio_chunks:
280
- if torch.is_tensor(audio_chunks[0]):
281
- audio_data = torch.cat(audio_chunks, dim=-1).cpu().numpy()
282
- else:
283
- audio_data = np.concatenate(audio_chunks, axis=-1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
- # 确保是1D数组
286
- if audio_data.ndim > 1:
287
- audio_data = audio_data.flatten()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
- audio_tuple = (sample_rate, audio_data.astype(np.float32))
290
- return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (官方API - SFT)"
 
 
 
 
 
 
 
 
 
291
 
292
  except Exception as e:
293
- print(f"inference_sft 失败: {e}")
294
- # 如果有提示音频,尝试 zero-shot
295
- if prompt_audio and prompt_text:
296
- try:
297
- audio_chunks = []
298
- for i, output in enumerate(cosyvoice.inference_zero_shot(
299
- text,
300
- prompt_text,
301
- prompt_audio,
302
- stream=False
303
- )):
304
- if isinstance(output, dict) and 'tts_speech' in output:
305
- audio_chunks.append(output['tts_speech'])
306
- else:
307
- audio_chunks.append(output)
308
-
309
- if audio_chunks:
310
- if torch.is_tensor(audio_chunks[0]):
311
- audio_data = torch.cat(audio_chunks, dim=-1).cpu().numpy()
312
- else:
313
- audio_data = np.concatenate(audio_chunks, axis=-1)
314
-
315
- if audio_data.ndim > 1:
316
- audio_data = audio_data.flatten()
317
-
318
- audio_tuple = (sample_rate, audio_data.astype(np.float32))
319
- return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n模式: Zero-shot\n模型: CosyVoice (官方API)"
320
-
321
- except Exception as e2:
322
- print(f"inference_zero_shot 也失败: {e2}")
323
- raise e
324
- else:
325
- raise e
326
 
327
  elif model_type == 'transformers':
328
  # 使用transformers接口
@@ -523,7 +543,7 @@ try:
523
  except:
524
  theme = None
525
 
526
- with gr.Blocks(theme=theme) as demo:
527
  gr.Markdown(f"# {title}")
528
  gr.Markdown(description)
529
 
@@ -571,10 +591,31 @@ with gr.Blocks(theme=theme) as demo:
571
  lines=3
572
  )
573
  speaker_input = gr.Dropdown(
574
- label="选择说话人",
575
  choices=["中文女", "中文男", "英文女", "英文男", "粤语女", "粤语男", "日语男", "韩语女"],
576
  value="中文女"
577
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
  tts_btn = gr.Button("生成语音", variant="primary")
579
 
580
  with gr.Column():
@@ -583,7 +624,7 @@ with gr.Blocks(theme=theme) as demo:
583
 
584
  tts_btn.click(
585
  fn=text_to_speech,
586
- inputs=[text_input, speaker_input],
587
  outputs=[audio_output, tts_status]
588
  )
589
 
@@ -610,4 +651,4 @@ with gr.Blocks(theme=theme) as demo:
610
  """)
611
 
612
  if __name__ == "__main__":
613
- demo.launch()
 
264
 
265
  print(f"使用官方CosyVoice API: text={text[:50]}...")
266
 
267
+ # 使用 inference_zero_shot 方法(zero-shot克隆
268
+ # 根据官方文档,使用正确的提示文本格式
269
  try:
270
+ # 准备提示文本(使用官方格式)
271
+ if prompt_text is None:
272
+ # 使用默认提示文本
273
+ prompt_text = 'You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。'
 
 
 
274
 
275
+ # 准备提示音频
276
+ if prompt_audio is None:
277
+ # 尝试使用项目中的示例音频
278
+ possible_prompt_paths = [
279
+ './asset/zero_shot_prompt.wav',
280
+ './CosyVoice/asset/zero_shot_prompt.wav',
281
+ './zero_shot_prompt.wav'
282
+ ]
283
+ for path in possible_prompt_paths:
284
+ if os.path.exists(path):
285
+ prompt_audio = path
286
+ print(f"使用提示音频: {path}")
287
+ break
288
+
289
+ # 如果有提示音频,使用 zero-shot 模式
290
+ if prompt_audio and os.path.exists(prompt_audio):
291
+ print(f"使用 inference_zero_shot: text={text[:30]}, prompt={prompt_text[:50]}")
292
+ audio_chunks = []
293
+ for i, output in enumerate(cosyvoice.inference_zero_shot(
294
+ text,
295
+ prompt_text,
296
+ prompt_audio,
297
+ stream=False
298
+ )):
299
+ if isinstance(output, dict) and 'tts_speech' in output:
300
+ audio_chunks.append(output['tts_speech'])
301
+ else:
302
+ audio_chunks.append(output)
303
 
304
+ if audio_chunks:
305
+ if torch.is_tensor(audio_chunks[0]):
306
+ audio_data = torch.cat(audio_chunks, dim=-1).cpu().numpy()
307
+ else:
308
+ audio_data = np.concatenate(audio_chunks, axis=-1)
309
+
310
+ if audio_data.ndim > 1:
311
+ audio_data = audio_data.flatten()
312
+
313
+ audio_tuple = (sample_rate, audio_data.astype(np.float32))
314
+ return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n模式: Zero-shot\n模型: CosyVoice (官方API)"
315
+
316
+ # 如果没有提示音频,尝试使用 inference_sft(预训练说话人)
317
+ else:
318
+ print(f"使用 inference_sft: text={text[:30]}, speaker={speaker}")
319
+ # CosyVoice-300M 可能支持的说话人ID
320
+ # 需要根据实际模型调整
321
+ audio_chunks = []
322
+ for i, output in enumerate(cosyvoice.inference_sft(text, speaker, stream=False)):
323
+ if isinstance(output, dict) and 'tts_speech' in output:
324
+ audio_chunks.append(output['tts_speech'])
325
+ else:
326
+ audio_chunks.append(output)
327
 
328
+ if audio_chunks:
329
+ if torch.is_tensor(audio_chunks[0]):
330
+ audio_data = torch.cat(audio_chunks, dim=-1).cpu().numpy()
331
+ else:
332
+ audio_data = np.concatenate(audio_chunks, axis=-1)
333
+
334
+ if audio_data.ndim > 1:
335
+ audio_data = audio_data.flatten()
336
+
337
+ audio_tuple = (sample_rate, audio_data.astype(np.float32))
338
+ return audio_tuple, f"✓ 语音合成成功\n文本: {text}\n说话人: {speaker}\n模型: CosyVoice (官方API - SFT)"
339
 
340
  except Exception as e:
341
+ print(f"CosyVoice API 调用失败: {e}")
342
+ import traceback
343
+ traceback.print_exc()
344
+ # 返回演示音频并显示错误信息
345
+ return generate_demo_audio(text, speaker, error=f"API调用失败: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
 
347
  elif model_type == 'transformers':
348
  # 使用transformers接口
 
543
  except:
544
  theme = None
545
 
546
+ with gr.Blocks() as demo:
547
  gr.Markdown(f"# {title}")
548
  gr.Markdown(description)
549
 
 
591
  lines=3
592
  )
593
  speaker_input = gr.Dropdown(
594
+ label="选择说话人(SFT模式)",
595
  choices=["中文女", "中文男", "英文女", "英文男", "粤语女", "粤语男", "日语男", "韩语女"],
596
  value="中文女"
597
  )
598
+
599
+ # Zero-shot 模式选项
600
+ with gr.Accordion("高级选项 - Zero-shot 声音克隆", open=False):
601
+ prompt_audio_input = gr.Audio(
602
+ label="上传提示音频(3-10秒)",
603
+ type="filepath",
604
+ sources=["upload"]
605
+ )
606
+ prompt_text_input = gr.Textbox(
607
+ label="提示文本(音频对应的文字)",
608
+ placeholder="You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。",
609
+ lines=2
610
+ )
611
+ gr.Markdown("""
612
+ **使用说明:**
613
+ - 上传一段3-10秒的参考音频
614
+ - 输入音频对应的文字内容
615
+ - 格式:`You are a helpful assistant.<|endofprompt|>音频对应的文字`
616
+ - 系统将克隆该音频的音色来合成新文本
617
+ """)
618
+
619
  tts_btn = gr.Button("生成语音", variant="primary")
620
 
621
  with gr.Column():
 
624
 
625
  tts_btn.click(
626
  fn=text_to_speech,
627
+ inputs=[text_input, speaker_input, prompt_audio_input, prompt_text_input],
628
  outputs=[audio_output, tts_status]
629
  )
630
 
 
651
  """)
652
 
653
  if __name__ == "__main__":
654
+ demo.launch(theme=theme)