leesenx committed on
Commit
232e893
·
verified ·
1 Parent(s): cc001df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -152
app.py CHANGED
@@ -1,160 +1,29 @@
1
- import sys
2
- import os
3
- import subprocess
4
-
5
- ROOT = os.path.dirname(os.path.realpath(__file__))
6
- sys.path.insert(0, ROOT)
7
-
8
- if not os.path.isdir(os.path.join(ROOT, 'cosyvoice')):
9
- subprocess.run(['git', 'clone', '--depth', '1', 'https://github.com/FunAudioLLM/CosyVoice.git', 'cosyvoice_repo'], check=True, cwd=ROOT)
10
- subprocess.run(['git', 'submodule', 'update', '--init', '--recursive'], check=True, cwd=os.path.join(ROOT, 'cosyvoice_repo'))
11
- repo = os.path.join(ROOT, 'cosyvoice_repo')
12
- for d in ['cosyvoice', 'third_party', 'asset']:
13
- src = os.path.join(repo, d)
14
- if os.path.exists(src):
15
- os.symlink(src, os.path.join(ROOT, d)) if not os.path.exists(os.path.join(ROOT, d)) else None
16
- sys.path.insert(0, os.path.join(ROOT, 'cosyvoice_repo'))
17
- sys.path.insert(0, os.path.join(ROOT, 'cosyvoice_repo', 'third_party', 'Matcha-TTS'))
18
-
19
- if os.path.isdir(os.path.join(ROOT, 'cosyvoice_repo', 'third_party', 'Matcha-TTS')):
20
- sys.path.insert(0, os.path.join(ROOT, 'cosyvoice_repo', 'third_party', 'Matcha-TTS'))
21
- elif os.path.isdir(os.path.join(ROOT, 'cosyvoice', '..', 'third_party', 'Matcha-TTS')):
22
- sys.path.insert(0, os.path.realpath(os.path.join(ROOT, 'cosyvoice', '..', 'third_party', 'Matcha-TTS')))
23
- elif os.path.isdir(os.path.join(ROOT, 'third_party', 'Matcha-TTS')):
24
- sys.path.insert(0, os.path.join(ROOT, 'third_party', 'Matcha-TTS'))
25
-
26
- import time
27
- import tempfile
28
- import gradio as gr
29
- import torch
30
- import torchaudio
31
  from huggingface_hub import snapshot_download
32
- from cosyvoice.cli.cosyvoice import CosyVoice
33
-
34
- MODEL_DIR = os.path.join(ROOT, 'pretrained_models', 'CosyVoice-300M')
35
- if not os.path.isfile(os.path.join(MODEL_DIR, 'cosyvoice.yaml')):
36
- print("Downloading CosyVoice-300M model from HuggingFace...")
37
- snapshot_download(
38
- 'FunAudioLLM/CosyVoice-300M',
39
- local_dir=MODEL_DIR,
40
- allow_patterns=['*.pt', '*.onnx', '*.yaml', 'configuration.json'],
41
- )
42
- fp32_onnx = os.path.join(MODEL_DIR, 'flow.decoder.estimator.fp32.onnx')
43
- if os.path.isfile(fp32_onnx):
44
- os.remove(fp32_onnx)
45
- print("Model download complete.")
46
-
47
- print("Loading model...")
48
- cosyvoice = CosyVoice(MODEL_DIR)
49
- SAMPLE_RATE = cosyvoice.sample_rate
50
- print("Model loaded.")
51
-
52
 
53
- def _synthesize(generator):
54
- chunks = []
55
- for c in generator:
56
- chunks.append(c['tts_speech'])
57
- if not chunks:
58
- return None
59
- speech = torch.cat(chunks, dim=1)
60
- f = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
61
- torchaudio.save(f.name, speech, SAMPLE_RATE)
62
- return f.name
63
 
 
64
 
65
- def zero_shot_tts(tts_text, prompt_text, prompt_wav):
66
- if not tts_text.strip():
67
- raise gr.Error("Enter text to synthesize")
68
- if not prompt_text.strip():
69
- raise gr.Error("Enter prompt text")
70
- if prompt_wav is None:
71
- raise gr.Error("Upload reference audio")
72
- t0 = time.time()
73
- out = _synthesize(cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_wav, stream=False))
74
- return out, f"Done in {time.time()-t0:.1f}s"
75
-
76
-
77
- def cross_lingual_tts(tts_text, prompt_wav):
78
- if not tts_text.strip():
79
- raise gr.Error("Enter text to synthesize")
80
- if prompt_wav is None:
81
- raise gr.Error("Upload reference audio")
82
- t0 = time.time()
83
- out = _synthesize(cosyvoice.inference_cross_lingual(tts_text, prompt_wav, stream=False))
84
- return out, f"Done in {time.time()-t0:.1f}s"
85
-
86
-
87
- def voice_conversion(source_wav, prompt_wav):
88
- if source_wav is None:
89
- raise gr.Error("Upload source audio")
90
- if prompt_wav is None:
91
- raise gr.Error("Upload target speaker audio")
92
- t0 = time.time()
93
- out = _synthesize(cosyvoice.inference_vc(source_wav, prompt_wav))
94
- return out, f"Done in {time.time()-t0:.1f}s"
95
-
96
-
97
- ASSET = os.path.join(ROOT, 'asset') if os.path.isdir(os.path.join(ROOT, 'asset')) else os.path.join(ROOT, 'cosyvoice_repo', 'asset')
98
-
99
- with gr.Blocks(title="CosyVoice-300M TTS") as app:
100
- gr.Markdown("# CosyVoice-300M Text-to-Speech\n> CPU inference — slow (~40-70x realtime), please be patient!")
101
 
102
- with gr.Tabs():
103
- with gr.Tab("Zero-Shot TTS"):
104
- gr.Markdown("Clone a voice from a short reference audio.")
105
- with gr.Row():
106
- with gr.Column():
107
- zs_text = gr.Textbox(label="Text to Synthesize", lines=3)
108
- zs_ptext = gr.Textbox(label="Prompt Text (transcript of reference)", lines=2)
109
- zs_wav = gr.Audio(label="Reference Audio", type="filepath")
110
- zs_btn = gr.Button("Synthesize", variant="primary")
111
- with gr.Column():
112
- zs_out = gr.Audio(label="Output", type="filepath")
113
- zs_info = gr.Textbox(label="Info", interactive=False)
114
- zs_btn.click(zero_shot_tts, [zs_text, zs_ptext, zs_wav], [zs_out, zs_info])
115
- examples_zs = [
116
- ["收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。",
117
- "希望你以后能够做的比我还好呦。",
118
- os.path.join(ASSET, 'zero_shot_prompt.wav')]
119
- ] if os.path.isfile(os.path.join(ASSET, 'zero_shot_prompt.wav')) else None
120
- if examples_zs:
121
- gr.Examples(examples_zs, [zs_text, zs_ptext, zs_wav])
122
 
123
- with gr.Tab("Cross-Lingual TTS"):
124
- gr.Markdown("Synthesize in another language, keeping the speaker's voice. Prefix with `<|en|>`, `<|zh|>`, `<|ja|>`, `<|ko|>`, `<|de|>`, `<|fr|>`.")
125
- with gr.Row():
126
- with gr.Column():
127
- cl_text = gr.Textbox(label="Text (with language tag)", lines=3, placeholder="<|en|>Hello world")
128
- cl_wav = gr.Audio(label="Reference Audio", type="filepath")
129
- cl_btn = gr.Button("Synthesize", variant="primary")
130
- with gr.Column():
131
- cl_out = gr.Audio(label="Output", type="filepath")
132
- cl_info = gr.Textbox(label="Info", interactive=False)
133
- cl_btn.click(cross_lingual_tts, [cl_text, cl_wav], [cl_out, cl_info])
134
- examples_cl = [
135
- ["<|en|>And then later on, fully acquiring that company.",
136
- os.path.join(ASSET, 'cross_lingual_prompt.wav')]
137
- ] if os.path.isfile(os.path.join(ASSET, 'cross_lingual_prompt.wav')) else None
138
- if examples_cl:
139
- gr.Examples(examples_cl, [cl_text, cl_wav])
140
 
141
- with gr.Tab("Voice Conversion"):
142
- gr.Markdown("Convert source audio to sound like the target speaker.")
143
- with gr.Row():
144
- with gr.Column():
145
- vc_src = gr.Audio(label="Source Audio", type="filepath")
146
- vc_ref = gr.Audio(label="Target Speaker Audio", type="filepath")
147
- vc_btn = gr.Button("Convert", variant="primary")
148
- with gr.Column():
149
- vc_out = gr.Audio(label="Output", type="filepath")
150
- vc_info = gr.Textbox(label="Info", interactive=False)
151
- vc_btn.click(voice_conversion, [vc_src, vc_ref], [vc_out, vc_info])
152
- examples_vc = [
153
- [os.path.join(ASSET, 'cross_lingual_prompt.wav'),
154
- os.path.join(ASSET, 'zero_shot_prompt.wav')]
155
- ] if os.path.isfile(os.path.join(ASSET, 'cross_lingual_prompt.wav')) else None
156
- if examples_vc:
157
- gr.Examples(examples_vc, [vc_src, vc_ref])
158
 
159
- if __name__ == '__main__':
160
- app.launch()
 
1
+ import os, sys, subprocess, torch, numpy as np, gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from huggingface_hub import snapshot_download
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
# Fetch the CosyVoice sources (with submodules) only when they are missing.
# `git clone` exits non-zero if the target directory already exists, and with
# check=True that would abort startup on every restart after the first run.
if not os.path.isdir("CosyVoice"):
    subprocess.run(
        ["git", "clone", "--recursive", "https://github.com/FunAudioLLM/CosyVoice.git", "CosyVoice"],
        check=True,
    )
# Put the checkout and its third_party/Matcha-TTS directory at the front of the
# import path so the packages they contain can be imported below.
sys.path.insert(0, "CosyVoice/third_party/Matcha-TTS")
sys.path.insert(0, "CosyVoice")
 
 
 
 
 
 
 
7
 
8
# Download the SFT checkpoint into a local directory and keep its path.
# NOTE(review): the previous id "iic/CosyVoice-300M-SFT" is the ModelScope
# name; through huggingface_hub the checkpoint is published under the
# "FunAudioLLM" organisation — confirm against the Hub before merging.
model_dir = snapshot_download(
    "FunAudioLLM/CosyVoice-300M-SFT",
    local_dir="pretrained_models/CosyVoice-300M-SFT",
)
9
 
10
+ from cosyvoice.cli.cosyvoice import CosyVoice
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
# Load the model once at startup and cache the speaker ids it reports.
cosyvoice = CosyVoice(model_dir)
spk_list = cosyvoice.list_available_spks()
if not spk_list:
    # The interface preselects spk_list[0]; fail here with a clear message
    # rather than with a bare IndexError while the UI is being built.
    raise RuntimeError(f"No speakers available in model at {model_dir!r}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
def tts(text, spk):
    """Synthesize ``text`` with the preset speaker ``spk``.

    Args:
        text: Text to speak.
        spk: Speaker id passed through to ``cosyvoice.inference_sft``.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``samples`` is a flat NumPy
        array containing every generated segment back to back.

    Raises:
        gr.Error: If ``text`` is blank or the model yields no audio.
    """
    if not text or not text.strip():
        raise gr.Error("Enter text to synthesize")
    # inference_sft is a generator that may yield several results for one
    # request; keeping only the last one would drop the start of the audio.
    segments = [
        result["tts_speech"].numpy().flatten()
        for result in cosyvoice.inference_sft(text, spk, stream=False)
    ]
    if not segments:
        raise gr.Error("No audio was generated")
    return (cosyvoice.sample_rate, np.concatenate(segments))
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
# Describe the widgets first, then wire them into a single-function interface.
text_input = gr.Textbox(label="Text", value="你好,我是通义生成式语音大模型。")
speaker_input = gr.Dropdown(choices=spk_list, value=spk_list[0], label="Speaker")
audio_output = gr.Audio(label="Audio")

demo = gr.Interface(fn=tts, inputs=[text_input, speaker_input], outputs=audio_output)

# Start serving the UI.
demo.launch()