Spaces:

leesenx
/

cv

Build error

App Files Community

leesenx committed 6 days ago

Commit

232e893

verified ·

1 Parent(s): cc001df

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -152

app.py CHANGED Viewed

@@ -1,160 +1,29 @@
-import sys
-import os
-import subprocess
-ROOT = os.path.dirname(os.path.realpath(__file__))
-sys.path.insert(0, ROOT)
-if not os.path.isdir(os.path.join(ROOT, 'cosyvoice')):
-    subprocess.run(['git', 'clone', '--depth', '1', 'https://github.com/FunAudioLLM/CosyVoice.git', 'cosyvoice_repo'], check=True, cwd=ROOT)
-    subprocess.run(['git', 'submodule', 'update', '--init', '--recursive'], check=True, cwd=os.path.join(ROOT, 'cosyvoice_repo'))
-    repo = os.path.join(ROOT, 'cosyvoice_repo')
-    for d in ['cosyvoice', 'third_party', 'asset']:
-        src = os.path.join(repo, d)
-        if os.path.exists(src):
-            os.symlink(src, os.path.join(ROOT, d)) if not os.path.exists(os.path.join(ROOT, d)) else None
-    sys.path.insert(0, os.path.join(ROOT, 'cosyvoice_repo'))
-    sys.path.insert(0, os.path.join(ROOT, 'cosyvoice_repo', 'third_party', 'Matcha-TTS'))
-if os.path.isdir(os.path.join(ROOT, 'cosyvoice_repo', 'third_party', 'Matcha-TTS')):
-    sys.path.insert(0, os.path.join(ROOT, 'cosyvoice_repo', 'third_party', 'Matcha-TTS'))
-elif os.path.isdir(os.path.join(ROOT, 'cosyvoice', '..', 'third_party', 'Matcha-TTS')):
-    sys.path.insert(0, os.path.realpath(os.path.join(ROOT, 'cosyvoice', '..', 'third_party', 'Matcha-TTS')))
-elif os.path.isdir(os.path.join(ROOT, 'third_party', 'Matcha-TTS')):
-    sys.path.insert(0, os.path.join(ROOT, 'third_party', 'Matcha-TTS'))
-import time
-import tempfile
-import gradio as gr
-import torch
-import torchaudio
 from huggingface_hub import snapshot_download
-from cosyvoice.cli.cosyvoice import CosyVoice
-MODEL_DIR = os.path.join(ROOT, 'pretrained_models', 'CosyVoice-300M')
-if not os.path.isfile(os.path.join(MODEL_DIR, 'cosyvoice.yaml')):
-    print("Downloading CosyVoice-300M model from HuggingFace...")
-    snapshot_download(
-        'FunAudioLLM/CosyVoice-300M',
-        local_dir=MODEL_DIR,
-        allow_patterns=['*.pt', '*.onnx', '*.yaml', 'configuration.json'],
-    )
-    fp32_onnx = os.path.join(MODEL_DIR, 'flow.decoder.estimator.fp32.onnx')
-    if os.path.isfile(fp32_onnx):
-        os.remove(fp32_onnx)
-    print("Model download complete.")
-print("Loading model...")
-cosyvoice = CosyVoice(MODEL_DIR)
-SAMPLE_RATE = cosyvoice.sample_rate
-print("Model loaded.")
-def _synthesize(generator):
-    chunks = []
-    for c in generator:
-        chunks.append(c['tts_speech'])
-    if not chunks:
-        return None
-    speech = torch.cat(chunks, dim=1)
-    f = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
-    torchaudio.save(f.name, speech, SAMPLE_RATE)
-    return f.name
-def zero_shot_tts(tts_text, prompt_text, prompt_wav):
-    if not tts_text.strip():
-        raise gr.Error("Enter text to synthesize")
-    if not prompt_text.strip():
-        raise gr.Error("Enter prompt text")
-    if prompt_wav is None:
-        raise gr.Error("Upload reference audio")
-    t0 = time.time()
-    out = _synthesize(cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_wav, stream=False))
-    return out, f"Done in {time.time()-t0:.1f}s"
-def cross_lingual_tts(tts_text, prompt_wav):
-    if not tts_text.strip():
-        raise gr.Error("Enter text to synthesize")
-    if prompt_wav is None:
-        raise gr.Error("Upload reference audio")
-    t0 = time.time()
-    out = _synthesize(cosyvoice.inference_cross_lingual(tts_text, prompt_wav, stream=False))
-    return out, f"Done in {time.time()-t0:.1f}s"
-def voice_conversion(source_wav, prompt_wav):
-    if source_wav is None:
-        raise gr.Error("Upload source audio")
-    if prompt_wav is None:
-        raise gr.Error("Upload target speaker audio")
-    t0 = time.time()
-    out = _synthesize(cosyvoice.inference_vc(source_wav, prompt_wav))
-    return out, f"Done in {time.time()-t0:.1f}s"
-ASSET = os.path.join(ROOT, 'asset') if os.path.isdir(os.path.join(ROOT, 'asset')) else os.path.join(ROOT, 'cosyvoice_repo', 'asset')
-with gr.Blocks(title="CosyVoice-300M TTS") as app:
-    gr.Markdown("# CosyVoice-300M Text-to-Speech\n> CPU inference — slow (~40-70x realtime), please be patient!")
-    with gr.Tabs():
-        with gr.Tab("Zero-Shot TTS"):
-            gr.Markdown("Clone a voice from a short reference audio.")
-            with gr.Row():
-                with gr.Column():
-                    zs_text = gr.Textbox(label="Text to Synthesize", lines=3)
-                    zs_ptext = gr.Textbox(label="Prompt Text (transcript of reference)", lines=2)
-                    zs_wav = gr.Audio(label="Reference Audio", type="filepath")
-                    zs_btn = gr.Button("Synthesize", variant="primary")
-                with gr.Column():
-                    zs_out = gr.Audio(label="Output", type="filepath")
-                    zs_info = gr.Textbox(label="Info", interactive=False)
-            zs_btn.click(zero_shot_tts, [zs_text, zs_ptext, zs_wav], [zs_out, zs_info])
-            examples_zs = [
-                ["收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。",
-                 "希望你以后能够做的比我还好呦。",
-                 os.path.join(ASSET, 'zero_shot_prompt.wav')]
-            ] if os.path.isfile(os.path.join(ASSET, 'zero_shot_prompt.wav')) else None
-            if examples_zs:
-                gr.Examples(examples_zs, [zs_text, zs_ptext, zs_wav])
-        with gr.Tab("Cross-Lingual TTS"):
-            gr.Markdown("Synthesize in another language, keeping the speaker's voice. Prefix with `<|en|>`, `<|zh|>`, `<|ja|>`, `<|ko|>`, `<|de|>`, `<|fr|>`.")
-            with gr.Row():
-                with gr.Column():
-                    cl_text = gr.Textbox(label="Text (with language tag)", lines=3, placeholder="<|en|>Hello world")
-                    cl_wav = gr.Audio(label="Reference Audio", type="filepath")
-                    cl_btn = gr.Button("Synthesize", variant="primary")
-                with gr.Column():
-                    cl_out = gr.Audio(label="Output", type="filepath")
-                    cl_info = gr.Textbox(label="Info", interactive=False)
-            cl_btn.click(cross_lingual_tts, [cl_text, cl_wav], [cl_out, cl_info])
-            examples_cl = [
-                ["<|en|>And then later on, fully acquiring that company.",
-                 os.path.join(ASSET, 'cross_lingual_prompt.wav')]
-            ] if os.path.isfile(os.path.join(ASSET, 'cross_lingual_prompt.wav')) else None
-            if examples_cl:
-                gr.Examples(examples_cl, [cl_text, cl_wav])
-        with gr.Tab("Voice Conversion"):
-            gr.Markdown("Convert source audio to sound like the target speaker.")
-            with gr.Row():
-                with gr.Column():
-                    vc_src = gr.Audio(label="Source Audio", type="filepath")
-                    vc_ref = gr.Audio(label="Target Speaker Audio", type="filepath")
-                    vc_btn = gr.Button("Convert", variant="primary")
-                with gr.Column():
-                    vc_out = gr.Audio(label="Output", type="filepath")
-                    vc_info = gr.Textbox(label="Info", interactive=False)
-            vc_btn.click(voice_conversion, [vc_src, vc_ref], [vc_out, vc_info])
-            examples_vc = [
-                [os.path.join(ASSET, 'cross_lingual_prompt.wav'),
-                 os.path.join(ASSET, 'zero_shot_prompt.wav')]
-            ] if os.path.isfile(os.path.join(ASSET, 'cross_lingual_prompt.wav')) else None
-            if examples_vc:
-                gr.Examples(examples_vc, [vc_src, vc_ref])
-if __name__ == '__main__':
-    app.launch()

+import os, sys, subprocess, torch, numpy as np, gradio as gr
 from huggingface_hub import snapshot_download
+subprocess.run(["git", "clone", "--recursive", "https://github.com/FunAudioLLM/CosyVoice.git", "CosyVoice"], check=True)
+sys.path.insert(0, "CosyVoice/third_party/Matcha-TTS")
+sys.path.insert(0, "CosyVoice")
+model_dir = snapshot_download("iic/CosyVoice-300M-SFT", local_dir="pretrained_models/CosyVoice-300M-SFT")
+from cosyvoice.cli.cosyvoice import CosyVoice
+cosyvoice = CosyVoice(model_dir)
+spk_list = cosyvoice.list_available_spks()
+def tts(text, spk):
+    for result in cosyvoice.inference_sft(text, spk, stream=False):
+        audio = result["tts_speech"].numpy().flatten()
+        return (cosyvoice.sample_rate, audio)
+demo = gr.Interface(
+    fn=tts,
+    inputs=[
+        gr.Textbox(label="Text", value="你好，我是通义生成式语音大模型。"),
+        gr.Dropdown(choices=spk_list, value=spk_list[0], label="Speaker"),
+    ],
+    outputs=gr.Audio(label="Audio"),
+)
+demo.launch()