File size: 5,686 Bytes
eb07486
 
 
 
 
 
 
 
 
 
 
 
272246e
eb07486
 
 
 
94b0f9d
eb07486
272246e
94b0f9d
66633a9
d8fe735
272246e
 
73e47b2
 
 
 
 
 
 
 
eb07486
 
 
 
 
73e47b2
272246e
73e47b2
272246e
 
 
73e47b2
 
 
 
 
 
272246e
73e47b2
 
272246e
73e47b2
 
272246e
73e47b2
272246e
73e47b2
 
272246e
73e47b2
eb07486
 
272246e
eb07486
 
 
 
 
 
272246e
eb07486
272246e
73e47b2
 
272246e
73e47b2
eb07486
73e47b2
eb07486
 
272246e
 
 
eb07486
73e47b2
eb07486
73e47b2
272246e
 
73e47b2
272246e
73e47b2
272246e
eb07486
272246e
 
eb07486
73e47b2
 
eb07486
 
272246e
73e47b2
272246e
 
73e47b2
 
eb07486
73e47b2
 
eb07486
 
73e47b2
272246e
73e47b2
 
eb07486
73e47b2
 
 
272246e
eb07486
73e47b2
272246e
73e47b2
 
272246e
 
73e47b2
272246e
73e47b2
 
eb07486
 
272246e
eb07486
 
 
 
73e47b2
272246e
73e47b2
 
272246e
 
 
eb07486
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
import os
import sys
import argparse
import gradio as gr
import numpy as np
import torch
import torchaudio
import random
import librosa
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Make the vendored Matcha-TTS package importable before cosyvoice is imported.
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

from modelscope import snapshot_download, HubApi
from huggingface_hub import snapshot_download as hf_snapshot_download

# Model download section kept unchanged: fetch the TTS model (HuggingFace),
# the SenseVoice ASR model (ModelScope) and the ttsfrd frontend resources.
hf_snapshot_download('FunAudioLLM/Fun-CosyVoice3-0.5B-2512', local_dir='pretrained_models/Fun-CosyVoice3-0.5B')
snapshot_download('iic/SenseVoiceSmall', local_dir='pretrained_models/SenseVoiceSmall')
hf_snapshot_download('FunAudioLLM/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')

# NOTE: ttsfrd may require a platform-specific build in some CPU environments;
# kept as-is. HACK: installs pinned wheels (cp310/linux_x86_64 only) and
# unzips frontend resources via a shell one-liner — fails silently off-platform.
os.system(
    "cd pretrained_models/CosyVoice-ttsfrd/ && "
    "pip install ttsfrd_dependency-0.1-py3-none-any.whl && "
    "pip install ttsfrd-0.4.2-cp310-cp310-linux_x86_64.whl && "
    "apt install -y unzip && "
    "rm -rf resource && "
    "unzip resource.zip -d ."
)

from cosyvoice.cli.cosyvoice import AutoModel as CosyVoiceAutoModel
from cosyvoice.utils.file_utils import logging, load_wav
from cosyvoice.utils.common import set_all_random_seed, instruct_list

# -----------------------------
# i18n & UI configuration (repeated UI text omitted for brevity; logic unchanged)
# -----------------------------
LANG_EN, LANG_ZH = "En", "Zh"  # UI language codes; keys into UI_TEXT
MODE_ZERO_SHOT, MODE_INSTRUCT = "zero_shot", "instruct"  # synthesis mode values
# ... [UI_TEXT definition omitted here; identical to the original code] ...

def t(lang: str, key: str) -> str:
    """Look up the UI string *key* for *lang*, falling back to Chinese."""
    if lang not in UI_TEXT:
        lang = LANG_ZH
    return UI_TEXT[lang][key]

def mode_choices(lang: str):
    """Return localized (label, value) pairs for the synthesis-mode radio."""
    pairs = [("mode_zero_shot", MODE_ZERO_SHOT), ("mode_instruct", MODE_INSTRUCT)]
    return [(t(lang, key), value) for key, value in pairs]

def steps_for(lang: str, mode_value: str) -> str:
    """Return the localized step-by-step help text for the selected mode."""
    if mode_value == MODE_INSTRUCT:
        key = "steps_instruct"
    else:
        key = "steps_zero_shot"
    return t(lang, key)

# -----------------------------
# Audio processing
# -----------------------------
# Prompt post-processing parameters: peak-normalization ceiling (max_val),
# silence-trim threshold in dB (top_db), and librosa frame sizes in samples.
max_val, top_db, hop_length, win_length = 0.8, 60, 220, 440

def generate_seed():
    """Return a Gradio update payload carrying a fresh random seed."""
    new_seed = random.randint(1, 100000000)
    return {"__type__": "update", "value": new_seed}

def postprocess(wav):
    """Trim, peak-normalize and pad the prompt wav, overwriting it in place.

    Loads *wav* at ``target_sr``, strips leading/trailing silence, caps the
    peak amplitude at ``max_val``, appends 0.2 s of trailing silence, saves
    the result back to the same path, and returns that path.
    """
    audio = load_wav(wav, target_sr=target_sr, min_sr=16000)
    audio, _ = librosa.effects.trim(
        audio, top_db=top_db, frame_length=win_length, hop_length=hop_length
    )
    peak = audio.abs().max()
    if peak > max_val:
        audio = audio / peak * max_val
    tail = torch.zeros(1, int(target_sr * 0.2))
    audio = torch.concat([audio, tail], dim=1)
    torchaudio.save(wav, audio, target_sr)
    return wav

# (The @spaces.GPU decorator was removed upstream for CPU-only deployment.)
def prompt_wav_recognition(prompt_wav):
    """Transcribe the prompt wav with SenseVoice ASR; '' when no wav is given."""
    if prompt_wav is None:
        return ""
    result = asr_model.generate(
        input=prompt_wav,
        language="auto",
        use_itn=True,
    )
    # Keep only the text after the last "|>" marker (drops SenseVoice tag prefix).
    raw = result[0]["text"]
    return raw.split("|>")[-1]

# (The @spaces.GPU decorator was removed upstream for CPU-only deployment.)
def generate_audio(tts_text, mode_value, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, stream, ui_lang):
    """Synthesize speech for *tts_text* in the selected mode.

    Returns a ``(sample_rate, waveform)`` tuple for Gradio. Falls back to
    silence (``default_data``) when inputs are invalid, the text exceeds
    200 characters, or the mode is unknown.
    """
    # Streaming is forced off: it can be very slow in CPU-only deployments.
    stream = False
    if len(tts_text) > 200:
        gr.Warning(t(ui_lang, "warn_too_long"))
        return (target_sr, default_data)

    speed = 1.0
    prompt_wav = prompt_wav_record if prompt_wav_upload is None else prompt_wav_upload

    # Per-mode input validation (same rules as the original flow).
    if mode_value == MODE_INSTRUCT and (instruct_text == "" or prompt_wav is None):
        return (target_sr, default_data)
    if mode_value == MODE_ZERO_SHOT and (prompt_wav is None or prompt_text == ""):
        return (target_sr, default_data)

    if mode_value == MODE_ZERO_SHOT:
        set_all_random_seed(seed)
        # CosyVoice handles torch.device selection internally.
        full_prompt = "You are a helpful assistant.<|endofprompt|>" + prompt_text
        chunks = [
            out["tts_speech"]
            for out in cosyvoice.inference_zero_shot(
                tts_text, full_prompt, postprocess(prompt_wav),
                stream=stream, speed=speed,
            )
        ]
        return (target_sr, torch.concat(chunks, dim=1).numpy().flatten())

    if mode_value == MODE_INSTRUCT:
        set_all_random_seed(seed)
        chunks = [
            out["tts_speech"]
            for out in cosyvoice.inference_instruct2(
                tts_text, instruct_text, postprocess(prompt_wav),
                stream=stream, speed=speed,
            )
        ]
        return (target_sr, torch.concat(chunks, dim=1).numpy().flatten())

    # Unknown mode: return silence.
    return (target_sr, default_data)

# ... [UI callbacks on_mode_change / on_language_change and the main() definition are omitted here; identical to the original code] ...

if __name__ == "__main__":
    # --- 修改点 3: 确保 CosyVoice 加载在 CPU ---
    cosyvoice = CosyVoiceAutoModel(
        model_dir="pretrained_models/Fun-CosyVoice3-0.5B",
        load_trt=False, # CPU 不支持 TensorRT
        fp16=False,     # CPU 必须关闭 FP16
    )
    
    prompt_sr = 16000
    target_sr = 24000
    default_data = np.zeros(target_sr)

    # --- 修改点 4: SenseVoice ASR 模型强制使用 CPU ---
    model_dir = "pretrained_models/SenseVoiceSmall"
    asr_model = AutoModel(
        model=model_dir,
        disable_update=True,
        log_level="DEBUG",
        device="cpu", # 明确指定为 cpu
    )

    # 启动 Gradio
    # main() 函数包含 demo 定义
    from main_logic import main # 假设 main 在当前脚本或调整后的结构中
    main()