|
|
import torch |
|
|
from TTS.api import TTS |
|
|
import gradio as gr |
|
|
import os |
|
|
import tempfile |
|
|
import datetime |
|
|
import shutil |
|
|
import re |
|
|
import time |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
# Auto-accept the Coqui CPML license prompt so model download/loading
# does not block waiting for interactive confirmation.
os.environ["COQUI_TOS_AGREED"] = "1"

# Allow-list the XTTS config classes for torch checkpoint deserialization.
# NOTE(review): presumably needed because newer PyTorch defaults torch.load
# to weights_only=True, which rejects arbitrary pickled classes — confirm
# against the installed torch version.
try:
    import torch.serialization
    from TTS.tts.configs.xtts_config import XttsConfig
    from TTS.tts.models.xtts import XttsAudioConfig
    from TTS.config.shared_configs import BaseDatasetConfig
    from TTS.tts.models.xtts import XttsArgs
    torch.serialization.add_safe_globals([
        XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs
    ])
    print("已将 XTTS 相关配置类加入 PyTorch 安全全局变量白名单。")
except Exception as e:
    # Best-effort: older torch versions lack add_safe_globals; warn and continue.
    print(f"警告:无法将安全全局变量加入 PyTorch 白名单: {e}")
    print("如果遇到模型载入错误,请检查 PyTorch 和 TTS 库版本。")

# Prefer GPU when available; the rest of the script branches on this string.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"使用设备: {device}")
|
|
|
|
|
|
|
|
# Output directories, created eagerly so later saves never fail on a missing dir.
SAVE_GENERATED_AUDIO_DIR = "generated_audio"        # synthesized speech files
SAVE_UPLOADED_REFERENCES_DIR = "uploaded_references"  # user-uploaded voice refs
os.makedirs(SAVE_GENERATED_AUDIO_DIR, exist_ok=True)
os.makedirs(SAVE_UPLOADED_REFERENCES_DIR, exist_ok=True)

# Populated by the model-loading section below; generate_speech() checks both
# before doing any work.
tts = None
model_load_error = None

# Language codes offered in the UI dropdown.
# NOTE(review): XTTS-v2 officially supports roughly 17 languages; many of the
# codes listed here (e.g. "sv", "th", "ta") may be rejected by the model at
# synthesis time — confirm against the installed TTS version.
SUPPORTED_LANGUAGES = [
    "en", "zh-cn", "es", "fr", "de", "it", "pt", "pl", "ru", "ja", "ko", "ar", "hi", "tr",
    "nl", "sv", "da", "fi", "no", "cs", "hu", "el", "uk", "vi", "th", "id", "ms", "ro",
    "sk", "hr", "bg", "ca", "fa", "he", "ur", "bn", "gu", "kn", "ml", "mr", "pa", "ta", "te",
]

# Fallback reference voice used when the user uploads nothing; expected to sit
# next to this script.
DEFAULT_SPEAKER_WAV = "speaker.wav"
|
|
|
|
|
def sanitize_filename(text: str, max_len: int = 50) -> str:
    """Reduce arbitrary text to a filesystem-safe file-name fragment.

    Keeps word characters, whitespace and hyphens, collapses whitespace runs
    into single underscores, and truncates to *max_len* characters.
    """
    # Strip everything except word chars, whitespace and hyphens, then trim.
    cleaned = re.sub(r'[^\w\s-]', '', text).strip()
    # Collapse any whitespace run into one underscore.
    cleaned = re.sub(r'\s+', '_', cleaned)
    # Unconditional slice: a no-op when the text is already short enough.
    return cleaned[:max_len]
|
|
|
|
|
|
|
|
# Load XTTS-v2 once at import time; on failure, record the error so the UI can
# report it instead of crashing.
try:
    print("正在载入 Coqui TTS XTTS-v2 模型...")
    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True).to(device)

    if device == "cuda":
        # Half precision to cut VRAM and speed up inference.
        # NOTE(review): fp16 on XTTS can degrade quality or fail on some ops — confirm.
        tts.model.half()

        # This check only prints; no CUDA-graph capture is actually performed here.
        if hasattr(torch.cuda, "graphs"):
            print("启用 CUDA 图优化")

        # TorchScript compilation is best-effort; XTTS is not generally
        # scriptable, so the except branch is the likely path.
        try:
            print("尝试编译模型...")
            tts.model = torch.jit.script(tts.model)
            print("模型编译成功")
        except Exception as e:
            print(f"模型编译失败: {e}")

    print("Coqui TTS XTTS-v2 模型已成功载入。")

    # Warm-up synthesis so the first real request doesn't pay one-time costs
    # (kernel compilation, caches). Output goes to a throwaway temp file.
    print("预热模型...")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as fp:
        try:
            tts.tts_to_file(
                text="Hello, this is a warm up test.",
                language="en",
                # Use the default reference voice only if it exists on disk.
                speaker_wav=DEFAULT_SPEAKER_WAV if os.path.exists(DEFAULT_SPEAKER_WAV) else None,
                file_path=fp.name,
                speed=1.2
            )
            print("模型预热完成。")
        except Exception as e:
            # Warm-up failure is non-fatal; real requests may still succeed.
            print(f"模型预热失败: {e}")

except Exception as e:
    # generate_speech() checks model_load_error and surfaces it to the user.
    model_load_error = f"载入 Coqui TTS XTTS-v2 模型时发生错误: {e}"
    print(model_load_error)
|
|
|
|
|
def split_text_into_chunks(text, max_chars=200):
    """Split long text into chunks of at most *max_chars* characters.

    Splits on sentence boundaries (., !, ? followed by whitespace) and packs
    consecutive sentences into chunks. A single sentence longer than
    *max_chars* is hard-split at the character limit so the bound always
    holds (the original implementation emitted it as one oversized chunk).

    Args:
        text: The text to split.
        max_chars: Upper bound on each chunk's length.

    Returns:
        List of chunk strings.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        # Hard-split any sentence that alone exceeds the limit; flush the
        # partial chunk first so ordering is preserved.
        while len(sentence) > max_chars:
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            chunks.append(sentence[:max_chars])
            sentence = sentence[max_chars:]

        if len(current_chunk) + len(sentence) <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
|
|
|
|
|
def simulate_progress(progress, start, end, steps, desc_prefix=""):
    """Advance the progress callback from *start* toward *end* in *steps*
    evenly spaced increments, sleeping 0.1s between updates to animate the bar.
    """
    increment = (end - start) / steps
    for step_index in range(steps):
        progress(
            start + step_index * increment,
            desc=f"{desc_prefix} 步骤 {step_index + 1}/{steps}",
        )
        # Small pause so the UI visibly animates between updates.
        time.sleep(0.1)
|
|
|
|
|
def generate_speech(text, language, uploaded_speaker_audio_path, speed=1.0, progress=gr.Progress()):
    """Synthesize speech from *text* and persist both output and reference audio.

    Args:
        text: Text to synthesize.
        language: Language code (one of SUPPORTED_LANGUAGES).
        uploaded_speaker_audio_path: Optional filepath of an uploaded reference
            voice; falls back to DEFAULT_SPEAKER_WAV when empty.
        speed: Speed factor forwarded to XTTS.
        progress: Gradio progress tracker (the gr.Progress() default is
            Gradio's documented injection idiom, not a shared mutable default).

    Returns:
        Tuple (audio_filepath, status_message); audio_filepath is None on error.
    """
    # Startup failure recorded at module load time — bail out early.
    if model_load_error:
        return None, f"应用程序启动错误:{model_load_error}"
    if not text:
        return None, "请输入一些文字!"
    if not language:
        return None, "请选择一个语言!"
    if tts is None:
        return None, "TTS 模型未成功载入,无法生成语音。"

    status_message = ""
    output_file = None

    try:
        # The progress stages below (up to 0.85 and from 0.90 on) are purely
        # cosmetic: simulate_progress only sleeps and updates the bar; the real
        # synthesis happens in the single tts_to_file call further down.
        progress(0.0, desc="🚀 初始化系统")
        time.sleep(0.2)

        progress(0.05, desc="🔍 处理语音参考文件")
        time.sleep(0.3)

        if uploaded_speaker_audio_path:
            speaker_wav_to_use = uploaded_speaker_audio_path
            # Archive the uploaded reference; failure here is non-fatal.
            try:
                timestamp_ref = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                original_ext = os.path.splitext(uploaded_speaker_audio_path)[1]
                saved_ref_file_name = f"{timestamp_ref}_uploaded_ref{original_ext}"
                saved_ref_file_path = os.path.join(SAVE_UPLOADED_REFERENCES_DIR, saved_ref_file_name)
                shutil.copy(uploaded_speaker_audio_path, saved_ref_file_path)
                status_message += f"参考语音已保存到:{saved_ref_file_path}\n"
            except Exception as e:
                status_message += f"警告:保存参考语音失败: {e}\n"
        else:
            # No upload: require the bundled default reference voice.
            speaker_wav_to_use = DEFAULT_SPEAKER_WAV
            if not os.path.exists(speaker_wav_to_use):
                return None, f"错误:默认语音参考文件 ({DEFAULT_SPEAKER_WAV}) 未找到。请上传一个文件或确保默认文件存在。"

        progress(0.15, desc="📝 文本预处理")
        time.sleep(0.2)

        # NOTE(review): the chunking result only feeds the status message —
        # the full, unchunked text is what gets passed to tts_to_file below.
        text_chunks = split_text_into_chunks(text)
        if len(text_chunks) > 1:
            status_message += f"文本已分割为 {len(text_chunks)} 个块进行处理\n"

        progress(0.25, desc="🔤 文本编码")
        simulate_progress(progress, 0.25, 0.40, 5, "🔤 文本编码")

        progress(0.40, desc="🎵 声学模型处理")
        simulate_progress(progress, 0.40, 0.70, 10, "🎵 声学模型处理")

        progress(0.70, desc="🔊 声码器处理")
        simulate_progress(progress, 0.70, 0.85, 8, "🔊 声码器处理")

        progress(0.85, desc="🎙️ 生成音频波形")

        # Reserve a temp path for the synthesized wav; delete=False because the
        # path is returned to Gradio after the with-block closes the handle.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            output_file = fp.name

        try:
            # The actual (blocking) synthesis call.
            tts.tts_to_file(
                text=text,
                language=language,
                speaker_wav=speaker_wav_to_use,
                file_path=output_file,
                speed=speed
            )
        except Exception as e:
            # Clean up the reserved temp file on synthesis failure.
            if output_file and os.path.exists(output_file):
                os.remove(output_file)
            return None, f"生成语音失败: {e}"

        progress(0.90, desc="🔧 音频后处理")
        time.sleep(0.2)

        progress(0.95, desc="💾 保存语音文件")
        time.sleep(0.2)

        # Copy the result into the persistent output directory with a
        # timestamped, sanitized name.
        try:
            timestamp_gen = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            sanitized_text = sanitize_filename(text)
            saved_file_name = f"{timestamp_gen}_{language}_{sanitized_text}.wav"
            saved_file_path = os.path.join(SAVE_GENERATED_AUDIO_DIR, saved_file_name)
            shutil.copy(output_file, saved_file_path)
            status_message += f"语音生成成功!已保存为:{saved_file_path}"
        except Exception as e:
            # NOTE(review): the temp output_file is leaked on this path.
            return None, f"保存语音文件失败: {e}"

        progress(1.0, desc="✅ 完成")
        time.sleep(0.1)

        # The temp file (not the archived copy) is what the player receives.
        return output_file, status_message

    except Exception as e:
        # Catch-all boundary for the whole pipeline: best-effort temp cleanup,
        # then report the error to the UI.
        if output_file and os.path.exists(output_file):
            try:
                os.remove(output_file)
            except:
                pass
        return None, f"处理过程中发生错误: {str(e)}"
|
|
|
|
|
def list_saved_audio_files():
    """Return paths of saved generated audio files, newest first."""
    directory = SAVE_GENERATED_AUDIO_DIR
    if not os.path.exists(directory):
        return []
    # Collect .wav/.mp3 entries (case-insensitive), then sort by mtime.
    matches = [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.lower().endswith((".wav", ".mp3"))
    ]
    matches.sort(key=os.path.getmtime, reverse=True)
    return matches
|
|
|
|
|
def list_uploaded_reference_files():
    """Return paths of saved uploaded reference voices, newest first."""
    directory = SAVE_UPLOADED_REFERENCES_DIR
    if not os.path.exists(directory):
        return []
    # Collect .wav/.mp3 entries (case-insensitive), then sort by mtime.
    matches = [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.lower().endswith((".wav", ".mp3"))
    ]
    matches.sort(key=os.path.getmtime, reverse=True)
    return matches
|
|
|
|
|
|
|
|
# CSS injected into the Gradio Blocks app: the animated striped "grapheme"
# progress bar, its container, the dark terminal-style status log (with custom
# scrollbar), tab chrome, and the gradient generate button.
custom_css = """
.grapheme-progress {
    background: linear-gradient(to right, #4A90E2 0%, #7B68EE 100%);
    border-radius: 10px;
    height: 24px;
    position: relative;
    overflow: hidden;
    box-shadow: inset 0 2px 4px rgba(0,0,0,0.2);
}

.grapheme-progress::before {
    content: "";
    position: absolute;
    top: 0;
    left: 0;
    height: 100%;
    width: 100%;
    background: linear-gradient(45deg,
        rgba(255,255,255,0.2) 25%,
        transparent 25%,
        transparent 50%,
        rgba(255,255,255,0.2) 50%,
        rgba(255,255,255,0.2) 75%,
        transparent 75%,
        transparent);
    background-size: 20px 20px;
    animation: move 1s linear infinite;
}

@keyframes move {
    0% { background-position: 0 0; }
    100% { background-position: 20px 20px; }
}

.progress-container {
    margin: 20px 0;
    padding: 15px;
    border-radius: 10px;
    background-color: #f0f4ff;
    border: 1px solid #c5d9ff;
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}

.status-log {
    font-family: 'Courier New', monospace;
    background-color: #2c3e50;
    color: #ecf0f1;
    padding: 10px;
    border-radius: 5px;
    height: 120px;
    overflow-y: auto;
    white-space: pre-wrap;
    border: 1px solid #34495e;
    box-shadow: inset 0 2px 4px rgba(0,0,0,0.5);
}

.status-log::-webkit-scrollbar {
    width: 8px;
}

.status-log::-webkit-scrollbar-track {
    background: #1e272e;
}

.status-log::-webkit-scrollbar-thumb {
    background: #7f8c8d;
    border-radius: 4px;
}

.status-log::-webkit-scrollbar-thumb:hover {
    background: #95a5a6;
}

.tab-header {
    background-color: #4A90E2 !important;
    color: white !important;
    font-weight: bold !important;
    border-radius: 10px 10px 0 0 !important;
}

.tab-content {
    background-color: #f0f4ff !important;
    border: 1px solid #c5d9ff !important;
    border-radius: 0 0 10px 10px !important;
    padding: 15px !important;
}

.generate-button {
    background: linear-gradient(to right, #4A90E2, #7B68EE) !important;
    color: white !important;
    font-weight: bold !important;
    border: none !important;
    border-radius: 8px !important;
    padding: 10px 20px !important;
    box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important;
    transition: all 0.3s ease !important;
}

.generate-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 12px rgba(0,0,0,0.3) !important;
}

.generate-button:active {
    transform: translateY(1px) !important;
    box-shadow: 0 2px 4px rgba(0,0,0,0.2) !important;
}

.progress-text {
    font-size: 0.9em;
    color: #34495e;
    margin-top: 5px;
    text-align: center;
}
"""
|
|
|
|
|
|
|
|
# Declarative UI: three tabs — synthesis, browse generated audio, browse
# uploaded reference voices. Building this at module level so `demo` exists
# for the __main__ guard below.
with gr.Blocks(
    title="Coqui TTS XTTS-v2 语音生成 (Grapheme进度条)",
    css=custom_css,
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="purple",
        neutral_hue="gray",
        text_size="lg",
    )
) as demo:
    gr.Markdown("# 🎙️ Coqui TTS XTTS-v2 语音生成 (Grapheme进度条)")
    gr.Markdown(f"此演示使用 {'🖥️ GPU' if device == 'cuda' else '💻 CPU'} 运行。您可以上传自己的语音,或使用默认语音。")
    gr.Markdown("**生成的语音和上传的参考语音都将自动保存到服务器中。**")

    # Banner reflecting the device chosen at startup.
    if device == "cpu":
        gr.Markdown("⚠️ **注意:** 当前使用CPU运行,XTTS-v2在CPU上运行会较慢。建议使用GPU以获得最佳性能。")
    else:
        gr.Markdown("✅ **GPU加速已启用** - 使用以下优化技术:半精度浮点数、模型编译")

    # --- Tab 1: synthesis -------------------------------------------------
    with gr.Tab("语音生成"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(lines=5, label="输入文字", placeholder="请在这里输入你想要转换成语音的文字...")
                language_dropdown = gr.Dropdown(choices=SUPPORTED_LANGUAGES, label="选择语言", value="en")
                # Optional reference voice: record via microphone or upload a file.
                speaker_audio_upload = gr.Audio(
                    type="filepath",
                    label="上传语音参考文件 (WAV/MP3) (可选)",
                    sources=["microphone", "upload"],
                )
                with gr.Row():
                    speed_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.2, label="语速 (1.0为正常,>1.0加快)")
                    generate_button = gr.Button("生成语音", elem_classes="generate-button")
            with gr.Column():
                output_audio = gr.Audio(label="生成的语音", type="filepath")
                status_textbox = gr.Textbox(label="状态", elem_classes="status-log")
                # Static HTML shell for the custom progress bar styled by custom_css.
                progress_html = gr.HTML("""
                <div class="progress-container">
                    <div class="grapheme-progress" style="width: 0%;" id="custom-progress"></div>
                    <div class="progress-text" id="progress-text">等待开始...</div>
                </div>
                """)

        generate_button.click(
            fn=generate_speech,
            inputs=[text_input, language_dropdown, speaker_audio_upload, speed_slider],
            outputs=[output_audio, status_textbox]
        )

    # --- Tab 2: browse generated audio ------------------------------------
    with gr.Tab("查看已保存语音"):
        gr.Markdown("### 已保存的生成语音文件")
        saved_generated_files_output = gr.File(
            label="生成的语音文件",
            file_count="multiple",
            interactive=False
        )
        refresh_generated_button = gr.Button("刷新生成语音列表")
        # Populate on page load and on explicit refresh.
        demo.load(list_saved_audio_files, outputs=[saved_generated_files_output])
        refresh_generated_button.click(list_saved_audio_files, outputs=[saved_generated_files_output])

    # --- Tab 3: browse uploaded reference voices ---------------------------
    with gr.Tab("查看已上传参考语音"):
        gr.Markdown("### 已保存的上传参考语音文件")
        saved_uploaded_ref_files_output = gr.File(
            label="上传的参考语音文件",
            file_count="multiple",
            interactive=False
        )
        refresh_uploaded_ref_button = gr.Button("刷新参考语音列表")
        # Populate on page load and on explicit refresh.
        demo.load(list_uploaded_reference_files, outputs=[saved_uploaded_ref_files_output])
        refresh_uploaded_ref_button.click(list_uploaded_reference_files, outputs=[saved_uploaded_ref_files_output])
|
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server; blocks until shutdown.
    demo.launch()