import datetime
import os
import re
import shutil
import tempfile
import time

import torch
from TTS.api import TTS
import gradio as gr
from tqdm import tqdm

# --- Coqui TTS terms-of-service consent ---
os.environ["COQUI_TOS_AGREED"] = "1"

# --- Work around the PyTorch 2.6+ WeightsUnpickler error ---
# The XTTS config classes are registered as safe globals for torch
# deserialization. Failure is reported but not fatal.
try:
    import torch.serialization
    from TTS.tts.configs.xtts_config import XttsConfig
    from TTS.tts.models.xtts import XttsAudioConfig
    from TTS.config.shared_configs import BaseDatasetConfig
    from TTS.tts.models.xtts import XttsArgs

    torch.serialization.add_safe_globals([
        XttsConfig,
        XttsAudioConfig,
        BaseDatasetConfig,
        XttsArgs,
    ])
    print("已将 XTTS 相关配置类加入 PyTorch 安全全局变量白名单。")
except Exception as e:
    print(f"警告:无法将安全全局变量加入 PyTorch 白名单: {e}")
    print("如果遇到模型载入错误,请检查 PyTorch 和 TTS 库版本。")

# Device selection
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"使用设备: {device}")

# Output directories (created on import)
SAVE_GENERATED_AUDIO_DIR = "generated_audio"
SAVE_UPLOADED_REFERENCES_DIR = "uploaded_references"
os.makedirs(SAVE_GENERATED_AUDIO_DIR, exist_ok=True)
os.makedirs(SAVE_UPLOADED_REFERENCES_DIR, exist_ok=True)

# Module-level state: the loaded model and the load error message, if any
tts = None
model_load_error = None

SUPPORTED_LANGUAGES = [
    "en", "zh-cn", "es", "fr", "de", "it", "pt", "pl", "ru", "ja", "ko",
    "ar", "hi", "tr", "nl", "sv", "da", "fi", "no", "cs", "hu", "el", "uk",
    "vi", "th", "id", "ms", "ro", "sk", "hr", "bg", "ca", "fa", "he", "ur",
    "bn", "gu", "kn", "ml", "mr", "pa", "ta", "te",
]
DEFAULT_SPEAKER_WAV = "speaker.wav"

# File extensions treated as audio when listing the save directories
_AUDIO_EXTENSIONS = (".wav", ".mp3")

# Sentence boundary: ASCII terminators need trailing whitespace (so "3.14"
# is not split); CJK terminators usually have none, so zero or more is
# accepted after them.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+|(?<=[。！？])\s*")


def sanitize_filename(text: str, max_len: int = 50) -> str:
    """Turn arbitrary text into a string that is safe to use in a file name.

    Every character that is not a word character, whitespace or a hyphen is
    removed, whitespace runs become a single underscore, and the result is
    truncated to ``max_len`` characters. The result may be empty.
    """
    safe_text = re.sub(r'[^\w\s-]', '', text).strip()
    safe_text = re.sub(r'\s+', '_', safe_text)
    return safe_text[:max_len]


# --- Load the model ---
try:
    print("正在载入 Coqui TTS XTTS-v2 模型...")
    tts = TTS(
        model_name="tts_models/multilingual/multi-dataset/xtts_v2",
        progress_bar=True,
    ).to(device)

    # Optional GPU optimisations. These are best-effort: a failure here is
    # reported but must not mark the already-loaded model as unusable, so
    # the whole section lives inside its own try block.
    if device == "cuda":
        try:
            # Half precision
            tts.model.half()
            # NOTE(review): this only prints a message; nothing in this
            # file actually enables CUDA graphs.
            if hasattr(torch.cuda, "graphs"):
                print("启用 CUDA 图优化")
            # TorchScript compilation
            print("尝试编译模型...")
            tts.model = torch.jit.script(tts.model)
            print("模型编译成功")
        except Exception as e:
            print(f"模型编译失败: {e}")

    print("Coqui TTS XTTS-v2 模型已成功载入。")

    # Warm up the model with one short synthesis; failure is non-fatal.
    print("预热模型...")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as fp:
        try:
            tts.tts_to_file(
                text="Hello, this is a warm up test.",
                language="en",
                speaker_wav=(
                    DEFAULT_SPEAKER_WAV
                    if os.path.exists(DEFAULT_SPEAKER_WAV)
                    else None
                ),
                file_path=fp.name,
                speed=1.2,  # slightly faster speech for the warm-up
            )
            print("模型预热完成。")
        except Exception as e:
            print(f"模型预热失败: {e}")
except Exception as e:
    model_load_error = f"载入 Coqui TTS XTTS-v2 模型时发生错误: {e}"
    print(model_load_error)


def split_text_into_chunks(text, max_chars=200):
    """Group the sentences of ``text`` into chunks of about ``max_chars``.

    Sentences are detected after ``.``, ``!`` or ``?`` followed by
    whitespace, and after the CJK terminators ``。``, ``！`` and ``？``
    (with or without whitespace). Consecutive sentences are joined with a
    space while the running chunk stays within ``max_chars``. A single
    sentence longer than ``max_chars`` is kept whole as its own chunk.

    Returns a list of stripped chunk strings; empty or whitespace-only input
    yields an empty list.
    """
    sentences = [s for s in _SENTENCE_BOUNDARY.split(text) if s.strip()]

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


def simulate_progress(progress, start, end, steps, desc_prefix=""):
    """Report evenly spaced fake progress values from ``start`` towards ``end``.

    ``progress`` is called ``steps`` times with a value and a ``desc``
    keyword, sleeping 0.1 s after each call. The values do not reflect real
    work. Nothing happens when ``steps`` is zero or negative.
    """
    if steps <= 0:
        return
    step_size = (end - start) / steps
    for i in range(steps):
        current_progress = start + (i * step_size)
        progress(current_progress, desc=f"{desc_prefix} 步骤 {i+1}/{steps}")
        time.sleep(0.1)


def _remove_file_quietly(path):
    """Delete ``path`` if it exists, ignoring file-system errors."""
    if path and os.path.exists(path):
        try:
            os.remove(path)
        except OSError:
            pass


def generate_speech(text, language, uploaded_speaker_audio_path, speed=1.0, progress=gr.Progress()):
    """Synthesize ``text`` and archive the result.

    Args:
        text: Text to speak; empty or whitespace-only text is rejected.
        language: Language code passed to the model.
        uploaded_speaker_audio_path: Path of an uploaded reference voice, or
            a falsy value to use ``DEFAULT_SPEAKER_WAV``.
        speed: Speed value forwarded to ``tts.tts_to_file``.
        progress: Progress callback, called with a fraction and ``desc``.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the
        temporary WAV file on success and ``None`` on any failure, in which
        case ``status_message`` carries the error text.
    """
    if model_load_error:
        return None, f"应用程序启动错误:{model_load_error}"

    # Input validation
    if not text or not text.strip():
        return None, "请输入一些文字!"
    if not language:
        return None, "请选择一个语言!"
    if tts is None:
        return None, "TTS 模型未成功载入,无法生成语音。"

    status_message = ""
    output_file = None

    try:
        # Step 1: initialisation (0-5%)
        progress(0.0, desc="🚀 初始化系统")
        time.sleep(0.2)

        # Step 2: reference voice handling (5-15%)
        progress(0.05, desc="🔍 处理语音参考文件")
        time.sleep(0.3)

        if uploaded_speaker_audio_path:
            speaker_wav_to_use = uploaded_speaker_audio_path
            # Archiving the upload is best-effort: failure only adds a
            # warning to the status text.
            try:
                timestamp_ref = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                original_ext = os.path.splitext(uploaded_speaker_audio_path)[1]
                saved_ref_file_name = f"{timestamp_ref}_uploaded_ref{original_ext}"
                saved_ref_file_path = os.path.join(
                    SAVE_UPLOADED_REFERENCES_DIR, saved_ref_file_name
                )
                shutil.copy(uploaded_speaker_audio_path, saved_ref_file_path)
                status_message += f"参考语音已保存到:{saved_ref_file_path}\n"
            except Exception as e:
                status_message += f"警告:保存参考语音失败: {e}\n"
        else:
            speaker_wav_to_use = DEFAULT_SPEAKER_WAV
            if not os.path.exists(speaker_wav_to_use):
                return None, f"错误:默认语音参考文件 ({DEFAULT_SPEAKER_WAV}) 未找到。请上传一个文件或确保默认文件存在。"

        # Step 3: text preprocessing (15-25%)
        progress(0.15, desc="📝 文本预处理")
        time.sleep(0.2)

        # NOTE(review): the chunks are only counted for the status message;
        # synthesis below is still called once with the full text.
        text_chunks = split_text_into_chunks(text)
        if len(text_chunks) > 1:
            status_message += f"文本已分割为 {len(text_chunks)} 个块进行处理\n"

        # Steps 4-6: simulated progress only, no real work happens here.
        progress(0.25, desc="🔤 文本编码")
        simulate_progress(progress, 0.25, 0.40, 5, "🔤 文本编码")

        progress(0.40, desc="🎵 声学模型处理")
        simulate_progress(progress, 0.40, 0.70, 10, "🎵 声学模型处理")

        progress(0.70, desc="🔊 声码器处理")
        simulate_progress(progress, 0.70, 0.85, 8, "🔊 声码器处理")

        # Step 7: the actual synthesis (85-90%)
        progress(0.85, desc="🎙️ 生成音频波形")

        # Reserve a temp file name; the handle is closed before the model
        # writes to the path. delete=False because the path is returned.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            output_file = fp.name

        try:
            tts.tts_to_file(
                text=text,
                language=language,
                speaker_wav=speaker_wav_to_use,
                file_path=output_file,
                speed=speed,
            )
        except Exception as e:
            _remove_file_quietly(output_file)
            return None, f"生成语音失败: {e}"

        # Step 8: post-processing placeholder (90-95%)
        progress(0.90, desc="🔧 音频后处理")
        time.sleep(0.2)

        # Step 9: archive the generated audio (95-100%)
        progress(0.95, desc="💾 保存语音文件")
        time.sleep(0.2)

        try:
            timestamp_gen = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            sanitized_text = sanitize_filename(text)
            saved_file_name = f"{timestamp_gen}_{language}_{sanitized_text}.wav"
            saved_file_path = os.path.join(SAVE_GENERATED_AUDIO_DIR, saved_file_name)
            shutil.copy(output_file, saved_file_path)
            status_message += f"语音生成成功!已保存为:{saved_file_path}"
        except Exception as e:
            # The temp file is not returned on this path, so remove it
            # instead of leaking it.
            _remove_file_quietly(output_file)
            return None, f"保存语音文件失败: {e}"

        # Step 10: done (100%)
        progress(1.0, desc="✅ 完成")
        time.sleep(0.1)

        return output_file, status_message

    except Exception as e:
        # Clean up the temp file on any unexpected failure
        _remove_file_quietly(output_file)
        return None, f"处理过程中发生错误: {str(e)}"


def _list_audio_files(directory):
    """Return the .wav/.mp3 paths directly inside ``directory``, newest first.

    Returns an empty list when ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        return []
    audio_files = [
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.lower().endswith(_AUDIO_EXTENSIONS)
    ]
    audio_files.sort(key=os.path.getmtime, reverse=True)
    return audio_files


def list_saved_audio_files():
    """List the generated audio files, most recently modified first."""
    return _list_audio_files(SAVE_GENERATED_AUDIO_DIR)


def list_uploaded_reference_files():
    """List the archived reference voice files, most recently modified first."""
    return _list_audio_files(SAVE_UPLOADED_REFERENCES_DIR)


# Custom CSS. Written as adjacent literals (each starting with one space) so
# the runtime value is a single space-separated line of rules.
custom_css = (
    " .grapheme-progress {"
    " background: linear-gradient(to right, #4A90E2 0%, #7B68EE 100%);"
    " border-radius: 10px; height: 24px; position: relative;"
    " overflow: hidden;"
    " box-shadow: inset 0 2px 4px rgba(0,0,0,0.2); }"
    ' .grapheme-progress::before { content: "";'
    " position: absolute; top: 0; left: 0; height: 100%; width: 100%;"
    " background: linear-gradient(45deg, rgba(255,255,255,0.2) 25%,"
    " transparent 25%, transparent 50%, rgba(255,255,255,0.2) 50%,"
    " rgba(255,255,255,0.2) 75%, transparent 75%, transparent);"
    " background-size: 20px 20px;"
    " animation: move 1s linear infinite; }"
    " @keyframes move { 0% { background-position: 0 0; }"
    " 100% { background-position: 20px 20px; } }"
    " .progress-container { margin: 20px 0; padding: 15px;"
    " border-radius: 10px; background-color: #f0f4ff;"
    " border: 1px solid #c5d9ff;"
    " box-shadow: 0 4px 8px rgba(0,0,0,0.1); }"
    " .status-log { font-family: 'Courier New', monospace;"
    " background-color: #2c3e50; color: #ecf0f1; padding: 10px;"
    " border-radius: 5px; height: 120px; overflow-y: auto;"
    " white-space: pre-wrap; border: 1px solid #34495e;"
    " box-shadow: inset 0 2px 4px rgba(0,0,0,0.5); }"
    " .status-log::-webkit-scrollbar { width: 8px; }"
    " .status-log::-webkit-scrollbar-track { background: #1e272e; }"
    " .status-log::-webkit-scrollbar-thumb { background: #7f8c8d;"
    " border-radius: 4px; }"
    " .status-log::-webkit-scrollbar-thumb:hover { background: #95a5a6; }"
    " .tab-header { background-color: #4A90E2 !important;"
    " color: white !important; font-weight: bold !important;"
    " border-radius: 10px 10px 0 0 !important; }"
    " .tab-content { background-color: #f0f4ff !important;"
    " border: 1px solid #c5d9ff !important;"
    " border-radius: 0 0 10px 10px !important;"
    " padding: 15px !important; }"
    " .generate-button {"
    " background: linear-gradient(to right, #4A90E2, #7B68EE) !important;"
    " color: white !important; font-weight: bold !important;"
    " border: none !important; border-radius: 8px !important;"
    " padding: 10px 20px !important;"
    " box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important;"
    " transition: all 0.3s ease !important; }"
    " .generate-button:hover { transform: translateY(-2px) !important;"
    " box-shadow: 0 6px 12px rgba(0,0,0,0.3) !important; }"
    " .generate-button:active { transform: translateY(1px) !important;"
    " box-shadow: 0 2px 4px rgba(0,0,0,0.2) !important; }"
    " .progress-text { font-size: 0.9em; color: #34495e;"
    " margin-top: 5px; text-align: center; }"
    " "
)

# Build the Gradio interface
with gr.Blocks(
    title="Coqui TTS XTTS-v2 语音生成 (Grapheme进度条)",
    css=custom_css,
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="purple",
        neutral_hue="gray",
        text_size="lg",
    ),
) as demo:
    gr.Markdown("# 🎙️ Coqui TTS XTTS-v2 语音生成 (Grapheme进度条)")
gr.Markdown(f"此演示使用 {'🖥️ GPU' if device == 'cuda' else '💻 CPU'} 运行。您可以上传自己的语音,或使用默认语音。") gr.Markdown("**生成的语音和上传的参考语音都将自动保存到服务器中。**") if device == "cpu": gr.Markdown("⚠️ **注意:** 当前使用CPU运行,XTTS-v2在CPU上运行会较慢。建议使用GPU以获得最佳性能。") else: gr.Markdown("✅ **GPU加速已启用** - 使用以下优化技术:半精度浮点数、模型编译") with gr.Tab("语音生成"): with gr.Row(): with gr.Column(): text_input = gr.Textbox(lines=5, label="输入文字", placeholder="请在这里输入你想要转换成语音的文字...") language_dropdown = gr.Dropdown(choices=SUPPORTED_LANGUAGES, label="选择语言", value="en") speaker_audio_upload = gr.Audio( type="filepath", label="上传语音参考文件 (WAV/MP3) (可选)", sources=["microphone", "upload"], ) with gr.Row(): speed_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.2, label="语速 (1.0为正常,>1.0加快)") generate_button = gr.Button("生成语音", elem_classes="generate-button") with gr.Column(): output_audio = gr.Audio(label="生成的语音", type="filepath") status_textbox = gr.Textbox(label="状态", elem_classes="status-log") progress_html = gr.HTML("""