Spaces:

Ryanus
/

CoquiTTS

Sleeping

App Files Files Community

Ryanus committed on Sep 21, 2025

Commit

7d2c1dc

verified ·

1 Parent(s): 1982214

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -105

app.py CHANGED Viewed

@@ -9,10 +9,10 @@ import re
 from tqdm import tqdm
 import time
-# --- Coqui TTS 授權同意 ---
 os.environ["COQUI_TOS_AGREED"] = "1"
-# --- 解決 PyTorch 2.6+ WeightsUnpickler 錯誤 ---
 try:
     import torch.serialization
     from TTS.tts.configs.xtts_config import XttsConfig
@@ -22,152 +22,166 @@ try:
     torch.serialization.add_safe_globals([
         XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs
     ])
-    print("已將 XTTS 相關配置類加入 PyTorch 安全全局變數白名單。")
 except Exception as e:
-    print(f"警告：無法將安全全局變數加入 PyTorch 白名單: {e}")
-    print("如果遇到模型載入錯誤，請檢查 PyTorch 和 TTS 庫版本。")
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"使用設備: {device}")
 tts = None
 model_load_error = None
 SUPPORTED_LANGUAGES = [
     "en", "zh-cn", "es", "fr", "de", "it", "pt", "pl", "ru", "ja", "ko", "ar", "hi", "tr",
     "nl", "sv", "da", "fi", "no", "cs", "hu", "el", "uk", "vi", "th", "id", "ms", "ro",
     "sk", "hr", "bg", "ca", "fa", "he", "ur", "bn", "gu", "kn", "ml", "mr", "pa", "ta", "te",
 ]
 DEFAULT_SPEAKER_WAV = "speaker.wav"
-SAVE_GENERATED_AUDIO_DIR = "generated_audio"
-SAVE_UPLOADED_REFERENCES_DIR = "uploaded_references"
-os.makedirs(SAVE_GENERATED_AUDIO_DIR, exist_ok=True)
-os.makedirs(SAVE_UPLOADED_REFERENCES_DIR, exist_ok=True)
 def sanitize_filename(text: str, max_len: int = 50) -> str:
     safe_text = re.sub(r'[^\w\s-]', '', text).strip()
     safe_text = re.sub(r'\s+', '_', safe_text)
     if len(safe_text) > max_len:
         safe_text = safe_text[:max_len]
     return safe_text
-# --- 載入模型 ---
 try:
     tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True).to(device)
-    print("Coqui TTS XTTS-v2 模型已成功載入。")
 except Exception as e:
-    model_load_error = f"載入 Coqui TTS XTTS-v2 模型時發生錯誤: {e}"
-def generate_speech(text, language, uploaded_speaker_audio_path):
     if model_load_error:
-        return None, f"應用程式啟動錯誤：{model_load_error}"
-    steps = [
-        "檢查模型狀態",
-        "檢查輸入",
-        "處理語音參考檔案",
-        "生成語音",
-        "儲存語音檔案",
-        "完成"
-    ]
-    for i, step in enumerate(tqdm(steps, desc="語音生成流程", ncols=80)):
-        if step == "檢查模型狀態":
-            if tts is None:
-                return None, "TTS 模型未成功載入，無法生成語音。"
-            time.sleep(0.1)
-        elif step == "檢查輸入":
-            if not text:
-                return None, "請輸入一些文字！"
-            if not language:
-                return None, "請選擇一個語言！"
-            time.sleep(0.1)
-        elif step == "處理語音參考檔案":
-            global speaker_wav_to_use
-            speaker_wav_to_use = None
-            global status_message
-            status_message = ""
-            if uploaded_speaker_audio_path:
-                speaker_wav_to_use = uploaded_speaker_audio_path
-                try:
-                    timestamp_ref = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-                    original_ext = os.path.splitext(uploaded_speaker_audio_path)[1]
-                    saved_ref_file_name = f"{timestamp_ref}_uploaded_ref{original_ext}"
-                    saved_ref_file_path = os.path.join(SAVE_UPLOADED_REFERENCES_DIR, saved_ref_file_name)
-                    shutil.copy(uploaded_speaker_audio_path, saved_ref_file_path)
-                    status_message += f"參考語音已儲存到：{saved_ref_file_path}\n"
-                except Exception as e:
-                    status_message += f"警告：儲存參考語音失敗: {e}\n"
-            else:
-                speaker_wav_to_use = DEFAULT_SPEAKER_WAV
-                if not os.path.exists(speaker_wav_to_use):
-                    return None, f"錯誤：預設語音參考檔案 ({DEFAULT_SPEAKER_WAV}) 未找到。請上傳一個檔案或確保預設檔案存在。"
-            time.sleep(0.1)
-        elif step == "生成語音":
-            global output_file
-            output_file = None
             try:
-                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
-                    output_file = fp.name
-                tts.tts_to_file(text=text, language=language, speaker_wav=speaker_wav_to_use, file_path=output_file)
             except Exception as e:
-                if output_file and os.path.exists(output_file):
-                    os.remove(output_file)
-                return None, f"生成語音失敗: {e}"
-        elif step == "儲存語音檔案":
             try:
-                timestamp_gen = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-                sanitized_text = sanitize_filename(text)
-                saved_file_name = f"{timestamp_gen}_{language}_{sanitized_text}.wav"
-                saved_file_path = os.path.join(SAVE_GENERATED_AUDIO_DIR, saved_file_name)
-                shutil.copy(output_file, saved_file_path)
-                status_message += f"語音生成成功！已儲存為：{saved_file_path}"
-            except Exception as e:
-                return None, f"儲存語音檔案失敗: {e}"
-        elif step == "完成":
-            pass
-        time.sleep(0.1)
-    return output_file, status_message
 def list_saved_audio_files():
     audio_files = []
     if os.path.exists(SAVE_GENERATED_AUDIO_DIR):
         for filename in os.listdir(SAVE_GENERATED_AUDIO_DIR):
-            if filename.lower().endswith(".wav"):
                 audio_files.append(os.path.join(SAVE_GENERATED_AUDIO_DIR, filename))
     audio_files.sort(key=os.path.getmtime, reverse=True)
     return audio_files
 def list_uploaded_reference_files():
     ref_files = []
     if os.path.exists(SAVE_UPLOADED_REFERENCES_DIR):
         for filename in os.listdir(SAVE_UPLOADED_REFERENCES_DIR):
-            if filename.lower().endswith(".wav"):
                 ref_files.append(os.path.join(SAVE_UPLOADED_REFERENCES_DIR, filename))
     ref_files.sort(key=os.path.getmtime, reverse=True)
     return ref_files
-with gr.Blocks(title="Coqui TTS XTTS-v2 語音生成") as demo:
-    gr.Markdown("# Coqui TTS XTTS-v2 語音生成 (CPU)")
-    gr.Markdown("此演示使用 CPU 運行，請注意 XTTS-v2 在 CPU 上運行會非常慢。您可以上傳自己的語音，或使用預設語音。**生成的語音和上傳的參考語音都將自動儲存到 Space 專案中。**")
-    gr.Markdown("**重要提示：** 每次儲存檔案都會觸發 Hugging Face Space 的自動重建，導致應用程式暫時不可用並重新載入模型。")
-    with gr.Tab("語音生成"):
         with gr.Row():
             with gr.Column():
-                text_input = gr.Textbox(lines=5, label="輸入文字", placeholder="請在這裡輸入你想要轉換成語音的文字...")
-                language_dropdown = gr.Dropdown(choices=SUPPORTED_LANGUAGES, label="選擇語言", value="en")
                 speaker_audio_upload = gr.Audio(
                     type="filepath",
-                    label="上傳語音參考檔案 (WAV) (可選)",
                     sources=["microphone", "upload"],
                 )
-                generate_button = gr.Button("生成語音")
             with gr.Column():
-                output_audio = gr.Audio(label="生成的語音", type="filepath")
-                status_textbox = gr.Textbox(label="狀態")
         generate_button.click(
             fn=generate_speech,
@@ -175,26 +189,27 @@ with gr.Blocks(title="Coqui TTS XTTS-v2 語音生成") as demo:
             outputs=[output_audio, status_textbox]
         )
-    with gr.Tab("查看已儲存語音"):
-        gr.Markdown("### 已儲存的生成語音檔案")
         saved_generated_files_output = gr.File(
-            label="生成的語音檔案",
             file_count="multiple",
             interactive=False
         )
-        refresh_generated_button = gr.Button("刷新生成語音列表")
         demo.load(list_saved_audio_files, outputs=[saved_generated_files_output])
         refresh_generated_button.click(list_saved_audio_files, outputs=[saved_generated_files_output])
-    with gr.Tab("查看已上傳參考語音"):
-        gr.Markdown("### 已儲存的上傳參考語音檔案")
         saved_uploaded_ref_files_output = gr.File(
-            label="上傳的參考語音檔案",
             file_count="multiple",
             interactive=False
         )
-        refresh_uploaded_ref_button = gr.Button("刷新參考語音列表")
         demo.load(list_uploaded_reference_files, outputs=[saved_uploaded_ref_files_output])
         refresh_uploaded_ref_button.click(list_uploaded_reference_files, outputs=[saved_uploaded_ref_files_output])
-demo.launch()

 from tqdm import tqdm
 import time
+# --- Coqui TTS 授权同意 ---
 os.environ["COQUI_TOS_AGREED"] = "1"
+# --- 解决 PyTorch 2.6+ WeightsUnpickler 错误 ---
 try:
     import torch.serialization
     from TTS.tts.configs.xtts_config import XttsConfig
     torch.serialization.add_safe_globals([
         XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs
     ])
+    print("已将 XTTS 相关配置类加入 PyTorch 安全全局变量白名单。")
 except Exception as e:
+    print(f"警告：无法将安全全局变量加入 PyTorch 白名单: {e}")
+    print("如果遇到模型载入错误，请检查 PyTorch 和 TTS 库版本。")
+# 设备配置
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"使用设备: {device}")
+# 目录配置
+SAVE_GENERATED_AUDIO_DIR = "generated_audio"
+SAVE_UPLOADED_REFERENCES_DIR = "uploaded_references"
+os.makedirs(SAVE_GENERATED_AUDIO_DIR, exist_ok=True)
+os.makedirs(SAVE_UPLOADED_REFERENCES_DIR, exist_ok=True)
+# 全局变量
 tts = None
 model_load_error = None
 SUPPORTED_LANGUAGES = [
     "en", "zh-cn", "es", "fr", "de", "it", "pt", "pl", "ru", "ja", "ko", "ar", "hi", "tr",
     "nl", "sv", "da", "fi", "no", "cs", "hu", "el", "uk", "vi", "th", "id", "ms", "ro",
     "sk", "hr", "bg", "ca", "fa", "he", "ur", "bn", "gu", "kn", "ml", "mr", "pa", "ta", "te",
 ]
 DEFAULT_SPEAKER_WAV = "speaker.wav"
 def sanitize_filename(text: str, max_len: int = 50) -> str:
+    """清理文本以用作安全的文件名"""
     safe_text = re.sub(r'[^\w\s-]', '', text).strip()
     safe_text = re.sub(r'\s+', '_', safe_text)
     if len(safe_text) > max_len:
         safe_text = safe_text[:max_len]
     return safe_text
+# --- 载入模型 ---
 try:
+    print("正在载入 Coqui TTS XTTS-v2 模型...")
     tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True).to(device)
+    print("Coqui TTS XTTS-v2 模型已成功载入。")
 except Exception as e:
+    model_load_error = f"载入 Coqui TTS XTTS-v2 模型时发生错误: {e}"
+    print(model_load_error)
+def generate_speech(text, language, uploaded_speaker_audio_path, progress=gr.Progress()):
+    """生成语音并保存文件"""
     if model_load_error:
+        return None, f"应用程序启动错误：{model_load_error}"
+    # 检查输入
+    if not text:
+        return None, "请输入一些文字！"
+    if not language:
+        return None, "请选择一个语言！"
+    if tts is None:
+        return None, "TTS 模型未成功载入，无法生成语音。"
+    status_message = ""
+    output_file = None
+    try:
+        # 处理语音参考文件
+        progress(0.2, desc="处理语音参考文件")
+        if uploaded_speaker_audio_path:
+            speaker_wav_to_use = uploaded_speaker_audio_path
             try:
+                timestamp_ref = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+                original_ext = os.path.splitext(uploaded_speaker_audio_path)[1]
+                saved_ref_file_name = f"{timestamp_ref}_uploaded_ref{original_ext}"
+                saved_ref_file_path = os.path.join(SAVE_UPLOADED_REFERENCES_DIR, saved_ref_file_name)
+                shutil.copy(uploaded_speaker_audio_path, saved_ref_file_path)
+                status_message += f"参考语音已保存到：{saved_ref_file_path}\n"
             except Exception as e:
+                status_message += f"警告：保存参考语音失败: {e}\n"
+        else:
+            speaker_wav_to_use = DEFAULT_SPEAKER_WAV
+            if not os.path.exists(speaker_wav_to_use):
+                return None, f"错误：默认语音参考文件 ({DEFAULT_SPEAKER_WAV}) 未找到。请上传一个文件或确保默认文件存在。"
+        # 生成语音
+        progress(0.5, desc="生成语音中...")
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+            output_file = fp.name
+        try:
+            tts.tts_to_file(text=text, language=language, speaker_wav=speaker_wav_to_use, file_path=output_file)
+        except Exception as e:
+            if output_file and os.path.exists(output_file):
+                os.remove(output_file)
+            return None, f"生成语音失败: {e}"
+        # 保存语音文件
+        progress(0.8, desc="保存语音文件")
+        try:
+            timestamp_gen = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            sanitized_text = sanitize_filename(text)
+            saved_file_name = f"{timestamp_gen}_{language}_{sanitized_text}.wav"
+            saved_file_path = os.path.join(SAVE_GENERATED_AUDIO_DIR, saved_file_name)
+            shutil.copy(output_file, saved_file_path)
+            status_message += f"语音生成成功！已保存为：{saved_file_path}"
+        except Exception as e:
+            return None, f"保存语音文件失败: {e}"
+        progress(1.0, desc="完成")
+        return output_file, status_message
+    except Exception as e:
+        # 清理临时文件
+        if output_file and os.path.exists(output_file):
             try:
+                os.remove(output_file)
+            except:
+                pass
+        return None, f"处理过程中发生错误: {str(e)}"
 def list_saved_audio_files():
+    """列出已保存的音频文件"""
     audio_files = []
     if os.path.exists(SAVE_GENERATED_AUDIO_DIR):
         for filename in os.listdir(SAVE_GENERATED_AUDIO_DIR):
+            if filename.lower().endswith((".wav", ".mp3")):
                 audio_files.append(os.path.join(SAVE_GENERATED_AUDIO_DIR, filename))
     audio_files.sort(key=os.path.getmtime, reverse=True)
     return audio_files
 def list_uploaded_reference_files():
+    """列出已上传的参考语音文件"""
     ref_files = []
     if os.path.exists(SAVE_UPLOADED_REFERENCES_DIR):
         for filename in os.listdir(SAVE_UPLOADED_REFERENCES_DIR):
+            if filename.lower().endswith((".wav", ".mp3")):
                 ref_files.append(os.path.join(SAVE_UPLOADED_REFERENCES_DIR, filename))
     ref_files.sort(key=os.path.getmtime, reverse=True)
     return ref_files
+# 创建Gradio界面
+with gr.Blocks(title="Coqui TTS XTTS-v2 语音生成") as demo:
+    gr.Markdown("# Coqui TTS XTTS-v2 语音生成")
+    gr.Markdown(f"此演示使用 {'GPU' if device == 'cuda' else 'CPU'} 运行。您可以上传自己的语音，或使用默认语音。")
+    gr.Markdown("**生成的语音和上传的参考语音都将自动保存到服务器中。**")
+    if device == "cpu":
+        gr.Markdown("⚠️ **注意：** 当前使用CPU运行，XTTS-v2在CPU上运行会较慢。")
+    with gr.Tab("语音生成"):
         with gr.Row():
             with gr.Column():
+                text_input = gr.Textbox(lines=5, label="输入文字", placeholder="请在这里输入你想要转换成语音的文字...")
+                language_dropdown = gr.Dropdown(choices=SUPPORTED_LANGUAGES, label="选择语言", value="en")
                 speaker_audio_upload = gr.Audio(
                     type="filepath",
+                    label="上传语音参考文件 (WAV/MP3) (可选)",
                     sources=["microphone", "upload"],
                 )
+                generate_button = gr.Button("生成语音")
             with gr.Column():
+                output_audio = gr.Audio(label="生成的语音", type="filepath")
+                status_textbox = gr.Textbox(label="状态")
         generate_button.click(
             fn=generate_speech,
             outputs=[output_audio, status_textbox]
         )
+    with gr.Tab("查看已保存语音"):
+        gr.Markdown("### 已保存的生成语音文件")
         saved_generated_files_output = gr.File(
+            label="生成的语音文件",
             file_count="multiple",
             interactive=False
         )
+        refresh_generated_button = gr.Button("刷新生成语音列表")
         demo.load(list_saved_audio_files, outputs=[saved_generated_files_output])
         refresh_generated_button.click(list_saved_audio_files, outputs=[saved_generated_files_output])
+    with gr.Tab("查看已上传参考语音"):
+        gr.Markdown("### 已保存的上传参考语音文件")
         saved_uploaded_ref_files_output = gr.File(
+            label="上传的参考语音文件",
             file_count="multiple",
             interactive=False
         )
+        refresh_uploaded_ref_button = gr.Button("刷新参考语音列表")
         demo.load(list_uploaded_reference_files, outputs=[saved_uploaded_ref_files_output])
         refresh_uploaded_ref_button.click(list_uploaded_reference_files, outputs=[saved_uploaded_ref_files_output])
+if __name__ == "__main__":
+    demo.launch()