Spaces:

tonyshark
/

styletts2

Runtime error

App Files Files Community

tonyshark committed on Sep 17, 2025

Commit

64749f5

verified ·

1 Parent(s): 7f65a42

Update app.py

Browse files

Files changed (1) hide show

app.py +311 -105

app.py CHANGED Viewed

@@ -1,127 +1,333 @@
-# app.py
-import gradio as gr
-import torch
 import os
 import numpy as np
 import soundfile as sf
-import time
 from model import StyleTTModel
 SPEAKER_WAV_PATH = "speakers/example_female.wav"
 OUTPUT_FILENAME = "output.wav"
 SAMPLE_RATE = 24000
-# Check that the reference speaker recording exists before building the model.
-if not os.path.exists(SPEAKER_WAV_PATH):
-    raise FileNotFoundError(f"Không tìm thấy file giọng nói tham chiếu tại: {SPEAKER_WAV_PATH}. "
-                            "Vui lòng tạo thư mục và đặt file .wav của bạn vào đó.")
-print("Bắt đầu khởi tạo StyleTTS2 Model...")
-model = StyleTTModel(speaker_wav=SPEAKER_WAV_PATH)
-print("Đang tải model StyleTTS2. Quá trình này có thể mất vài phút...")
-# Time the load so the startup log reports how long it took.
-start_time = time.time()
-model.load()
-end_time = time.time()
-print(f"Model đã được tải thành công sau {end_time - start_time:.2f} giây.")
-def process_expressive_tags(text: str) -> str:
-    """
-    Replace expressive tags such as <laugh> with bracketed descriptive phrases.
-
-    Each key of ``tag_map`` found in ``text`` is substituted with its mapped
-    phrase; the original author intended these phrases to steer the tone of
-    the StyleTTS2 voice. The processed text is printed and returned.
-    """
-    # Literal tag -> replacement phrase.
-    tag_map = {
-        "<laugh>": "[laughing]",
-        "<whisper>": "[whispering]",
-        "<sigh>": "[sighs]",
-        "<cry>": "[crying]",
-        "<naughty>": "[mischievous tone]",
-        "<giggle>": "[giggling]",
-        "<tease>": "[teasingly]",
-        "<smirk>": "[sly tone]",
-        "<surprise>": "[surprised tone]",
-        "<romantic>": "[romantic tone]",
-        "<shy>": "[shyly]",
-        "<excited>": "[excitedly]",
-        "<shock>": "[shocked tone]",
-        "<curious>": "[curious tone]",
-        "<discover>": "[discovering tone]",
-        "<blush>": "[blushing voice]"
-    }
-    processed_text = text
-    # Plain substring replacement; text without any known tag passes through unchanged.
-    for tag, style_prompt in tag_map.items():
-        processed_text = processed_text.replace(tag, style_prompt)
-    print(f"Văn bản sau khi xử lý tag: '{processed_text}'")
-    return processed_text
-# --- Part 3: Main speech-synthesis function ---
-def generate_speech(text: str, speed: float):
-    """
-    Gradio callback that turns text into an audio file.
-
-    Returns ``None`` for empty text; otherwise rewrites the expressive tags,
-    synthesizes the result with the global ``model``, writes it to
-    ``OUTPUT_FILENAME`` and returns that path.
-    """
-    if not text:
-        return None
-    print(f"Nhận được văn bản: '{text}' với tốc độ {speed}")
-    # Step 1: rewrite the expressive tags.
-    styled_text = process_expressive_tags(text)
-    # Step 2: synthesize the audio with the model.
-    # Per the original author's note, synthesize returns a numpy array.
-    audio_array = model.synthesize(styled_text, speed=speed)
-    # The same output path is reused on every call, so each request overwrites the previous file.
-    sf.write(OUTPUT_FILENAME, audio_array, SAMPLE_RATE, 'FLOAT')
-    print(f"Đã tạo file âm thanh thành công tại: {OUTPUT_FILENAME}")
-    return OUTPUT_FILENAME
-# Gradio UI: text box and speed slider feeding generate_speech, with an audio player for the result.
-with gr.Blocks() as demo:
-    gr.Markdown(
-        """
-        # 🎙️ Demo StyleTTS2 Lite với Tag biểu cảm
-        Nhập văn bản vào ô bên dưới. Bạn có thể sử dụng các tag như `<laugh>`, `<whisper>`, `<sigh>`
-        để thêm cảm xúc cho giọng nói. Điều chỉnh thanh trượt để thay đổi tốc độ nói.
-        """
-    )
-    with gr.Row():
-        text_input = gr.Textbox(
-            label="Nhập văn bản ở đây",
-            placeholder="Ví dụ: Chào bạn, tôi là một trợ lý ảo. <laugh> Thật vui được gặp bạn!",
-            lines=4
-        )
-    speed_slider = gr.Slider(
-        minimum=0.5,
-        maximum=2.0,
-        value=1.0,
-        step=0.1,
-        label="Tốc độ nói (Speed)"
-    )
-    generate_button = gr.Button("Tạo giọng nói 🎙️", variant="primary")
-    audio_output = gr.Audio(label="Kết quả", type="filepath")
-    # Wire the components together.
-    generate_button.click(
-        fn=generate_speech,
-        inputs=[text_input, speed_slider],
-        outputs=audio_output
-    )
-    gr.Markdown("### Các tag biểu cảm được hỗ trợ:")
-    # NOTE(review): the list below advertises `<surprised>`, but the tag map in
-    # process_expressive_tags only recognizes `<surprise>`.
-    gr.Markdown("- `<laugh>`: Tiếng cười\n- `<whisper>`: Thì thầm\n- `<sigh>`: Thở dài\n- `<cry>`: Khóc\n- `<giggle>`: Cười khúc khích\n- `<tease>`: Trêu chọc\n- `<surprised>`: Ngạc nhiên\n- Và nhiều tag khác trong mã nguồn...")
-# --- Part 5: Run the application ---
 if __name__ == "__main__":
-    # Sharing (share=True) would create a temporary public link; it is turned off here.
-    demo.launch(share=False)

 import os
+import re
+import time
 import numpy as np
 import soundfile as sf
+import matplotlib.pyplot as plt
+import librosa
+import gradio as gr
+from scipy.signal import fftconvolve
 from model import StyleTTModel
 SPEAKER_WAV_PATH = "speakers/example_female.wav"
 OUTPUT_FILENAME = "output.wav"
 SAMPLE_RATE = 24000
+# Shared StyleTTS model instance; stays None until initialize_model() succeeds.
+model = None
+def initialize_model():
+    """Create and load the global StyleTTS model.
+
+    Verifies that the reference speaker file at ``SPEAKER_WAV_PATH`` exists,
+    builds a ``StyleTTModel`` from it, calls its ``load()`` method and logs
+    how long loading took.
+
+    Returns:
+        True when the model was loaded. False when anything failed, in which
+        case the error is printed and the global ``model`` is reset to None.
+    """
+    global model
+    try:
+        # Fail early when the reference speaker recording is missing.
+        if not os.path.exists(SPEAKER_WAV_PATH):
+            raise FileNotFoundError(f"Không tìm thấy file giọng nói tham chiếu tại: {SPEAKER_WAV_PATH}. "
+                                    "Vui lòng tạo thư mục và đặt file .wav của bạn vào đó.")
+        print("Bắt đầu khởi tạo StyleTTS2 Model...")
+        model = StyleTTModel(speaker_wav=SPEAKER_WAV_PATH)
+        print("Đang tải model StyleTTS2. Quá trình này có thể mất vài phút...")
+        load_started = time.time()
+        model.load()
+        elapsed = time.time() - load_started
+        print(f"Model đã được tải thành công sau {elapsed:.2f} giây.")
+    except Exception as exc:
+        # Startup boundary: report the problem and keep the app importable.
+        print(f"Lỗi khi khởi tạo model: {exc}")
+        model = None
+        return False
+    return True
+# Load the model once at import time; the UI reads this flag for its status banner.
+model_loaded = initialize_model()
+# ---------------------------
+# Output sample rate
+# ---------------------------
+# Sample rate used by the audio helpers below and returned alongside the audio arrays.
+# NOTE: an earlier Hugging Face pipeline loader (hexgrad/styletts2) is disabled;
+# synthesis goes through StyleTTModel instead.
+SR_OUT = 24000
+# ---------------------------
+# Audio helpers
+# ---------------------------
+def load_wav(path, sr_target=SR_OUT):
+    """Read an audio file as float32 samples at ``sr_target``.
+
+    Multi-dimensional data is averaged over axis 1, and the signal is
+    resampled with librosa when the file's rate differs from ``sr_target``.
+
+    Returns:
+        A ``(samples, sample_rate)`` tuple.
+    """
+    samples, rate = sf.read(path)
+    # Collapse the channel axis into a single track.
+    if samples.ndim > 1:
+        samples = samples.mean(axis=1)
+    if rate == sr_target:
+        return samples.astype(np.float32), rate
+    resampled = librosa.resample(samples.astype(np.float32), orig_sr=rate, target_sr=sr_target)
+    return resampled.astype(np.float32), sr_target
+def apply_reverb(wav, ir_path):
+    """Convolve ``wav`` with the impulse response stored at ``ir_path``.
+
+    The convolution uses ``mode="full"``, so the result is longer than the
+    input. When the impulse-response file is missing or any step fails, a
+    message is printed and ``wav`` is returned unchanged.
+    """
+    try:
+        if os.path.exists(ir_path):
+            impulse, _ = load_wav(ir_path, sr_target=SR_OUT)
+            return fftconvolve(wav, impulse, mode="full")
+        print(f"Cảnh báo: Không tìm thấy file impulse response: {ir_path}")
+    except Exception as exc:
+        print(f"Lỗi khi áp dụng reverb: {exc}")
+    # Best effort: fall back to the dry signal.
+    return wav
+def add_noise(wav, noise_path, snr_db=10):
+    """Mix the noise recording at ``noise_path`` into ``wav``.
+
+    The noise is loaded at ``SR_OUT``, tiled when shorter than the signal,
+    cut to the signal length and scaled so that the ratio of mean signal
+    power to mean noise power corresponds to ``snr_db`` decibels.
+
+    Returns ``wav`` unchanged when the noise file is missing, the noise has
+    zero power, or any step raises (the error is printed).
+    """
+    try:
+        if not os.path.exists(noise_path):
+            print(f"Cảnh báo: Không tìm thấy file noise: {noise_path}")
+            return wav
+        bed, _ = load_wav(noise_path, sr_target=SR_OUT)
+        n_samples = len(wav)
+        if len(bed) < n_samples:
+            # Repeat the noise often enough to cover the whole signal.
+            repeats = int(n_samples / len(bed)) + 1
+            bed = np.tile(bed, repeats)
+        bed = bed[:n_samples]
+        signal_power = np.mean(wav**2)
+        bed_power = np.mean(bed**2)
+        if bed_power == 0:
+            # Silent noise cannot be scaled to a target SNR.
+            return wav
+        snr_linear = 10**(snr_db/10)
+        gain = np.sqrt(signal_power / (snr_linear * bed_power))
+        return wav + bed * gain
+    except Exception as exc:
+        print(f"Lỗi khi thêm noise: {exc}")
+        return wav
+def bandlimit_phone(wav, sr=SR_OUT):
+    """Restrict audio to the telephone voice band (roughly 300-3400 Hz).
+
+    The previous body only applied pre-emphasis and ignored ``sr``, so no
+    band limiting took place. This version runs a 4th-order Butterworth
+    band-pass along the time axis (axis 0).
+
+    Args:
+        wav: Audio samples with time on axis 0.
+        sr: Sample rate of ``wav`` in Hz.
+
+    Returns:
+        The filtered samples as float32, or ``wav`` unchanged when the sample
+        rate is too low for the pass band or filtering fails (the error is
+        printed).
+    """
+    try:
+        # Local import keeps this block independent of the module's import list;
+        # scipy.signal is already a dependency of this file.
+        from scipy.signal import butter, sosfilt
+        low_hz = 300.0
+        # Keep the upper edge below Nyquist so the design stays valid at low sample rates.
+        high_hz = min(3400.0, 0.45 * sr)
+        if low_hz >= high_hz:
+            return wav
+        sos = butter(4, [low_hz, high_hz], btype="bandpass", fs=sr, output="sos")
+        return sosfilt(sos, wav, axis=0).astype(np.float32)
+    except Exception as e:
+        print(f"Lỗi khi áp dụng band limiting: {e}")
+        return wav
+def plot_waveforms(clean, processed, sr=SR_OUT):
+    """Build a two-panel figure comparing the clean and processed waveforms.
+
+    Figures are created with ``matplotlib.figure.Figure`` rather than
+    ``plt.subplots``: pyplot keeps a reference to every figure it creates
+    until it is explicitly closed, so a long-running server would accumulate
+    one figure per request.
+
+    Args:
+        clean: Samples of the unprocessed audio.
+        processed: Samples after the environment effect.
+        sr: Sample rate used to convert sample indices to seconds.
+
+    Returns:
+        The comparison figure, or a small placeholder figure carrying an
+        error message when plotting fails.
+    """
+    from matplotlib.figure import Figure
+    try:
+        fig = Figure(figsize=(10, 4))
+        axes = fig.subplots(2, 1, sharex=True)
+        # Separate time axes: reverb can make the processed signal longer than the clean one.
+        t_clean = np.arange(len(clean)) / sr
+        t_proc = np.arange(len(processed)) / sr
+        axes[0].plot(t_clean, clean, color="blue", linewidth=0.8)
+        axes[0].set_title("🎤 Waveform gốc (StyleTTS2)")
+        axes[0].set_ylabel("Amplitude")
+        axes[0].grid(True, alpha=0.3)
+        axes[1].plot(t_proc, processed, color="red", linewidth=0.8)
+        axes[1].set_title("🎵 Waveform có hiệu ứng môi trường")
+        axes[1].set_xlabel("Thời gian (s)")
+        axes[1].set_ylabel("Amplitude")
+        axes[1].grid(True, alpha=0.3)
+        fig.tight_layout()
+        return fig
+    except Exception as e:
+        print(f"Lỗi khi tạo biểu đồ: {e}")
+        # Return a simple error plot so the caller always receives a figure.
+        fig = Figure(figsize=(10, 2))
+        ax = fig.subplots()
+        ax.text(0.5, 0.5, "Không thể tạo biểu đồ", ha='center', va='center', transform=ax.transAxes)
+        ax.set_title("Lỗi tạo biểu đồ")
+        return fig
+# ---------------------------
+# Tag list
+# ---------------------------
+# Supported tag names mapped to the emoji + description shown in the UI table.
+TAG_LIST = {
+    "laugh": "😆 Cười thoải mái",
+    "whisper": "🤫 Thì thầm",
+    "naughty": "😏 Tinh nghịch",
+    "giggle": "😂 Cười rúc rích",
+    "tease": "😉 Trêu chọc",
+    "smirk": "😼 Đắc ý",
+    "surprise": "😲 Ngạc nhiên",
+    "shock": "😱 Hoảng hốt",
+    "romantic": "❤️ Lãng mạn",
+    "shy": "🫣 Bẽn lẽn",
+    "excited": "🤩 Phấn khích",
+    "curious": "🧐 Tò mò",
+    "discover": "✨ Phát hiện",
+    "blush": "🌸 Ngượng ngùng",
+    "angry": "😡 Giận dữ",
+    "sad": "😢 Buồn",
+    "happy": "😊 Vui vẻ",
+    "fear": "😨 Sợ hãi",
+    "confident": "😎 Tự tin",
+    "serious": "😐 Nghiêm túc",
+    "tired": "🥱 Mệt mỏi",
+    "cry": "😭 Khóc",
+    "love": "😍 Yêu thương",
+    "disgust": "🤢 Ghê tởm",
+}
+# One capturing group matching any opening or closing tag, e.g. <laugh> or </laugh>;
+# the capture makes re.split keep the tags as separate tokens.
+TAG_PATTERN = r"(<\/?(?:{})>)".format("|".join(TAG_LIST))
+# ---------------------------
+# Core synthesis
+# ---------------------------
+def synthesize(text, env, snr_db=10, speed=1.0):
+    """Synthesize ``text`` and apply the selected environment effect.
+
+    The text is split on the known tags; the tags themselves are discarded
+    and every remaining non-blank piece is synthesized separately with the
+    global ``model``. The pieces are concatenated and the effect chosen by
+    ``env`` is applied (reverb, background noise at ``snr_db``, or phone
+    filtering); any other value leaves the audio untouched.
+
+    Returns:
+        ``((SR_OUT, processed), figure, (SR_OUT, clean))`` on success, or
+        ``(None, None, None)`` when the model is not loaded, no piece could
+        be synthesized, or an unexpected error occurs.
+    """
+    failure = (None, None, None)
+    # Environment name -> asset file, grouped by the kind of effect applied.
+    reverb_irs = {"Church": "ir_church.wav", "Hall": "ir_hall.wav"}
+    noise_beds = {
+        "Cafe": "noise_cafe.wav",
+        "Street": "noise_street.wav",
+        "Office": "noise_office.wav",
+        "Supermarket": "noise_supermarket.wav",
+    }
+    try:
+        # Nothing can be synthesized without a loaded model.
+        if model is None:
+            print("Lỗi: Model chưa được tải. Vui lòng khởi động lại ứng dụng.")
+            return failure
+        clean_segments = []
+        for piece in re.split(TAG_PATTERN, text):
+            if not piece or piece.isspace():
+                continue
+            # Tags only mark segment boundaries for now; they carry no style information.
+            if piece.startswith("<") and piece.endswith(">"):
+                continue
+            try:
+                clean_segments.append(model.synthesize(piece, speed=speed))
+            except Exception as exc:
+                # A failing segment is skipped so the remaining ones are still rendered.
+                print(f"Lỗi khi tổng hợp đoạn '{piece}': {exc}")
+        if not clean_segments:
+            return failure
+        clean_audio = np.concatenate(clean_segments, axis=0)
+        processed = clean_audio.copy()
+        try:
+            if env in reverb_irs:
+                processed = apply_reverb(processed, reverb_irs[env])
+            elif env in noise_beds:
+                processed = add_noise(processed, noise_beds[env], snr_db=snr_db)
+            elif env == "Phone":
+                processed = bandlimit_phone(processed, sr=SR_OUT)
+        except Exception as exc:
+            # Keep whatever audio we have if the effect fails.
+            print(f"Cảnh báo: Không thể áp dụng hiệu ứng môi trường '{env}': {exc}")
+        fig = plot_waveforms(clean_audio, processed, sr=SR_OUT)
+        return (SR_OUT, processed), fig, (SR_OUT, clean_audio)
+    except Exception as exc:
+        print(f"Lỗi trong quá trình tổng hợp: {exc}")
+        return failure
+# ---------------------------
+# Examples
+# ---------------------------
+# Sample inputs using paired tags; the first one pre-fills the text box and all
+# of them are offered as quick examples in the UI.
+EXAMPLES = [
+    "Xin chào <whisper> tôi nói nhỏ </whisper> rồi <laugh> bật cười </laugh>.",
+    "Tôi cảm thấy <happy> vui </happy> nhưng cũng <sad> buồn </sad>.",
+    "Khi <surprise> bất ngờ </surprise> tôi <shock> hoảng hốt </shock>.",
+]
+# ---------------------------
+# Gradio UI
+# ---------------------------
+# Layout: settings column (text, environment, speed, SNR, button, examples) next
+# to a results column (processed audio, clean audio, waveform plot).
+with gr.Blocks(title="StyleTTS2 Text-to-Speech", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎙️ StyleTTS2 Text-to-Speech với Hiệu ứng Môi trường")
+    # Model status indicator, fixed at build time from the startup load result.
+    if model_loaded:
+        gr.Markdown("✅ **Model đã sẵn sàng** - Bạn có thể bắt đầu tạo giọng nói!")
+    else:
+        gr.Markdown("❌ **Lỗi tải model** - Vui lòng kiểm tra file giọng nói tham chiếu và khởi động lại.")
+    gr.Markdown("Sử dụng StyleTTS2 với khả năng thêm hiệu ứng môi trường và điều chỉnh tốc độ nói.")
+    with gr.Accordion("📑 Danh sách Tags + Emoji", open=False):
+        # Build the tag table with one join instead of repeated string concatenation.
+        tag_rows = "".join(f"| `<{k}>...</{k}>` | {v} |\n" for k, v in TAG_LIST.items())
+        gr.Markdown("| Tag | Ý nghĩa |\n|-----|----------|\n" + tag_rows)
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Cài đặt")
+            text_in = gr.Textbox(
+                value=EXAMPLES[0],
+                label="📝 Văn bản cần chuyển đổi",
+                lines=4,
+                placeholder="Nhập văn bản của bạn ở đây. Sử dụng tags để tạo cảm xúc..."
+            )
+            with gr.Row():
+                env_in = gr.Dropdown(
+                    choices=["Neutral", "Church", "Hall", "Cafe", "Street", "Phone", "Office", "Supermarket"],
+                    value="Neutral",
+                    label="🌍 Môi trường âm thanh",
+                    info="Chọn môi trường để áp dụng hiệu ứng"
+                )
+                speed_slider = gr.Slider(
+                    minimum=0.5,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    label="⚡ Tốc độ nói",
+                    info="1.0 = bình thường, < 1.0 = chậm, > 1.0 = nhanh"
+                )
+            snr_slider = gr.Slider(
+                0, 30,
+                value=10,
+                step=1,
+                label="🔊 Mức độ nhiễu (SNR dB)",
+                info="Chỉ áp dụng cho môi trường có tiếng ồn. Cao hơn = ít nhiễu hơn"
+            )
+            btn = gr.Button("🎵 Tạo giọng nói", variant="primary", size="lg")
+            gr.Examples(
+                examples=[[ex] for ex in EXAMPLES],
+                inputs=[text_in],
+                label="💡 Ví dụ nhanh"
+            )
+        with gr.Column(scale=1):
+            gr.Markdown("### 🎧 Kết quả")
+            # gr.Audio and gr.Plot take no `info` keyword in current Gradio releases
+            # (passing it raises TypeError while the UI is built), so the helper
+            # texts are rendered as Markdown captions instead.
+            audio_out = gr.Audio(
+                label="🎵 Âm thanh có hiệu ứng",
+                type="numpy"
+            )
+            gr.Markdown("Phiên bản có áp dụng hiệu ứng môi trường")
+            clean_out = gr.Audio(
+                label="🎤 Âm thanh gốc",
+                type="numpy"
+            )
+            gr.Markdown("Phiên bản gốc không có hiệu ứng")
+            wave_plot = gr.Plot(
+                label="📊 So sánh dạng sóng"
+            )
+            gr.Markdown("Biểu đồ so sánh âm thanh gốc và có hiệu ứng")
+    # Output order matches the tuple returned by synthesize: processed audio, figure, clean audio.
+    btn.click(fn=synthesize,
+              inputs=[text_in, env_in, snr_slider, speed_slider],
+              outputs=[audio_out, wave_plot, clean_out])
+# Launch the application
 if __name__ == "__main__":
+    # Listen on every interface at port 7860, with no share link and show_error enabled.
+    launch_options = {
+        "server_name": "0.0.0.0",
+        "server_port": 7860,
+        "share": False,
+        "show_error": True,
+    }
+    try:
+        print("🚀 Đang khởi động ứng dụng StyleTTS2...")
+        demo.launch(**launch_options)
+    except Exception as exc:
+        print(f"❌ Lỗi khi khởi động ứng dụng: {exc}")
+        print("Vui lòng kiểm tra lại cấu hình và thử lại.")