Spaces:

tonyshark
/

styletts2

Runtime error

App Files Files Community

tonyshark committed on Sep 17, 2025

Commit

7f65a42

verified ·

1 Parent(s): e79f3e0

Upload 25 files

Browse files

Files changed (2) hide show

app.py +121 -153
requirements.txt +12 -4

app.py CHANGED Viewed

@@ -1,159 +1,127 @@
-import re
 import numpy as np
 import soundfile as sf
-import gradio as gr
-import matplotlib.pyplot as plt
-from scipy.signal import fftconvolve
-import librosa
-from transformers import pipeline
-# ---------------------------
-# Load HF TTS model (ak36/styletts2)
-# ---------------------------
-SR_OUT = 24000
-tts_pipe = pipeline("text-to-speech", model="ak36/styletts2")
-# ---------------------------
-# Audio helpers
-# ---------------------------
-def load_wav(path, sr_target=SR_OUT):
-    wav, sr = sf.read(path)
-    if wav.ndim > 1:
-        wav = wav.mean(axis=1)
-    if sr != sr_target:
-        wav = librosa.resample(wav.astype(np.float32), orig_sr=sr, target_sr=sr_target)
-        sr = sr_target
-    return wav.astype(np.float32), sr
-def apply_reverb(wav, ir_path):
-    ir, _ = load_wav(ir_path, sr_target=SR_OUT)
-    return fftconvolve(wav, ir, mode="full")
-def add_noise(wav, noise_path, snr_db=10):
-    noise, _ = load_wav(noise_path, sr_target=SR_OUT)
-    if len(noise) < len(wav):
-        noise = np.tile(noise, int(len(wav)/len(noise)) + 1)
-    noise = noise[:len(wav)]
-    sig_power = np.mean(wav**2)
-    noise_power = np.mean(noise**2)
-    scale = np.sqrt(sig_power / (10**(snr_db/10) * noise_power))
-    return wav + noise * scale
-def bandlimit_phone(wav, sr=SR_OUT):
-    return librosa.effects.preemphasis(wav)
-def plot_waveforms(clean, processed, sr=SR_OUT):
-    fig, axes = plt.subplots(2, 1, figsize=(10, 4), sharex=True)
-    t_clean = np.arange(len(clean)) / sr
-    t_proc = np.arange(len(processed)) / sr
-    axes[0].plot(t_clean, clean, color="blue")
-    axes[0].set_title("Waveform sạch (ak36/styletts2)")
-    axes[1].plot(t_proc, processed, color="red")
-    axes[1].set_title("Waveform sau khi áp môi trường/noise")
-    axes[1].set_xlabel("Thời gian (s)")
-    fig.tight_layout()
-    return fig
-# ---------------------------
-# Tag list
-# ---------------------------
-TAG_LIST = {
-    "laugh": "😆 Cười thoải mái",
-    "whisper": "🤫 Thì thầm",
-    "giggle": "😂 Cười rúc rích",
-    "surprise": "😲 Ngạc nhiên",
-    "sad": "😢 Buồn",
-    "happy": "😊 Vui vẻ",
-    "angry": "😡 Giận dữ",
-}
-TAG_PATTERN = r"(<\/?(?:" + "|".join(TAG_LIST.keys()) + ")>)"
-# ---------------------------
-# Core synthesis
-# ---------------------------
-def synthesize(text, env, snr_db=10):
-    tokens = re.split(TAG_PATTERN, text)
-    clean_segments = []
-    for tok in tokens:
-        if not tok or tok.isspace():
-            continue
-        if tok.startswith("<") and tok.endswith(">"):
-            # Model ak36/styletts2 chưa hỗ trợ style embedding riêng,
-            # nên tags chỉ chia text thành đoạn.
-            continue
-        else:
-            result = tts_pipe(text=tok)
-            wav = result["audio"]
-            sr = result["sampling_rate"]
-            if sr != SR_OUT:
-                wav = librosa.resample(wav, orig_sr=sr, target_sr=SR_OUT)
-            clean_segments.append(wav.astype(np.float32))
-    if not clean_segments:
-        return None, None, None
-    clean_audio = np.concatenate(clean_segments, axis=0)
-    processed = clean_audio.copy()
-    # Apply environment
-    if env == "Church":
-        processed = apply_reverb(processed, "ir_church.wav")
-    elif env == "Hall":
-        processed = apply_reverb(processed, "ir_hall.wav")
-    elif env == "Cafe":
-        processed = add_noise(processed, "noise_cafe.wav", snr_db=snr_db)
-    elif env == "Street":
-        processed = add_noise(processed, "noise_street.wav", snr_db=snr_db)
-    elif env == "Office":
-        processed = add_noise(processed, "noise_office.wav", snr_db=snr_db)
-    elif env == "Supermarket":
-        processed = add_noise(processed, "noise_supermarket.wav", snr_db=snr_db)
-    elif env == "Phone":
-        processed = bandlimit_phone(processed, sr=SR_OUT)
-    fig = plot_waveforms(clean_audio, processed, sr=SR_OUT)
-    return (SR_OUT, processed), fig, (SR_OUT, clean_audio)
-# ---------------------------
-# Example texts
-# ---------------------------
-EXAMPLES = [
-    "Xin chào <whisper> tôi nói nhỏ </whisper> rồi <laugh> bật cười </laugh>.",
-    "Tôi cảm thấy <happy> vui </happy> nhưng cũng <sad> buồn </sad>.",
-    "Khi <surprise> bất ngờ </surprise> tôi <angry> giận dữ </angry>.",
-]
-# ---------------------------
-# Gradio UI
-# ---------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# 🎙️ ak36/styletts2 + Tags + Environment + Waveform Preview")
-    gr.Markdown("Dùng model `ak36/styletts2` (giọng LibriTTS mặc định). Tags chia text thành đoạn.")
-    with gr.Accordion("📑 Danh sách Tags + Emoji", open=False):
-        md = "| Tag | Ý nghĩa |\n|-----|----------|\n"
-        for k, v in TAG_LIST.items():
-            md += f"| `<{k}>...</{k}>` | {v} |\n"
-        gr.Markdown(md)
     with gr.Row():
-        with gr.Column():
-            text_in = gr.Textbox(value=EXAMPLES[0], label="Text với tags", lines=4)
-            env_in = gr.Dropdown(
-                choices=["Neutral", "Church", "Hall", "Cafe", "Street", "Phone", "Office", "Supermarket"],
-                value="Neutral", label="Environment"
-            )
-            snr_slider = gr.Slider(0, 30, value=10, step=1, label="Noise SNR (dB)")
-            btn = gr.Button("Generate")
-            gr.Examples(examples=[[ex] for ex in EXAMPLES], inputs=[text_in], label="Ví dụ nhanh")
-        with gr.Column():
-            audio_out = gr.Audio(label="Processed", type="numpy")
-            clean_out = gr.Audio(label="Clean (TTS only)", type="numpy")
-            wave_plot = gr.Plot(label="So sánh Waveform")
-    btn.click(fn=synthesize,
-              inputs=[text_in, env_in, snr_slider],
-              outputs=[audio_out, wave_plot, clean_out])
-demo.launch()

+# app.py
+import gradio as gr
+import torch
+import os
 import numpy as np
 import soundfile as sf
+import time
+from model import StyleTTModel
+# Reference speaker recording handed to StyleTTModel below.
+SPEAKER_WAV_PATH = "speakers/example_female.wav"
+# File that generate_speech writes each synthesized result to.
+OUTPUT_FILENAME = "output.wav"
+# Sample rate passed to sf.write when saving the synthesized audio.
+SAMPLE_RATE = 24000
+# Fail fast at import time if the reference speaker file is missing.
+if not os.path.exists(SPEAKER_WAV_PATH):
+    raise FileNotFoundError(f"Không tìm thấy file giọng nói tham chiếu tại: {SPEAKER_WAV_PATH}. "
+                            "Vui lòng tạo thư mục và đặt file .wav của bạn vào đó.")
+print("Bắt đầu khởi tạo StyleTTS2 Model...")
+model = StyleTTModel(speaker_wav=SPEAKER_WAV_PATH)
+print("Đang tải model StyleTTS2. Quá trình này có thể mất vài phút...")
+# Time the load so its duration can be reported in the log line below.
+start_time = time.time()
+model.load()
+end_time = time.time()
+print(f"Model đã được tải thành công sau {end_time - start_time:.2f} giây.")
+def process_expressive_tags(text: str) -> str:
+    """
+    Replace expressive tags such as <laugh> with bracketed descriptive phrases.
+
+    Each key of ``tag_map`` found in ``text`` is substituted with its mapped
+    phrase via plain ``str.replace``. Only the exact opening-tag forms listed
+    below are handled; any other markup (for example a closing ``</laugh>``
+    or an unlisted tag) is left in the text unchanged.
+
+    The original author's intent is that StyleTTS2 uses these phrases to
+    change the tone of voice; this file does not show how the model
+    interprets them.
+
+    Args:
+        text: Input text that may contain expressive tags.
+
+    Returns:
+        The text with every known tag replaced by its descriptive phrase.
+    """
+    tag_map = {
+        "<laugh>": "[laughing]",
+        "<whisper>": "[whispering]",
+        "<sigh>": "[sighs]",
+        "<cry>": "[crying]",
+        "<naughty>": "[mischievous tone]",
+        "<giggle>": "[giggling]",
+        "<tease>": "[teasingly]",
+        "<smirk>": "[sly tone]",
+        "<surprise>": "[surprised tone]",
+        "<romantic>": "[romantic tone]",
+        "<shy>": "[shyly]",
+        "<excited>": "[excitedly]",
+        "<shock>": "[shocked tone]",
+        "<curious>": "[curious tone]",
+        "<discover>": "[discovering tone]",
+        "<blush>": "[blushing voice]"
+    }
+    processed_text = text
+    # Apply every substitution in turn; tags absent from the text are no-ops.
+    for tag, style_prompt in tag_map.items():
+        processed_text = processed_text.replace(tag, style_prompt)
+    # Log the final text so the substitution result is visible in the console.
+    print(f"Văn bản sau khi xử lý tag: '{processed_text}'")
+    return processed_text
+# --- Part 3: Main speech synthesis function ---
+def generate_speech(text: str, speed: float):
+    """
+    Gradio callback that turns the input text into an audio file.
+
+    Args:
+        text: Text to speak; may contain expressive tags handled by
+            process_expressive_tags.
+        speed: Speaking-speed value forwarded to ``model.synthesize``.
+
+    Returns:
+        ``None`` when ``text`` is empty, otherwise the path of the written
+        audio file (``OUTPUT_FILENAME``).
+    """
+    # Nothing to synthesize for empty input.
+    if not text:
+        return None
+    print(f"Nhận được văn bản: '{text}' với tốc độ {speed}")
+    # Step 1: convert the expressive tags into descriptive phrases.
+    styled_text = process_expressive_tags(text)
+    # Step 2: synthesize the audio with the loaded model.
+    # Per the original author's note, synthesize returns a NumPy array.
+    audio_array = model.synthesize(styled_text, speed=speed)
+    # NOTE(review): every call writes to the same OUTPUT_FILENAME, so
+    # overlapping requests would overwrite each other's result — confirm
+    # whether a per-request file is needed.
+    sf.write(OUTPUT_FILENAME, audio_array, SAMPLE_RATE, 'FLOAT')
+    print(f"Đã tạo file âm thanh thành công tại: {OUTPUT_FILENAME}")
+    return OUTPUT_FILENAME
 with gr.Blocks() as demo:
+    # Header: title and short usage instructions shown at the top of the page.
+    gr.Markdown(
+        """
+        # 🎙️ Demo StyleTTS2 Lite với Tag biểu cảm
+        Nhập văn bản vào ô bên dưới. Bạn có thể sử dụng các tag như `<laugh>`, `<whisper>`, `<sigh>`
+        để thêm cảm xúc cho giọng nói. Điều chỉnh thanh trượt để thay đổi tốc độ nói.
+        """
+    )
     with gr.Row():
+        text_input = gr.Textbox(
+            label="Nhập văn bản ở đây",
+            placeholder="Ví dụ: Chào bạn, tôi là một trợ lý ảo. <laugh> Thật vui được gặp bạn!",
+            lines=4
+        )
+    # Speaking-speed control; its value is passed to generate_speech as `speed`.
+    speed_slider = gr.Slider(
+        minimum=0.5,
+        maximum=2.0,
+        value=1.0,
+        step=0.1,
+        label="Tốc độ nói (Speed)"
+    )
+    generate_button = gr.Button("Tạo giọng nói 🎙️", variant="primary")
+    # generate_speech returns a file path, hence type="filepath".
+    audio_output = gr.Audio(label="Kết quả", type="filepath")
+    # Wire the components together.
+    generate_button.click(
+        fn=generate_speech,
+        inputs=[text_input, speed_slider],
+        outputs=audio_output
+    )
+    gr.Markdown("### Các tag biểu cảm được hỗ trợ:")
+    # The tag names listed here must match the keys handled by
+    # process_expressive_tags; the map uses `<surprise>` (not `<surprised>`).
+    gr.Markdown("- `<laugh>`: Tiếng cười\n- `<whisper>`: Thì thầm\n- `<sigh>`: Thở dài\n- `<cry>`: Khóc\n- `<giggle>`: Cười khúc khích\n- `<tease>`: Trêu chọc\n- `<surprise>`: Ngạc nhiên\n- Và nhiều tag khác trong mã nguồn...")
+# --- Part 5: Run the application ---
+if __name__ == "__main__":
+    # share=True would create a temporary public link; it is disabled here.
+    demo.launch(share=False)

requirements.txt CHANGED Viewed

@@ -1,8 +1,16 @@
-transformers
 torch
 soundfile
 numpy
-scipy
-gradio
 librosa
-matplotlib

 torch
+torchaudio
+transformers
+huggingface_hub
+gradio
 soundfile
 numpy
+pyyaml
+pydantic
+httpx
+phonemizer
+nltk
+munch
+noisereduce
 librosa
+pyttsx3