Spaces:

tonyshark
/

styletts2

Runtime error

App Files Files Community

tonyshark committed on Sep 17, 2025

Commit

1f495a1

verified ·

1 Parent(s): 9ce98c6

Upload 2 files

Browse files

Files changed (2) hide show

app.py +198 -113
requirements.txt +3 -13

app.py CHANGED Viewed

@@ -1,127 +1,212 @@
-# app.py
-import gradio as gr
-import torch
-import os
 import numpy as np
 import soundfile as sf
-import time
-from model import StyleTTModel
-SPEAKER_WAV_PATH = "speakers/example_female.wav"
-OUTPUT_FILENAME = "output.wav"
-SAMPLE_RATE = 24000
-# Kiểm tra xem file giọng nói tham chiếu có tồn tại không
-if not os.path.exists(SPEAKER_WAV_PATH):
-    raise FileNotFoundError(f"Không tìm thấy file giọng nói tham chiếu tại: {SPEAKER_WAV_PATH}. "
-                            "Vui lòng tạo thư mục và đặt file .wav của bạn vào đó.")
-print("Bắt đầu khởi tạo StyleTTS2 Model...")
-model = StyleTTModel(speaker_wav=SPEAKER_WAV_PATH)
-print("Đang tải model StyleTTS2. Quá trình này có thể mất vài phút...")
-start_time = time.time()
-model.load()
-end_time = time.time()
-print(f"Model đã được tải thành công sau {end_time - start_time:.2f} giây.")
-def process_expressive_tags(text: str) -> str:
-    """
-    Chuyển đổi các tag biểu cảm như <laugh> thành các cụm từ mô tả
-    mà StyleTTS2 có thể hiểu để thay đổi giọng điệu.
-    """
-    tag_map = {
-        "<laugh>": "[laughing]",
-        "<whisper>": "[whispering]",
-        "<sigh>": "[sighs]",
-        "<cry>": "[crying]",
-        "<naughty>": "[mischievous tone]",
-        "<giggle>": "[giggling]",
-        "<tease>": "[teasingly]",
-        "<smirk>": "[sly tone]",
-        "<surprise>": "[surprised tone]",
-        "<romantic>": "[romantic tone]",
-        "<shy>": "[shyly]",
-        "<excited>": "[excitedly]",
-        "<shock>": "[shocked tone]",
-        "<curious>": "[curious tone]",
-        "<discover>": "[discovering tone]",
-        "<blush>": "[blushing voice]"
-    }
-    processed_text = text
-    for tag, style_prompt in tag_map.items():
-        processed_text = processed_text.replace(tag, style_prompt)
-    print(f"Văn bản sau khi xử lý tag: '{processed_text}'")
-    return processed_text
-# --- Phần 3: Hàm chính để tổng hợp giọng nói ---
-def generate_speech(text: str, speed: float):
-    """
-    Hàm chính được gọi bởi Gradio để tạo ra âm thanh từ văn bản.
-    """
-    if not text:
         return None
-    print(f"Nhận được văn bản: '{text}' với tốc độ {speed}")
-    # Bước 1: Xử lý các tag biểu cảm
-    styled_text = process_expressive_tags(text)
-    # Bước 2: Sử dụng plugin để tổng hợp âm thanh
-    # Hàm synthesize trả về một mảng numpy
-    audio_array = model.synthesize(styled_text, speed=speed)
-    sf.write(OUTPUT_FILENAME, audio_array, SAMPLE_RATE, 'FLOAT')
-    print(f"Đã tạo file âm thanh thành công tại: {OUTPUT_FILENAME}")
-    return OUTPUT_FILENAME
 with gr.Blocks() as demo:
     gr.Markdown(
-        """
-        # 🎙️ Demo StyleTTS2 Lite với Tag biểu cảm
-        Nhập văn bản vào ô bên dưới. Bạn có thể sử dụng các tag như `<laugh>`, `<whisper>`, `<sigh>`
-        để thêm cảm xúc cho giọng nói. Điều chỉnh thanh trượt để thay đổi tốc độ nói.
-        """
     )
     with gr.Row():
-        text_input = gr.Textbox(
-            label="Nhập văn bản ở đây",
-            placeholder="Ví dụ: Chào bạn, tôi là một trợ lý ảo. <laugh> Thật vui được gặp bạn!",
-            lines=4
-        )
-    speed_slider = gr.Slider(
-        minimum=0.5,
-        maximum=2.0,
-        value=1.0,
-        step=0.1,
-        label="Tốc độ nói (Speed)"
-    )
-    generate_button = gr.Button("Tạo giọng nói 🎙️", variant="primary")
-    audio_output = gr.Audio(label="Kết quả", type="filepath")
-    # Kết nối các thành phần
-    generate_button.click(
-        fn=generate_speech,
-        inputs=[text_input, speed_slider],
-        outputs=audio_output
-    )
-    gr.Markdown("### Các tag biểu cảm được hỗ trợ:")
-    gr.Markdown("- `<laugh>`: Tiếng cười\n- `<whisper>`: Thì thầm\n- `<sigh>`: Thở dài\n- `<cry>`: Khóc\n- `<giggle>`: Cười khúc khích\n- `<tease>`: Trêu chọc\n- `<surprised>`: Ngạc nhiên\n- Và nhiều tag khác trong mã nguồn...")
-# --- Phần 5: Chạy ứng dụng ---
-if __name__ == "__main__":
-    # Chia sẻ (share=True) sẽ tạo một liên kết công khai tạm thời
-    demo.launch(share=False)

+import re
+import io
 import numpy as np
+import torch
 import soundfile as sf
+import librosa
+import gradio as gr
+from styletts2 import tts
+SR_OUT = 24000  # output sample rate used throughout the app
+# ---------------------------
+# Load StyleTTS2
+# ---------------------------
+model = tts.StyleTTS2()  # single shared instance, created at module level
+# ---------------------------
+# Audio utils
+# ---------------------------
+def load_wav_any(file_or_path, target_sr=None, mono=True):
+    """Load audio from a path or an uploaded-file object, optionally resampling.
+
+    Args:
+        file_or_path: Something ``soundfile.read`` accepts, or an object whose
+            ``.name`` attribute holds it; ``None`` yields ``(None, None)``.
+        target_sr: If truthy and different from the file's rate, resample to it.
+        mono: If true, multi-dimensional audio is averaged over axis 1.
+
+    Returns:
+        ``(samples as float32, sample rate)``, or ``(None, None)`` for ``None``
+        input.
+    """
+    if file_or_path is None:
+        return None, None
+    if hasattr(file_or_path, "name"):  # Uploaded file (tempfile)
+        path = file_or_path.name
+    else:
+        path = file_or_path
+    wav, sr = sf.read(path, always_2d=False)
+    if wav.ndim > 1 and mono:
+        wav = wav.mean(axis=1)
+    # NOTE(review): with mono=False a multi-dimensional array reaches
+    # librosa.resample unchanged — confirm which axis it resamples.
+    if target_sr and sr != target_sr:
+        wav = librosa.resample(wav.astype(np.float32), orig_sr=sr, target_sr=target_sr)
+        sr = target_sr
+    return wav.astype(np.float32), sr
+def to_tensor_batch1(wav_np):
+    """Return ``wav_np`` as a float32 torch tensor with a new leading axis of size 1."""
+    return torch.tensor(wav_np).float().unsqueeze(0)
+def fade(wav, fade_ms=10, sr=SR_OUT):
+    """Apply a linear fade-in and fade-out to avoid clicks when joining clips.
+
+    Args:
+        wav: Sample array, expected to be 1-D (the envelope is built with
+            length ``len(wav)``); ``None`` or an empty array is returned as-is.
+        fade_ms: Length of each ramp in milliseconds.
+        sr: Sample rate used to convert ``fade_ms`` into a sample count.
+
+    Returns:
+        ``wav`` multiplied by a float32 envelope that ramps 0 -> 1 over the
+        first samples and 1 -> 0 over the last ones.
+    """
+    if wav is None or len(wav) == 0:
+        return wav
+    n = len(wav)
+    # At least one sample per ramp, but never more than the clip itself:
+    # an unclamped ramp longer than the clip made the in-place multiplies
+    # below raise a broadcast ValueError on very short clips.
+    fade_len = min(n, max(1, int(sr * fade_ms / 1000.0)))
+    env = np.ones(n, dtype=np.float32)
+    ramp = np.linspace(0.0, 1.0, fade_len, dtype=np.float32)
+    env[:fade_len] *= ramp
+    env[-fade_len:] *= ramp[::-1]
+    return wav * env
+def match_gain(wav, gain_db):
+    """Apply a gain of ``gain_db`` decibels to ``wav`` and return it as float32."""
+    g = 10 ** (gain_db / 20.0)  # dB -> linear amplitude factor
+    return (wav * g).astype(np.float32)
+# ---------------------------
+# Style extraction
+# ---------------------------
+def get_style_embedding(file):
+    """Compute a style embedding from a reference audio file.
+
+    Returns ``None`` when ``file`` is ``None`` or no audio could be loaded;
+    otherwise returns the result of ``model.get_style_embedding`` for the clip
+    loaded at ``SR_OUT``.
+    """
+    if file is None:
         return None
+    wav, sr = load_wav_any(file, target_sr=SR_OUT)
+    if wav is None:
+        return None
+    wav_t = to_tensor_batch1(wav)
+    # NOTE(review): `get_style_embedding` is not defined in this file — confirm
+    # the installed `styletts2.tts.StyleTTS2` exposes it with this signature
+    # (the Space header reports "Runtime error").
+    return model.get_style_embedding(wav_t, SR_OUT)  # presumably (1, D) — TODO confirm
+# ---------------------------
+# Core synthesis
+# ---------------------------
+TAG_PATTERN = r"(\[(?:laugh|whisper|giggle)\])"
+def synthesize(
+    text,
+    neutral_ref, whisper_ref, giggle_ref,
+    laugh_sfx,               # <-- laugh audio to insert
+    embedding_scale=1.0,
+    laugh_gain_db=0.0,        # sfx volume adjustment
+    laugh_stretch=1.0,        # time-stretch sfx (1.0 = unchanged)
+):
+    """Synthesize tagged text into one audio clip.
+
+    The text is split with ``TAG_PATTERN``. ``[laugh]`` inserts the uploaded
+    laugh clip (or a synthesized "ha ha" when none was given); ``[whisper]``
+    and ``[giggle]`` insert 50 ms of silence and switch the style used for
+    the text that follows. The most recent style tag stays in effect for all
+    later text; no tag switches back to the neutral style. A style whose
+    reference was not provided falls back to the neutral style.
+
+    Args:
+        text: Input text containing optional tags.
+        neutral_ref, whisper_ref, giggle_ref: Reference audio for each style,
+            passed to ``get_style_embedding``; the neutral one is required.
+        laugh_sfx: Optional laugh audio, passed to ``load_wav_any``.
+        embedding_scale: Factor multiplied into the style embedding.
+        laugh_gain_db: Gain in dB applied to the laugh clip when non-zero.
+        laugh_stretch: Time-stretch factor for the laugh clip.
+
+    Returns:
+        ``(SR_OUT, samples)``, or ``None`` when no neutral embedding is
+        available or nothing was produced.
+    """
+    # 1) Prepare style embeddings
+    style_neutral = get_style_embedding(neutral_ref)
+    style_whisper = get_style_embedding(whisper_ref)
+    style_giggle  = get_style_embedding(giggle_ref)
+    if style_neutral is None:
+        return None
+    # 2) Load the laugh sfx (resample, fade & gain)
+    laugh_np, _ = load_wav_any(laugh_sfx, target_sr=SR_OUT)
+    if laugh_np is not None:
+        # rate is the reciprocal of the slider value, so a stretch of 2.0
+        # requests rate 0.5
+        if laugh_stretch and abs(laugh_stretch - 1.0) > 1e-3:
+            laugh_np = librosa.effects.time_stretch(laugh_np, rate=1.0/float(laugh_stretch))
+        laugh_np = fade(laugh_np, fade_ms=12, sr=SR_OUT)
+        if laugh_gain_db != 0.0:
+            laugh_np = match_gain(laugh_np, laugh_gain_db)
+    # 3) Split the text on tags
+    tokens = re.split(TAG_PATTERN, text)
+    pieces = []
+    for tok in tokens:
+        if tok is None:
+            continue
+        t = tok.strip()
+        if not t:
+            continue
+        # NOTE(review): any segment that merely starts with "[" and ends with
+        # "]" (e.g. "[sigh] oh [sigh]") is treated as a tag here and is
+        # dropped by the final `else: pass` — confirm this is intended.
+        if t.startswith("[") and t.endswith("]"):
+            tag = t[1:-1].lower()
+            if tag == "laugh":
+                # insert the laugh sfx directly
+                if laugh_np is not None:
+                    pieces.append(laugh_np)
+                # no sfx uploaded: fall back to synthesizing a short laugh
+                else:
+                    # fallback: synthesize "ha ha" with the giggle style if available
+                    style_use = style_giggle if style_giggle is not None else style_neutral
+                    # NOTE(review): confirm `inference` accepts `style_embedding`
+                    # and returns a NumPy array (`.astype` is called on it).
+                    audio = model.inference(
+                        "ha ha", style_embedding=style_use * embedding_scale, output_sample_rate=SR_OUT
+                    )
+                    pieces.append(audio.astype(np.float32))
+            elif tag == "whisper":
+                # a lone tag has no text to synthesize; it only switches the
+                # style of what follows
+                # => insert a very short silence as a separator
+                pieces.append(np.zeros(int(0.05*SR_OUT), dtype=np.float32))
+                # Record the style for the following text
+                # Simple approach: store a marker tuple in the pieces list
+                pieces.append(("__STYLE__", "whisper"))
+            elif tag == "giggle":
+                pieces.append(np.zeros(int(0.05*SR_OUT), dtype=np.float32))
+                pieces.append(("__STYLE__", "giggle"))
+            else:
+                # default: ignore
+                pass
+        else:
+            # plain text => synthesize with the current style (if any)
+            # check whether a "__STYLE__" marker was recorded earlier
+            curr_style = style_neutral
+            # scan pieces from the end to find the most recent style marker (if any)
+            for it in reversed(pieces):
+                if isinstance(it, tuple) and it[0] == "__STYLE__":
+                    mode = it[1]
+                    if mode == "whisper" and style_whisper is not None:
+                        curr_style = style_whisper
+                    elif mode == "giggle" and style_giggle is not None:
+                        curr_style = style_giggle
+                    break
+            audio = model.inference(
+                t, style_embedding=curr_style * embedding_scale, output_sample_rate=SR_OUT
+            )
+            pieces.append(audio.astype(np.float32))
+    # 4) Merge the pieces
+    # Drop the "__STYLE__" markers
+    merged = []
+    for it in pieces:
+        if isinstance(it, tuple):
+            continue
+        if it is None:
+            continue
+        merged.append(it)
+    if not merged:
+        return None
+    out = np.concatenate(merged, axis=0)
+    out = fade(out, fade_ms=8, sr=SR_OUT)
+    return (SR_OUT, out)
+# ---------------------------
+# Gradio UI
+# ---------------------------
 with gr.Blocks() as demo:
+    # NOTE(review): "Radio App" in the title below looks like a typo for
+    # "Gradio App" — confirm before changing the user-facing string.
+    gr.Markdown("# 🎙️ StyleTTS2 Tags + Laugh SFX (Hugging Face Radio App)")
     gr.Markdown(
+        "Nhập text có tag: `[whisper]`, `[giggle]`, và **`[laugh]`**.\n\n"
+        "- Với `[laugh]`: app **chèn trực tiếp audio tiếng cười** bạn upload.\n"
+        "- Với `[whisper]` / `[giggle]`: app dùng **style embedding** từ file tham chiếu.\n"
+        "- Upload *ít nhất* 1 file neutral để lấy giọng cơ bản."
     )
     with gr.Row():
+        # Left column: text, reference uploads and sliders
+        with gr.Column():
+            text_in = gr.Textbox(
+                value="Xin chào mọi người [laugh] bây giờ tôi sẽ nói nhỏ [whisper] rồi khúc khích [giggle] và lại bình thường.",
+                label="Text có tags",
+                lines=4
+            )
+            neutral_in = gr.File(label="Neutral reference (.wav)", file_types=[".wav"])
+            whisper_in = gr.File(label="Whisper reference (.wav)", file_types=[".wav"])
+            giggle_in  = gr.File(label="Giggle reference (.wav)",  file_types=[".wav"])
+            gr.Markdown("### 🎧 Laugh SFX (chèn trực tiếp khi gặp [laugh])")
+            laugh_in   = gr.File(label="Laugh SFX (.wav)", file_types=[".wav"])
+            laugh_gain = gr.Slider(-12, 12, value=0.0, step=0.5, label="Laugh gain (dB)")
+            laugh_stch = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Laugh time-stretch (x)")
+            emb_scale  = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Embedding scale (StyleTTS2)")
+            btn = gr.Button("Generate")
+        # Right column: resulting audio
+        with gr.Column():
+            audio_out = gr.Audio(label="Kết quả", type="numpy")
+    # The inputs list follows synthesize's positional order:
+    # (text, neutral_ref, whisper_ref, giggle_ref, laugh_sfx,
+    #  embedding_scale, laugh_gain_db, laugh_stretch).
+    btn.click(
+        fn=synthesize,
+        inputs=[text_in, neutral_in, whisper_in, giggle_in, laugh_in, emb_scale, laugh_gain, laugh_stch],
+        outputs=audio_out
+    )
+# Launched unconditionally at module level (no __main__ guard).
+demo.launch()

requirements.txt CHANGED Viewed

@@ -1,16 +1,6 @@
 torch
-torchaudio
-transformers
-huggingface_hub
-gradio
 soundfile
-numpy
-pyyaml
-pydantic
-httpx
-phonemizer
-nltk
-munch
-noisereduce
 librosa
-pyttsx3

+styletts2
 torch
 soundfile
 librosa
+gradio
+numpy