Spaces:

tonyshark
/

styletts2

Runtime error

App Files Files Community

tonyshark committed on Sep 17, 2025

Commit

7014f5a

verified ·

1 Parent(s): 1f495a1

Upload 2 files

Browse files

Files changed (2) hide show

app.py +54 -153
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -1,13 +1,11 @@
 import re
-import io
-import numpy as np
 import torch
 import soundfile as sf
-import librosa
 import gradio as gr
 from styletts2 import tts
-SR_OUT = 24000  # sample rate output cho toàn bộ hệ
 # ---------------------------
 # Load StyleTTS2
@@ -15,197 +13,100 @@ SR_OUT = 24000  # sample rate output cho toàn bộ hệ
 model = tts.StyleTTS2()
 # ---------------------------
-# Audio utils
-# ---------------------------
-def load_wav_any(file_or_path, target_sr=None, mono=True):
-    """Load wav (from path hoặc Gradio file object), optional resample."""
-    if file_or_path is None:
-        return None, None
-    if hasattr(file_or_path, "name"):  # Uploaded file (tempfile)
-        path = file_or_path.name
-    else:
-        path = file_or_path
-    wav, sr = sf.read(path, always_2d=False)
-    if wav.ndim > 1 and mono:
-        wav = wav.mean(axis=1)
-    if target_sr and sr != target_sr:
-        wav = librosa.resample(wav.astype(np.float32), orig_sr=sr, target_sr=target_sr)
-        sr = target_sr
-    return wav.astype(np.float32), sr
-def to_tensor_batch1(wav_np):
-    return torch.tensor(wav_np).float().unsqueeze(0)
-def fade(wav, fade_ms=10, sr=SR_OUT):
-    """Fade in/out để tránh click khi nối."""
-    if wav is None or len(wav) == 0:
-        return wav
-    n = len(wav)
-    fade_len = max(1, int(sr * fade_ms / 1000.0))
-    env = np.ones(n, dtype=np.float32)
-    ramp = np.linspace(0.0, 1.0, fade_len, dtype=np.float32)
-    env[:fade_len] *= ramp
-    env[-fade_len:] *= ramp[::-1]
-    return wav * env
-def match_gain(wav, gain_db):
-    """Áp gain dB lên clip."""
-    g = 10 ** (gain_db / 20.0)
-    return (wav * g).astype(np.float32)
-# ---------------------------
-# Style extraction
 # ---------------------------
-def get_style_embedding(file):
     if file is None:
         return None
-    wav, sr = load_wav_any(file, target_sr=SR_OUT)
-    if wav is None:
-        return None
-    wav_t = to_tensor_batch1(wav)
-    return model.get_style_embedding(wav_t, SR_OUT)  # (1, D)
 # ---------------------------
 # Core synthesis
 # ---------------------------
-TAG_PATTERN = r"(\[(?:laugh|whisper|giggle)\])"
-def synthesize(
-    text,
-    neutral_ref, whisper_ref, giggle_ref,
-    laugh_sfx,               # <-- audio tiếng cười để chèn
-    embedding_scale=1.0,
-    laugh_gain_db=0.0,        # chỉnh âm lượng sfx
-    laugh_stretch=1.0,        # time-stretch sfx (1.0 = nguyên gốc)
-):
-    # 1) Chuẩn bị style embeddings
-    style_neutral = get_style_embedding(neutral_ref)
-    style_whisper = get_style_embedding(whisper_ref)
-    style_giggle  = get_style_embedding(giggle_ref)
     if style_neutral is None:
         return None
-    # 2) Load sfx cười (resample, fade & gain)
-    laugh_np, _ = load_wav_any(laugh_sfx, target_sr=SR_OUT)
-    if laugh_np is not None:
-        if laugh_stretch and abs(laugh_stretch - 1.0) > 1e-3:
-            laugh_np = librosa.effects.time_stretch(laugh_np, rate=1.0/float(laugh_stretch))
-        laugh_np = fade(laugh_np, fade_ms=12, sr=SR_OUT)
-        if laugh_gain_db != 0.0:
-            laugh_np = match_gain(laugh_np, laugh_gain_db)
-    # 3) Parse text theo tag
     tokens = re.split(TAG_PATTERN, text)
-    pieces = []
     for tok in tokens:
-        if tok is None:
-            continue
-        t = tok.strip()
-        if not t:
             continue
-        if t.startswith("[") and t.endswith("]"):
-            tag = t[1:-1].lower()
-            if tag == "laugh":
-                # chèn trực tiếp sfx tiếng cười
-                if laugh_np is not None:
-                    pieces.append(laugh_np)
-                # nếu chưa upload sfx, bỏ qua hoặc có thể synthesize "hahaha" bằng style_giggle
-                else:
-                    # fallback: synthesize một âm tiết ngắn với giggle style nếu có
-                    style_use = style_giggle if style_giggle is not None else style_neutral
-                    audio = model.inference(
-                        "ha ha", style_embedding=style_use * embedding_scale, output_sample_rate=SR_OUT
-                    )
-                    pieces.append(audio.astype(np.float32))
-            elif tag == "whisper":
-                # tạo một đoạn ngắn im lặng mang "breath" hoặc synth 1 khoảng ngắn trống
-                # ở đây ta không synth text vì tag đơn lẻ, chỉ chuyển style kế tiếp
-                # => chèn đoạn im lặng rất ngắn để tách
-                pieces.append(np.zeros(int(0.05*SR_OUT), dtype=np.float32))
-                # Đặt "current style" cho phần text tiếp theo
-                # Cách đơn giản: lưu "style kế tiếp" trong biến
-                pieces.append(("__STYLE__", "whisper"))
-            elif tag == "giggle":
-                pieces.append(np.zeros(int(0.05*SR_OUT), dtype=np.float32))
-                pieces.append(("__STYLE__", "giggle"))
-            else:
-                # default: bỏ qua
-                pass
         else:
-            # text bình thường => synth với style hiện thời (nếu có)
-            # tìm xem có cờ "__STYLE__" trước đó không
-            curr_style = style_neutral
-            # duyệt từ cuối pieces để tìm chỉ thị style gần nhất (nếu có)
-            for it in reversed(pieces):
-                if isinstance(it, tuple) and it[0] == "__STYLE__":
-                    mode = it[1]
-                    if mode == "whisper" and style_whisper is not None:
-                        curr_style = style_whisper
-                    elif mode == "giggle" and style_giggle is not None:
-                        curr_style = style_giggle
-                    break
             audio = model.inference(
-                t, style_embedding=curr_style * embedding_scale, output_sample_rate=SR_OUT
             )
-            pieces.append(audio.astype(np.float32))
-    # 4) Gộp các đoạn
-    # Lọc bỏ các marker style "__STYLE__"
-    merged = []
-    for it in pieces:
-        if isinstance(it, tuple):
-            continue
-        if it is None:
-            continue
-        merged.append(it)
-    if not merged:
         return None
-    out = np.concatenate(merged, axis=0)
-    out = fade(out, fade_ms=8, sr=SR_OUT)
-    return (SR_OUT, out)
 # ---------------------------
 # Gradio UI
 # ---------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# 🎙️ StyleTTS2 Tags + Laugh SFX (Hugging Face Radio App)")
     gr.Markdown(
-        "Nhập text có tag: `[whisper]`, `[giggle]`, và **`[laugh]`**.\n\n"
-        "- Với `[laugh]`: app **chèn trực tiếp audio tiếng cười** bạn upload.\n"
-        "- Với `[whisper]` / `[giggle]`: app dùng **style embedding** từ file tham chiếu.\n"
-        "- Upload *ít nhất* 1 file neutral để lấy giọng cơ bản."
     )
     with gr.Row():
         with gr.Column():
             text_in = gr.Textbox(
-                value="Xin chào mọi người [laugh] bây giờ tôi sẽ nói nhỏ [whisper] rồi khúc khích [giggle] và lại bình thường.",
-                label="Text có tags",
                 lines=4
             )
             neutral_in = gr.File(label="Neutral reference (.wav)", file_types=[".wav"])
-            whisper_in = gr.File(label="Whisper reference (.wav)", file_types=[".wav"])
-            giggle_in  = gr.File(label="Giggle reference (.wav)",  file_types=[".wav"])
-            gr.Markdown("### 🎧 Laugh SFX (chèn trực tiếp khi gặp [laugh])")
-            laugh_in   = gr.File(label="Laugh SFX (.wav)", file_types=[".wav"])
-            laugh_gain = gr.Slider(-12, 12, value=0.0, step=0.5, label="Laugh gain (dB)")
-            laugh_stch = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Laugh time-stretch (x)")
-            emb_scale  = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Embedding scale (StyleTTS2)")
             btn = gr.Button("Generate")
         with gr.Column():
             audio_out = gr.Audio(label="Kết quả", type="numpy")
     btn.click(
         fn=synthesize,
-        inputs=[text_in, neutral_in, whisper_in, giggle_in, laugh_in, emb_scale, laugh_gain, laugh_stch],
         outputs=audio_out
     )

 import re
 import torch
 import soundfile as sf
 import gradio as gr
+import numpy as np
 from styletts2 import tts
+SR_OUT = 24000
 # ---------------------------
 # Load StyleTTS2
 model = tts.StyleTTS2()
 # ---------------------------
+# Helper: extract style embedding từ 1 file neutral
+# (trong demo này ta chỉ có neutral, các style khác dùng "neutral" luôn,
+# nhưng có thể giả lập bằng cách áp embedding_scale hoặc fine-tune thêm)
 # ---------------------------
def extract_neutral(file):
    """Compute the style embedding of the neutral reference recording.

    Args:
        file: The uploaded reference audio: ``None`` when nothing was
            uploaded, a filesystem path, or an upload object that exposes
            its path as ``.name``.

    Returns:
        The result of ``model.get_style_embedding`` for the mono waveform and
        the file's own sample rate, or ``None`` when no file was supplied.
    """
    if file is None:
        return None
    # Real paths are used as-is; upload wrappers (temp-file objects) are
    # unwrapped through ``.name`` so soundfile always receives a path.
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        path = file
    else:
        path = getattr(file, "name", file)
    wav, sr = sf.read(path)
    if wav.ndim > 1:
        wav = wav.mean(axis=1)  # mix multi-channel audio down to mono
    # Add a leading batch dimension: (samples,) -> (1, samples).
    wav = torch.tensor(wav).float().unsqueeze(0)
    # NOTE(review): the embedding is computed at the file's native sample
    # rate; the previous revision resampled the reference first -- confirm
    # whether the model expects a fixed rate.
    return model.get_style_embedding(wav, sr)
 # ---------------------------
 # Core synthesis
 # ---------------------------
# Opening/closing style tags such as [whisper] ... [/whisper].  The capture
# group makes re.split keep the matched tags in its output.
TAG_PATTERN = r"(\[/?(?:whisper|giggle|laugh)\])"
# Tag names are lower-cased before lookup, so match them case-insensitively.
_TAG_RE = re.compile(TAG_PATTERN, re.IGNORECASE)


def _iter_styled_segments(text):
    """Yield ``(style_name, segment)`` for every spoken piece of *text*.

    ``style_name`` is the innermost tag still open where the segment occurs,
    or ``"neutral"`` outside any tag.  Tags themselves are never yielded and
    whitespace-only segments are skipped.
    """
    open_tags = []  # innermost open tag is last
    # With a single capture group, split() alternates plain text (even
    # indices) and matched tags (odd indices).  Bracketed text that is not a
    # known tag, e.g. "[music]", therefore stays ordinary text instead of
    # being looked up as a style.
    for index, tok in enumerate(_TAG_RE.split(text)):
        if index % 2 == 0:
            segment = tok.strip()
            if segment:
                yield (open_tags[-1] if open_tags else "neutral"), segment
            continue
        tag = tok.strip("[]/").lower()
        if not tok.startswith("[/"):
            open_tags.append(tag)
        elif tag in open_tags:
            # Close the most recent opener of this tag; a closing tag with no
            # matching opener is ignored.
            last = len(open_tags) - 1 - open_tags[::-1].index(tag)
            del open_tags[last]


def synthesize(text, neutral_ref, embedding_scale=1.0):
    """Synthesize *text*, switching style on ``[tag] ... [/tag]`` markers.

    Args:
        text: Input text, optionally containing ``whisper``, ``giggle`` and
            ``laugh`` open/close tags.
        neutral_ref: Neutral reference recording, forwarded to
            ``extract_neutral``.
        embedding_scale: Factor the style embedding is multiplied by before
            inference.

    Returns:
        ``(SR_OUT, samples)`` with the float32 audio of all segments
        concatenated in order, or ``None`` when there is no text, no usable
        reference, or nothing to speak.
    """
    # Nothing to say: skip decoding the reference audio altogether.
    if not text or text.isspace():
        return None
    style_neutral = extract_neutral(neutral_ref)
    if style_neutral is None:
        return None
    # Every style currently reuses the neutral embedding; swap in dedicated
    # embeddings here once they are available.
    styles = {
        "neutral": style_neutral,
        "whisper": style_neutral,
        "giggle":  style_neutral,
        "laugh":   style_neutral,
    }
    final_audio = []
    for style_name, segment in _iter_styled_segments(text):
        audio = model.inference(
            segment,
            style_embedding=styles[style_name] * embedding_scale,
            output_sample_rate=SR_OUT
        )
        final_audio.append(np.asarray(audio, dtype=np.float32))
    if not final_audio:
        return None
    return (SR_OUT, np.concatenate(final_audio, axis=0))
 # ---------------------------
 # Gradio UI
 # ---------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# 🎙️ StyleTTS2 với Tag mở/đóng ([whisper]...[/whisper], [giggle]...[/giggle], [laugh]...[/laugh])")
     gr.Markdown(
+        "- Upload **1 file neutral** để clone giọng.\n"
+        "- Trong text, bạn có thể dùng tag mở/đóng để giữ style cho cả đoạn.\n"
+        "- Ví dụ: `Xin chào [whisper] tôi sẽ thì thầm trong đoạn này [/whisper] và giờ lại bình thường.`"
     )
     with gr.Row():
         with gr.Column():
             text_in = gr.Textbox(
+                value="Xin chào [laugh] đoạn này cười [/laugh] và bây giờ [whisper] tôi sẽ thì thầm một lúc [/whisper] rồi lại bình thường.",
+                label="Text với tags",
                 lines=4
             )
             neutral_in = gr.File(label="Neutral reference (.wav)", file_types=[".wav"])
+            emb_scale  = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Embedding Scale")
             btn = gr.Button("Generate")
         with gr.Column():
             audio_out = gr.Audio(label="Kết quả", type="numpy")
     btn.click(
         fn=synthesize,
+        inputs=[text_in, neutral_in, emb_scale],
         outputs=audio_out
     )

requirements.txt CHANGED Viewed

@@ -1,6 +1,5 @@
 styletts2
 torch
 soundfile
-librosa
 gradio
 numpy

 styletts2
 torch
 soundfile
 gradio
 numpy