Spaces:

tonyshark
/

styletts2

Runtime error

App Files Files Community

tonyshark committed on Sep 17, 2025

Commit

5551445

verified ·

1 Parent(s): 4af036a

Upload 2 files

Browse files

Files changed (2) hide show

app.py +206 -88
requirements.txt +4 -3

app.py CHANGED Viewed

@@ -1,95 +1,213 @@
-import torch
-import soundfile as sf
 import gradio as gr
-from styletts2 import tts
-# ---------------------------
-# Load StyleTTS2
-# ---------------------------
-model = tts.StyleTTS2()
-# ---------------------------
-# Helper: extract style embedding from uploaded file
-# ---------------------------
-def extract_style(file):
-    if file is None:
-        return None
-    wav, sr = sf.read(file)
-    wav = torch.tensor(wav).float().unsqueeze(0)
-    return model.get_style_embedding(wav, sr)
-# ---------------------------
-# Core synthesis
-# ---------------------------
-def synthesize(text, giggle_file, whisper_file, laugh_file,
-               w_giggle=0.33, w_whisper=0.33, w_laugh=0.34,
-               embedding_scale=1.0):
-    # Load embeddings if provided
-    giggle = extract_style(giggle_file) if giggle_file else None
-    whisper = extract_style(whisper_file) if whisper_file else None
-    laugh = extract_style(laugh_file) if laugh_file else None
-    # Collect
-    styles, weights = [], []
-    if giggle is not None:
-        styles.append(giggle); weights.append(w_giggle)
-    if whisper is not None:
-        styles.append(whisper); weights.append(w_whisper)
-    if laugh is not None:
-        styles.append(laugh); weights.append(w_laugh)
-    if not styles:
-        return None  # no refs uploaded
-    # Normalize weights
-    total = sum(weights)
-    weights = [w/total for w in weights]
-    # Weighted sum of embeddings
-    blended = sum(w * s for w, s in zip(weights, styles))
-    blended = blended * embedding_scale
-    # Run inference
-    audio = model.inference(
-        text,
-        style_embedding=blended,
-        output_sample_rate=24000
     )
-    return (24000, audio)
-# ---------------------------
-# Gradio UI
-# ---------------------------
-with gr.Blocks() as demo:
-    gr.Markdown("# 🎙️ StyleTTS2 Expression Blending Demo (Giggles + Whisper + Laugh)")
     with gr.Row():
-        with gr.Column():
-            text_in = gr.Textbox(
-                value="This is a playful example combining giggles, whispers, and laughter.",
-                label="Text to Synthesize"
-            )
-            giggle_in = gr.File(label="Giggles reference (.wav)", file_types=[".wav"])
-            whisper_in = gr.File(label="Whisper reference (.wav)", file_types=[".wav"])
-            laugh_in = gr.File(label="Laugh reference (.wav)", file_types=[".wav"])
-            w_giggle_in = gr.Slider(0, 1, value=0.33, step=0.05, label="Giggle weight")
-            w_whisper_in = gr.Slider(0, 1, value=0.33, step=0.05, label="Whisper weight")
-            w_laugh_in = gr.Slider(0, 1, value=0.34, step=0.05, label="Laugh weight")
-            scale_in = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Embedding Scale")
-            btn = gr.Button("Generate")
-        with gr.Column():
-            audio_out = gr.Audio(label="Generated Audio", type="numpy")
-    btn.click(
-        fn=synthesize,
-        inputs=[text_in, giggle_in, whisper_in, laugh_in,
-                w_giggle_in, w_whisper_in, w_laugh_in, scale_in],
-        outputs=audio_out
-    )
-demo.launch()

+import os
+import re
+import io
+import numpy as np
 import gradio as gr
+import soundfile as sf
+# Coqui TTS (có hỗ trợ StyleTTS2 trên một số checkpoint)
+from TTS.api import TTS
+# ====== Model configuration ======
+# Candidate model ids are tried in order and the first one that loads wins.
+# If StyleTTS2 is available through Coqui TTS in this environment, one of the
+# first ids is used; otherwise replace them with a checkpoint you have
+# (e.g. an HF repo path), or rely on xtts_v2 as the fallback.
+MODEL_ID_CANDIDATES = [
+    # Names that may be available (depends on the environment / branch)
+    "tts_models/en/ljspeech/style_tts2",   # commonly seen
+    "styletts2",                           # alias used by some builds
+    "tts_models/multilingual/multi-dataset/xtts_v2",  # fallback: XTTS v2
+]
+tts = None
+last_error = None
+for mid in MODEL_ID_CANDIDATES:
+    try:
+        tts = TTS(mid)
+    except Exception as e:
+        # Remember why this candidate failed and move on to the next one.
+        last_error = str(e)
+        tts = None
+    else:
+        # Loaded successfully: record which id is in use and stop searching.
+        MODEL_ID = mid
+        break
+if tts is None:
+    # No candidate could be loaded; surface the most recent failure.
+    raise RuntimeError(
+        f"Không tải được bất kỳ model nào trong {MODEL_ID_CANDIDATES}. "
+        f"Lỗi gần nhất: {last_error}\n"
+        f"Hãy chỉnh lại MODEL_ID_CANDIDATES sang checkpoint StyleTTS2 bạn có."
     )
+# ====== Tag handling ======
+# Spoken-style phrase associated with each supported tag; style_prefix()
+# turns it into the text that is prepended to a tagged line before synthesis.
+TAG_STYLES = {
+    "<laugh>":   "with a playful laugh, ",
+    "<whisper>": "in a soft whisper, ",
+    "<naughty>": "in a cheeky mischievous tone, ",
+    "<giggle>":  "while giggling, ",
+    "<tease>":   "teasingly, ",
+    "<smirk>":   "with a sly smirk, ",
+    "<surprise>": "with surprise, ",
+    "<shock>":   "shocked, ",
+    "<romantic>": "in a warm romantic tone, ",
+    "<shy>":     "shyly, ",
+    "<excited>": "excitedly, ",
+    "<curious>": "curiously, ",
+    "<discover>":"as if discovering something new, ",
+    "<blush>":   "blushingly, ",
+}
+TAG_GAIN = {
+    # Simple per-tag intensity adjustment (amplitude multiplier)
+    "<whisper>": 0.55,
+    "<romantic>": 0.9,
+    "<shy>": 0.8,
+    "<excited>": 1.1,
+    "<laugh>": 1.05,
+    "<naughty>": 1.0,
+    "<giggle>": 1.05,
+    "<surprise>": 1.0,
+    "<shock>": 1.0,
+    "<tease>": 1.0,
+    "<smirk>": 1.0,
+    "<curious>": 1.0,
+    "<discover>": 1.0,
+    "<blush>": 0.9,
+}
+# Extra filler text appended after the style phrase for laugh-type tags.
+LAUGH_FILLERS = {
+    "<laugh>": " haha, ",
+    "<giggle>": " hehe, ",
+}
+# Matches a tag made of ASCII letters at the very start of a line,
+# case-insensitively; group 1 is the tag name without the angle brackets.
+TAG_PATTERN = re.compile(r"^<([a-z]+)>", re.IGNORECASE)
+def parse_lines(text: str):
+    """
+    Split the text into non-empty lines, each optionally led by a <...> tag.
+
+    Returns a list of (tag, content) tuples. The tag is lower-cased and
+    wrapped in angle brackets; it is None when the line has no leading tag.
+    Blank lines are dropped and surrounding whitespace is stripped.
+    """
+    parsed = []
+    for stripped in (raw.strip() for raw in text.splitlines()):
+        if not stripped:
+            continue
+        found = TAG_PATTERN.match(stripped)
+        if found is None:
+            # Untagged line: keep the whole text as content.
+            parsed.append((None, stripped))
+            continue
+        name = found.group(1).lower()
+        remainder = stripped[found.end():].strip()
+        parsed.append((f"<{name}>", remainder))
+    return parsed
+def style_prefix(tag: str) -> str:
+    """
+    Build the expressive text prefix for a tag.
+
+    Returns the tag's style phrase, followed by a light 'haha/hehe' filler
+    for laugh/giggle tags, normalized to end with exactly one space.
+    Returns "" for tags that have no entry in TAG_STYLES.
+    """
+    if tag not in TAG_STYLES:
+        return ""
+    phrase = TAG_STYLES[tag] + LAUGH_FILLERS.get(tag, "")
+    return phrase.strip() + " "
+def apply_gain(wav: np.ndarray, tag: str) -> np.ndarray:
+    """
+    Scale the waveform by the tag's amplitude factor, clipped to [-1, 1].
+
+    The waveform is returned unchanged when the tag has no entry in TAG_GAIN.
+    """
+    if tag not in TAG_GAIN:
+        return wav
+    scaled = wav * TAG_GAIN[tag]
+    return np.clip(scaled, -1.0, 1.0)
+def synth_one_line(tag: str, content: str, speaker_wav=None, language=None, sample_rate=22050):
+    """
+    Synthesize a single line of text with the loaded TTS model.
+
+    Args:
+        tag: Leading tag of the line (e.g. "<whisper>"), or None/"" if absent.
+        content: Line text with the tag already removed.
+        speaker_wav: Optional reference audio for models that clone a voice.
+        language: Optional language code for multilingual models.
+        sample_rate: Fallback rate, returned only when the model does not
+            report its own output rate.
+
+    Returns:
+        (wav, sr): the float32 waveform with the tag's gain applied, and the
+        sample rate it must be played at.
+    """
+    # Expressive prefix + line content.
+    txt = (style_prefix(tag) if tag else "") + content
+
+    # Forward only the options that are actually set, so single-speaker /
+    # single-language models get a plain text call.
+    kwargs = {}
+    if speaker_wav is not None:
+        kwargs["speaker_wav"] = speaker_wav
+    elif getattr(tts, "is_multi_speaker", False):
+        # NOTE(review): multi-speaker models (e.g. the XTTS v2 fallback) reject
+        # calls without a speaker; default to the first built-in voice.
+        # Assumes the Coqui `TTS.speakers` property lists speaker names - confirm.
+        names = getattr(tts, "speakers", None) or []
+        if names:
+            kwargs["speaker"] = list(names)[0]
+    if language is not None:
+        kwargs["language"] = language
+
+    try:
+        wav = tts.tts(txt, **kwargs)
+    except TypeError:
+        # Model wrapper does not accept these keyword arguments at all.
+        wav = tts.tts(txt)
+
+    wav = np.asarray(wav, dtype=np.float32)
+    wav = apply_gain(wav, tag if tag else "")
+
+    # Report the model's real output rate instead of a hard-coded 22050 Hz;
+    # otherwise e.g. 24 kHz audio is played back at the wrong pitch and speed.
+    # Assumes Coqui exposes it as `tts.synthesizer.output_sample_rate` - confirm.
+    synthesizer = getattr(tts, "synthesizer", None)
+    model_sr = getattr(synthesizer, "output_sample_rate", None)
+    if model_sr:
+        sample_rate = int(model_sr)
+    return wav, sample_rate
+def concat_waveforms(chunks, sr=22050, pad_ms=140):
+    """
+    Join audio chunks, inserting a short silence between consecutive chunks
+    so that line breaks sound natural.
+
+    Returns half a second of float32 silence when there are no chunks.
+    """
+    silence = np.zeros(int(sr * pad_ms / 1000.0), dtype=np.float32)
+    parts = []
+    for idx, chunk in enumerate(chunks):
+        if idx:
+            # Gap before every chunk except the first one.
+            parts.append(silence)
+        parts.append(chunk)
+    if not parts:
+        return np.zeros(int(sr * 0.5), dtype=np.float32)
+    return np.concatenate(parts)
+def tts_from_tagged_text(text: str, language: str):
+    """
+    Synthesize tagged multi-line text into one waveform.
+
+    Each non-empty line is parsed for a leading tag, synthesized on its own,
+    and the resulting chunks are joined with short pauses.
+
+    Args:
+        text: Input text, one utterance per line, optionally led by a tag.
+        language: Language code used only for multilingual models; an empty
+            value falls back to "en".
+
+    Returns:
+        (sr, waveform): sample rate and a float32 numpy array, the format a
+        Gradio Audio output with type="numpy" accepts.
+
+    Raises:
+        ValueError: if the text contains no non-empty lines.
+    """
+    lines = parse_lines(text)
+    if not lines:
+        raise ValueError("Văn bản trống. Hãy nhập ít nhất một dòng.")
+    # Pick a language only for multilingual models (XTTS v2 fallback).
+    # MODEL_ID is a module-level name, so it must be read from globals();
+    # locals() inside this function never contains it.
+    model_id = globals().get("MODEL_ID", "") or ""
+    lang = None
+    if "xtts_v2" in model_id or "multilingual" in model_id:
+        lang = language if language else "en"
+    pieces = []
+    sr = 22050
+    for tag, content in lines:
+        wav, sr = synth_one_line(tag, content, language=lang, sample_rate=sr)
+        pieces.append(wav)
+    full = concat_waveforms(pieces, sr=sr)
+    # Return the raw samples: a (rate, WAV-bytes) tuple is not a value the
+    # numpy-typed Gradio Audio component can convert.
+    return (sr, full)
+# ====== Gradio UI ======
+# Sample input used as the initial value of the text box; every line starts
+# with one of the supported tags.
+EXAMPLE_TEXT = """<whisper> I slowly opened the library door at midnight.
+<laugh> My friend chuckled, “We must be crazy to sneak in here!”
+<surprise> A book fell with a loud thud; we both jumped.
+<naughty> I smirked, “You did that on purpose, didn’t you?”
+<romantic> In the blue moonlight, your eyes were brighter than any lantern.
+"""
+# Markdown help text embedded in the page header below the model name.
+DESC_MD = """
+### 📖 How to use expressive tags
+Start important lines with a tag to add emotion & style.
+Supported tags: `<laugh>`, `<whisper>`, `<naughty>`, `<giggle>`, `<tease>`, `<smirk>`, `<surprise>`, `<shock>`, `<romantic>`, `<shy>`, `<excited>`, `<curious>`, `<discover>`, `<blush>`.
+"""
+def infer(text, language):
+    """Gradio click handler: synthesize the tagged text and return the
+    (sample_rate, audio) pair produced by tts_from_tagged_text."""
+    rate, payload = tts_from_tagged_text(text, language)
+    return (rate, payload)
+# Page layout: the first row holds the text input and the audio output,
+# the second row holds the language selector and the trigger button.
+with gr.Blocks(title="StyleTTS2 Tagged TTS") as demo:
+    # Header shows which candidate model id was actually loaded.
+    gr.Markdown(f"# 🎙️ StyleTTS2 Tagged TTS\n**Model:** `{MODEL_ID}`\n\n{DESC_MD}")
+    with gr.Row():
+        txt = gr.Textbox(
+            label="Tagged story text",
+            value=EXAMPLE_TEXT,
+            lines=10,
+            placeholder="Begin each important line with a tag, e.g. <whisper> ...",
+        )
+        out = gr.Audio(label="Audio Output", type="numpy")
     with gr.Row():
+        # An empty choice is the default value of the language selector.
+        lang = gr.Dropdown(
+            label="Language (for multilingual fallback models like XTTS v2)",
+            choices=["", "en", "vi", "es", "fr", "de", "it", "pt", "ja", "ko", "zh"],
+            value="",
+        )
+        btn = gr.Button("Synthesize")
+    # Clicking the button calls infer(text, language) and routes its result
+    # to the audio output.
+    btn.click(infer, inputs=[txt, lang], outputs=[out])
+# Start the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
-styletts2
-soundfile
 gradio
-torch

+TTS>=0.22.0
+torch
 gradio
+soundfile
+numpy