Spaces:

tonyshark
/

styletts2

Runtime error

App Files Files Community

tonyshark committed on Sep 17, 2025

Commit

1b32626

verified ·

1 Parent(s): 7014f5a

Upload 2 files

Browse files

Files changed (2) hide show

app.py +137 -53
requirements.txt +4 -1

app.py CHANGED Viewed

@@ -1,54 +1,108 @@
 import re
-import torch
 import soundfile as sf
 import gradio as gr
-import numpy as np
 from styletts2 import tts
 SR_OUT = 24000
-# ---------------------------
-# Load StyleTTS2
-# ---------------------------
 model = tts.StyleTTS2()
 # ---------------------------
-# Helper: extract style embedding từ 1 file neutral
-# (trong demo này ta chỉ có neutral, các style khác dùng "neutral" luôn,
-# nhưng có thể giả lập bằng cách áp embedding_scale hoặc fine-tune thêm)
 # ---------------------------
 def extract_neutral(file):
     if file is None:
         return None
     wav, sr = sf.read(file)
     if wav.ndim > 1:
-        wav = wav.mean(axis=1)  # mixdown mono
     wav = torch.tensor(wav).float().unsqueeze(0)
     return model.get_style_embedding(wav, sr)
 # ---------------------------
-# Core synthesis
 # ---------------------------
-# Regex sẽ bắt các tag mở/đóng như [whisper] ... [/whisper]
-TAG_PATTERN = r"(\[/?(?:whisper|giggle|laugh)\])"
-def synthesize(text, neutral_ref, embedding_scale=1.0):
-    style_neutral = extract_neutral(neutral_ref)
     if style_neutral is None:
-        return None
-    # Trong trường hợp bạn có checkpoint style riêng, có thể thay thế ở đây.
-    # Ở demo này, tất cả style = neutral clone (bạn có thể mở rộng).
-    styles = {
-        "neutral": style_neutral,
-        "whisper": style_neutral,
-        "giggle":  style_neutral,
-        "laugh":   style_neutral,
-    }
-    # Parse text theo tag
     tokens = re.split(TAG_PATTERN, text)
     current_style = styles["neutral"]
     stack = []
     final_audio = []
@@ -56,17 +110,16 @@ def synthesize(text, neutral_ref, embedding_scale=1.0):
     for tok in tokens:
         if not tok or tok.isspace():
             continue
-        if tok.startswith("[") and tok.endswith("]"):
-            tag = tok[1:-1].lower().strip("/")
-            if tok.startswith("[/"):  # closing tag
                 if stack:
                     stack.pop()
                 current_style = styles["neutral"] if not stack else styles[stack[-1]]
-            else:  # opening tag
                 stack.append(tag)
-                current_style = styles[tag]
         else:
-            # synth đoạn text với style hiện tại
             audio = model.inference(
                 tok,
                 style_embedding=current_style * embedding_scale,
@@ -75,39 +128,70 @@ def synthesize(text, neutral_ref, embedding_scale=1.0):
             final_audio.append(audio.astype(np.float32))
     if not final_audio:
-        return None
-    audio_out = np.concatenate(final_audio, axis=0)
-    return (SR_OUT, audio_out)
 # ---------------------------
 # Gradio UI
 # ---------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# 🎙️ StyleTTS2 với Tag mở/đóng ([whisper]...[/whisper], [giggle]...[/giggle], [laugh]...[/laugh])")
-    gr.Markdown(
-        "- Upload **1 file neutral** để clone giọng.\n"
-        "- Trong text, bạn có thể dùng tag mở/đóng để giữ style cho cả đoạn.\n"
-        "- Ví dụ: `Xin chào [whisper] tôi sẽ thì thầm trong đoạn này [/whisper] và giờ lại bình thường.`"
-    )
     with gr.Row():
         with gr.Column():
-            text_in = gr.Textbox(
-                value="Xin chào [laugh] đoạn này cười [/laugh] và bây giờ [whisper] tôi sẽ thì thầm một lúc [/whisper] rồi lại bình thường.",
-                label="Text với tags",
-                lines=4
-            )
             neutral_in = gr.File(label="Neutral reference (.wav)", file_types=[".wav"])
-            emb_scale  = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Embedding Scale")
             btn = gr.Button("Generate")
         with gr.Column():
-            audio_out = gr.Audio(label="Kết quả", type="numpy")
-    btn.click(
-        fn=synthesize,
-        inputs=[text_in, neutral_in, emb_scale],
-        outputs=audio_out
-    )
 demo.launch()

 import re
+import numpy as np
 import soundfile as sf
+import torch
 import gradio as gr
+import matplotlib.pyplot as plt
 from styletts2 import tts
+from scipy.signal import fftconvolve
+import librosa
 SR_OUT = 24000
 model = tts.StyleTTS2()
 # ---------------------------
+# Helper
 # ---------------------------
 def extract_neutral(file):
     if file is None:
         return None
     wav, sr = sf.read(file)
     if wav.ndim > 1:
+        wav = wav.mean(axis=1)
     wav = torch.tensor(wav).float().unsqueeze(0)
     return model.get_style_embedding(wav, sr)
def load_wav(path, sr_target=SR_OUT):
    """Read an audio file as a 1-D float32 signal at *sr_target* Hz.

    Args:
        path: File to read with ``soundfile.read``.
        sr_target: Desired sample rate; the signal is resampled with
            ``librosa.resample`` only when the file's rate differs.

    Returns:
        A ``(samples, sample_rate)`` tuple where ``samples`` is float32.
    """
    samples, rate = sf.read(path)
    # 2-D input is collapsed to 1-D by averaging over axis 1.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    if rate == sr_target:
        return samples.astype(np.float32), rate
    resampled = librosa.resample(
        samples.astype(np.float32), orig_sr=rate, target_sr=sr_target
    )
    return resampled.astype(np.float32), sr_target
def apply_reverb(wav, ir_path):
    """Convolve *wav* with the impulse response stored at *ir_path*.

    The impulse response is loaded at ``SR_OUT``. A full convolution is
    used, so the result is longer than *wav* (it includes the tail).
    """
    impulse_response = load_wav(ir_path, sr_target=SR_OUT)[0]
    wet = fftconvolve(wav, impulse_response, mode="full")
    return wet
def add_noise(wav, noise_path, snr_db=10):
    """Mix background noise from *noise_path* into *wav* at a target SNR.

    Args:
        wav: 1-D NumPy array holding the clean signal.
        noise_path: Audio file providing the noise; loaded at ``SR_OUT``.
        snr_db: Desired signal-to-noise ratio in decibels.

    Returns:
        An array of the same length as *wav*. When no meaningful mix is
        possible (empty signal, empty noise file, silent signal or silent
        noise) an unmodified copy of *wav* is returned instead of NaN/inf
        samples or a ``ZeroDivisionError``.
    """
    noise, _ = load_wav(noise_path, sr_target=SR_OUT)
    if len(wav) == 0 or len(noise) == 0:
        return wav.copy()

    # np.resize repeats the noise cyclically when it is too short and
    # truncates it when it is too long, giving exactly len(wav) samples.
    noise = np.resize(noise, len(wav))

    sig_power = np.mean(wav ** 2)
    noise_power = np.mean(noise ** 2)
    if sig_power == 0 or noise_power == 0:
        # Silent signal -> the scale would be 0; silent noise -> the scale
        # would be a division by zero. Either way there is nothing to add.
        return wav.copy()

    # Solve sig_power / (scale**2 * noise_power) == 10 ** (snr_db / 10).
    scale = np.sqrt(sig_power / (10 ** (snr_db / 10) * noise_power))
    return wav + noise * scale
def bandlimit_phone(wav, sr=SR_OUT):
    """Band-limit *wav* to the telephone voice band (roughly 300-3400 Hz).

    A 4th-order Butterworth band-pass is applied so the result actually
    sounds like a narrow-band phone line; *sr* is used to place the cut-off
    frequencies correctly.

    Args:
        wav: 1-D array of audio samples.
        sr: Sample rate of *wav* in Hz.

    Returns:
        The filtered signal, same length as *wav*. Floating-point input
        keeps its dtype.

    Raises:
        ValueError: If *sr* is too low to contain the 300 Hz lower edge.
    """
    # Local import: only this function needs the IIR design helpers.
    from scipy.signal import butter, sosfilt

    low_hz = 300.0
    # Keep the upper edge strictly below Nyquist for low sample rates.
    high_hz = min(3400.0, 0.99 * (sr / 2.0))
    if high_hz <= low_hz:
        raise ValueError(f"sample rate {sr} Hz is too low for a telephone band-pass")

    samples = np.asarray(wav)
    sos = butter(4, [low_hz, high_hz], btype="bandpass", fs=sr, output="sos")
    filtered = sosfilt(sos, samples)
    if np.issubdtype(samples.dtype, np.floating):
        filtered = filtered.astype(samples.dtype, copy=False)
    return filtered
def plot_waveforms(clean, processed, sr=SR_OUT):
    """Draw the clean and processed waveforms on two stacked axes.

    The figure is built directly from ``matplotlib.figure.Figure`` rather
    than through ``pyplot``. ``pyplot`` keeps every figure it creates in a
    global registry until it is explicitly closed, which leaks memory when
    this runs once per request inside a long-lived server.

    Args:
        clean: 1-D array with the unprocessed signal.
        processed: 1-D array with the signal after environment effects.
        sr: Sample rate in Hz, used to convert sample indices to seconds.

    Returns:
        The ``matplotlib.figure.Figure`` containing both plots.
    """
    # Local import keeps this block self-contained.
    from matplotlib.figure import Figure

    fig = Figure(figsize=(10, 4))
    top, bottom = fig.subplots(2, 1, sharex=True)

    # The two signals may differ in length, so each gets its own time axis.
    t_clean = np.arange(len(clean)) / sr
    t_proc = np.arange(len(processed)) / sr

    top.plot(t_clean, clean, color="blue")
    top.set_title("Waveform sạch (StyleTTS2)")
    bottom.plot(t_proc, processed, color="red")
    bottom.set_title("Waveform sau khi áp môi trường/noise")
    bottom.set_xlabel("Thời gian (s)")
    fig.tight_layout()
    return fig
 # ---------------------------
+# Tag list
 # ---------------------------
# Supported style tags: tag name -> human-readable label shown in the UI.
TAG_LIST = {
    "laugh": "😆 Cười thoải mái",
    "whisper": "🤫 Thì thầm",
    "naughty": "😏 Tinh nghịch",
    "giggle": "😂 Cười rúc rích",
    "tease": "😉 Trêu chọc",
    "smirk": "😼 Đắc ý",
    "surprise": "😲 Ngạc nhiên",
    "shock": "😱 Hoảng hốt",
    "romantic": "❤️ Lãng mạn",
    "shy": "🫣 Bẽn lẽn",
    "excited": "🤩 Phấn khích",
    "curious": "🧐 Tò mò",
    "discover": "✨ Phát hiện",
    "blush": "🌸 Ngượng ngùng",
    "angry": "😡 Giận dữ",
    "sad": "😢 Buồn",
    "happy": "😊 Vui vẻ",
    "fear": "😨 Sợ hãi",
    "confident": "😎 Tự tin",
    "serious": "😐 Nghiêm túc",
    "tired": "🥱 Mệt mỏi",
    "cry": "😭 Khóc",
    "love": "😍 Yêu thương",
    "disgust": "🤢 Ghê tởm",
}
# Matches an opening or closing tag such as <whisper> or </whisper>. The
# capturing group makes re.split keep the tags as separate tokens. The inline
# (?i) flag makes matching case-insensitive, so <Whisper> is treated as a tag
# instead of being passed through as text to be spoken; the consumer already
# lowercases matched tags. Names are escaped so a future tag containing a
# regex metacharacter cannot corrupt the pattern.
TAG_PATTERN = r"(?i)(</?(?:" + "|".join(re.escape(tag) for tag in TAG_LIST) + r")>)"
+# ---------------------------
+# Core synthesis
+# ---------------------------
+def synthesize(text, env, neutral_file, embedding_scale=1.0, snr_db=10):
+    style_neutral = extract_neutral(neutral_file)
     if style_neutral is None:
+        return None, None, None
+    styles = {k: style_neutral for k in ["neutral"] + list(TAG_LIST.keys())}
     tokens = re.split(TAG_PATTERN, text)
     current_style = styles["neutral"]
     stack = []
     final_audio = []
     for tok in tokens:
         if not tok or tok.isspace():
             continue
+        if tok.startswith("<") and tok.endswith(">"):
+            tag = tok.strip("<>/").lower()
+            if tok.startswith("</"):
                 if stack:
                     stack.pop()
                 current_style = styles["neutral"] if not stack else styles[stack[-1]]
+            else:
                 stack.append(tag)
+                current_style = styles.get(tag, style_neutral)
         else:
             audio = model.inference(
                 tok,
                 style_embedding=current_style * embedding_scale,
             final_audio.append(audio.astype(np.float32))
     if not final_audio:
+        return None, None, None
+    clean_audio = np.concatenate(final_audio, axis=0)
+    processed = clean_audio.copy()
+    # Apply environment
+    if env == "Church":
+        processed = apply_reverb(processed, "ir_church.wav")
+    elif env == "Hall":
+        processed = apply_reverb(processed, "ir_hall.wav")
+    elif env == "Cafe":
+        processed = add_noise(processed, "noise_cafe.wav", snr_db=snr_db)
+    elif env == "Street":
+        processed = add_noise(processed, "noise_street.wav", snr_db=snr_db)
+    elif env == "Office":
+        processed = add_noise(processed, "noise_office.wav", snr_db=snr_db)
+    elif env == "Supermarket":
+        processed = add_noise(processed, "noise_supermarket.wav", snr_db=snr_db)
+    elif env == "Phone":
+        processed = bandlimit_phone(processed, sr=SR_OUT)
+    fig = plot_waveforms(clean_audio, processed, sr=SR_OUT)
+    return (SR_OUT, processed.astype(np.float32)), fig, (SR_OUT, clean_audio.astype(np.float32))
+# ---------------------------
+# Examples
+# ---------------------------
# Sample prompts demonstrating the <tag>...</tag> markup. The first entry is
# the textbox default and the whole list feeds the quick-example picker.
EXAMPLES = [
    "Xin chào <whisper> tôi nói nhỏ </whisper> rồi <laugh> bật cười </laugh>.",
    "Tôi cảm thấy <happy> vui </happy> nhưng cũng <sad> buồn </sad>.",
    "Khi <surprise> bất ngờ </surprise> tôi <shock> hoảng hốt </shock>.",
]
 # ---------------------------
 # Gradio UI
 # ---------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# 🎙️ StyleTTS2 + Tags + Environment + Noise Level + Waveform Preview")
+    with gr.Accordion("📑 Danh sách Tags + Emoji", open=False):
+        md = "| Tag | Ý nghĩa |\n|-----|----------|\n"
+        for k, v in TAG_LIST.items():
+            md += f"| `<{k}>...</{k}>` | {v} |\n"
+        gr.Markdown(md)
     with gr.Row():
         with gr.Column():
+            text_in = gr.Textbox(value=EXAMPLES[0], label="Text với tags", lines=4)
             neutral_in = gr.File(label="Neutral reference (.wav)", file_types=[".wav"])
+            env_in = gr.Dropdown(
+                choices=["Neutral", "Church", "Hall", "Cafe", "Street", "Phone", "Office", "Supermarket"],
+                value="Neutral", label="Environment"
+            )
+            snr_slider = gr.Slider(0, 30, value=10, step=1, label="Noise SNR (dB)")
+            emb_scale = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Embedding Scale")
             btn = gr.Button("Generate")
+            gr.Examples(examples=[[ex] for ex in EXAMPLES], inputs=[text_in], label="Ví dụ nhanh")
         with gr.Column():
+            audio_out = gr.Audio(label="Output (processed)", type="numpy")
+            clean_out = gr.Audio(label="Waveform sạch (StyleTTS2)", type="numpy")
+            wave_plot = gr.Plot(label="So sánh Waveform")
+    btn.click(fn=synthesize,
+              inputs=[text_in, env_in, neutral_in, emb_scale, snr_slider],
+              outputs=[audio_out, wave_plot, clean_out])
 demo.launch()

requirements.txt CHANGED Viewed

@@ -1,5 +1,8 @@
 styletts2
 torch
 soundfile
-gradio
 numpy

 styletts2
 torch
 soundfile
 numpy
+scipy
+gradio
+librosa
+matplotlib