Force FLA mode=chunk to avoid Triton fused kernels on ZeroGPU
Files changed:
- app.py.bak (added): +163 -0
- tts/model/simple_gla.py (changed): +1 -1
- tts/model/simple_gla.py.bak (added): +295 -0
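
In flash-linear-attention, SimpleGatedLinearAttention can run either fused Triton kernels or a chunk-wise implementation; the fused path is what fails on ZeroGPU, so the change below pins mode='chunk'. A hypothetical sanity check, not part of this commit: it assumes the layer exposes the constructor argument as a mode attribute and reuses the _load_model helper from app.py.bak below.

# Hypothetical check (not part of this commit): list every FLA attention layer's
# mode in the loaded decoder to confirm the chunked implementation is in effect.
from app import _load_model  # helper defined in app.py.bak below

pardi = _load_model("cpu")  # ZeroGPU has no CUDA outside @spaces.GPU functions
for name, module in pardi.tts.audio_decoder.named_modules():
    if module.__class__.__name__ == "SimpleGatedLinearAttention":
        print(name, getattr(module, "mode", "<no mode attribute>"))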
app.py.bak (ADDED)

@@ -0,0 +1,163 @@
import os
import gradio as gr
import numpy as np
import torch
import soundfile as sf
import spaces

from huggingface_hub import login
from pardi_speech import PardiSpeech, VelocityHeadSamplingParams  # provided in this repo

MODEL_REPO_ID = os.environ.get("MODEL_REPO_ID", "theodorr/pardi-speech-enfr-forbidden")

HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    try:
        login(token=HF_TOKEN)
        print("✅ Logged to Hugging Face Hub.")
    except Exception as e:
        print("⚠️ HF login failed:", e)

_pardi = None
_sampling_rate = 24000

def _normalize_text(s: str, lang_hint: str = "fr") -> str:
    s = (s or "").strip().lower()
    try:
        import re
        from num2words import num2words
        def repl(m): return num2words(int(m.group()), lang=lang_hint)
        s = re.sub(r"\d+", repl, s)
    except Exception:
        pass
    return s

def _load_model(device: str = "cuda"):
    global _pardi, _sampling_rate
    if _pardi is None:
        _pardi = PardiSpeech.from_pretrained(MODEL_REPO_ID, map_location=device)
        _sampling_rate = getattr(_pardi, "sampling_rate", 24000)
        print(f"✅ PardiSpeech loaded on {device} (sr={_sampling_rate}).")
    return _pardi

def _to_mono_float32(arr: np.ndarray) -> np.ndarray:
    arr = arr.astype(np.float32)
    if arr.ndim == 2:
        arr = arr.mean(axis=1)
    return arr

@spaces.GPU(duration=120)
def synthesize(
    text: str,
    ref_audio,
    ref_text: str,
    steps: int,
    cfg: float,
    cfg_ref: float,
    temperature: float,
    max_seq_len: int,
    seed: int,
    lang_hint: str
):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch.manual_seed(int(seed))

    pardi = _load_model(device)
    txt = _normalize_text(text, lang_hint=lang_hint)

    cache = pardi.tts.audio_decoder.init_cache(int(max_seq_len), device)

    # --- IMPORTANT: VelocityHeadSamplingParams signature ---
    # In the inference notebook, the class expects (cfg_ref, cfg, num_steps) WITHOUT 'temperature'.
    # Try without temperature first, then fall back if the class accepts one.
    try:
        vel_params = VelocityHeadSamplingParams(
            cfg_ref=float(cfg_ref),
            cfg=float(cfg),
            num_steps=int(steps)
        )
    except TypeError:
        vel_params = VelocityHeadSamplingParams(
            cfg_ref=float(cfg_ref),
            cfg=float(cfg),
            num_steps=int(steps),
            temperature=float(temperature)
        )

    # Optional prefix (reference audio and its transcription)
    prefix = None
    if ref_audio is not None:
        if isinstance(ref_audio, str):
            wav, sr = sf.read(ref_audio)
        else:
            sr, wav = ref_audio
        wav = _to_mono_float32(np.array(wav))
        wav_t = torch.from_numpy(wav).to(device)
        import torchaudio
        if sr != pardi.sampling_rate:
            wav_t = torchaudio.functional.resample(wav_t, sr, pardi.sampling_rate)
        wav_t = wav_t.unsqueeze(0)
        with torch.inference_mode():
            prefix_tokens = pardi.patchvae.encode(wav_t)
        prefix = (ref_text or "", prefix_tokens[0])

    print(f"[debug] has_prefix={prefix is not None}, steps={steps}, cfg={cfg}, cfg_ref={cfg_ref}, T={temperature}, max_seq_len={max_seq_len}, seed={seed}")

    try:
        with torch.inference_mode():
            wavs, _ = pardi.text_to_speech(
                [txt],
                prefix,
                max_seq_len=int(max_seq_len),
                velocity_head_sampling_params=vel_params,
                cache=cache
            )
    except Exception as e:
        import traceback, sys
        print("❌ text_to_speech failed:", e, file=sys.stderr)
        traceback.print_exc()
        raise gr.Error(f"Synthèse échouée: {type(e).__name__}: {e}")

    wav = wavs[0].detach().cpu().numpy()
    return (_sampling_rate, wav)

def build_demo():
    with gr.Blocks(title="Lina-speech / pardi-speech Demo") as demo:
        gr.Markdown(
            "## Lina-speech (pardi-speech) – Démo TTS\n"
            "Génère de l'audio à partir de texte, avec ou sans *prefix* (audio de référence).\n"
            "Paramètres avancés: *num_steps*, *CFG*, *température*, *max_seq_len*, *seed*."
        )

        with gr.Row():
            text = gr.Textbox(label="Texte à synthétiser", lines=4, placeholder="Tape ton texte ici…")
        with gr.Accordion("Prefix (optionnel)", open=False):
            ref_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Audio de référence")
            ref_text = gr.Textbox(label="Texte du prefix (si connu)", placeholder="Transcription du prefix (optionnel)")
        with gr.Accordion("Options avancées", open=False):
            with gr.Row():
                steps = gr.Slider(1, 50, value=10, step=1, label="num_steps")
                cfg = gr.Slider(0.5, 3.0, value=1.4, step=0.05, label="CFG (guidance)")
                cfg_ref = gr.Slider(0.5, 3.0, value=1.0, step=0.05, label="CFG (réf.)")
            with gr.Row():
                temperature = gr.Slider(0.1, 2.0, value=1.0, step=0.05, label="Température")
                max_seq_len = gr.Slider(50, 1200, value=300, step=10, label="max_seq_len (tokens audio)")
                seed = gr.Number(value=0, precision=0, label="Seed (reproductibilité)")
            lang_hint = gr.Dropdown(choices=["fr", "en"], value="fr", label="Langue (normalisation)")

        btn = gr.Button("Synthétiser")
        out_audio = gr.Audio(label="Sortie audio", type="numpy")

        demo.queue(default_concurrency_limit=1, max_size=32)

        btn.click(
            fn=synthesize,
            inputs=[text, ref_audio, ref_text, steps, cfg, cfg_ref, temperature, max_seq_len, seed, lang_hint],
            outputs=[out_audio]
        )
    return demo

if __name__ == "__main__":
    demo = build_demo()
    demo.launch()
# retrigger 2025-10-29T16:27:55+01:00
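
For completeness, a hypothetical local smoke test of this entrypoint, not part of the commit: it assumes app.py (the live counterpart of this backup) is importable, the model weights are reachable with the configured token, and that the spaces.GPU decorator degrades gracefully outside a Space; all values are illustrative.

# Hypothetical local check: call synthesize() directly, bypassing Gradio,
# and write the result to disk. Output file name is illustrative.
import soundfile as sf
from app import synthesize

sr, wav = synthesize(
    text="Bonjour, ceci est un test.",
    ref_audio=None,      # no reference prefix
    ref_text="",
    steps=10, cfg=1.4, cfg_ref=1.0,
    temperature=1.0, max_seq_len=300, seed=0,
    lang_hint="fr",
)
sf.write("smoke_test.wav", wav, sr)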
tts/model/simple_gla.py (CHANGED)

@@ -43,7 +43,7 @@ class SimpleGLABlock(nn.Module):
         ffn_expansion_factor: int,
     ):
         super().__init__()
-        self.tmix = SimpleGatedLinearAttention(
+        self.tmix = SimpleGatedLinearAttention(mode='chunk',
             hidden_size=dim,
             num_heads=num_heads,
             layer_idx=layer_idx,
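
The hunk above hard-codes the chunked implementation. A possible variant, sketched here and not part of this commit, keeps the mode switchable through an environment variable (FLA_MODE is a hypothetical name, and the alternative mode value is an assumption about the FLA layer) so the fused path can be re-enabled when running off ZeroGPU.

# Hypothetical variant (not in this commit): pick the FLA mode from the
# environment, defaulting to the ZeroGPU-safe chunked implementation.
import os
from fla.layers.simple_gla import SimpleGatedLinearAttention

def build_gla(dim: int, num_heads: int, layer_idx: int) -> SimpleGatedLinearAttention:
    # 'chunk' stays the default so ZeroGPU never hits the fused Triton kernels;
    # other values (e.g. 'fused_recurrent', if supported) only make sense on full GPUs.
    mode = os.environ.get("FLA_MODE", "chunk")
    return SimpleGatedLinearAttention(
        mode=mode,
        hidden_size=dim,
        num_heads=num_heads,
        layer_idx=layer_idx,
    )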
tts/model/simple_gla.py.bak (ADDED)

@@ -0,0 +1,295 @@
import os
# simple-gla
import torch
import torch.nn.functional as F
from einops import rearrange
from fla.layers.simple_gla import SimpleGatedLinearAttention
from fla.models.utils import Cache
from sympy import num_digits
from torch import nn

from tts.layers.attention import CrossAttention
from tts.layers.ffn import SwiGLU

from .cache_utils import FLACache
from .config import SimpleGLADecoderConfig
from .registry import register_decoder
from .shortconv import ShortConvBlock

if "GRAD_CKPT" in os.environ:

    def maybe_grad_ckpt(f):
        def grad_ckpt_f(*args, **kwargs):
            return torch.utils.checkpoint.checkpoint(
                f, *args, **kwargs, use_reentrant=False
            )

        return grad_ckpt_f
else:

    def maybe_grad_ckpt(f):
        return f


class SimpleGLABlock(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int,
        layer_idx: int,
        expand_k: float,
        expand_v: float,
        use_short_conv: bool,
        ffn_expansion_factor: int,
    ):
        super().__init__()
        self.tmix = SimpleGatedLinearAttention(
            hidden_size=dim,
            num_heads=num_heads,
            layer_idx=layer_idx,
        )
        self.cmix = SwiGLU(dim, ffn_expansion_factor)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(
        self,
        x,
        freqs: torch.Tensor | None = None,
        text_freqs: torch.Tensor | None = None,
        cache: Cache | None = None,
    ):
        # Only enable the cache when it is actually usable (non-empty conv_state).
        use_cache_flag = isinstance(cache, dict) and cache.get("conv_state", None) not in (None, [])
        pkv = cache if use_cache_flag else None

        x = (
            self.tmix(
                self.norm1(x),
                past_key_values=pkv,
                use_cache=use_cache_flag,
            )[0]
            + x
        )
        x = self.cmix(self.norm2(x)) + x
        return x


class DecoderBlockWithOptionalCrossAttention(nn.Module):
    def __init__(self, decoder_block: nn.Module, crossatt: nn.Module | None = None):
        super().__init__()

        self.decoder_block = decoder_block
        self.crossatt = crossatt

    def forward(
        self,
        x: torch.Tensor,
        encoder_output: torch.Tensor | None = None,
        freqs: torch.Tensor | None = None,
        text_freqs: torch.Tensor | None = None,
        cache: Cache | None = None,
        selfatt_mask: torch.Tensor | None = None,
        crossatt_mask: torch.Tensor | list[torch.Tensor] | None = None,
    ) -> torch.Tensor:
        x = self.decoder_block(
            x,
            freqs=freqs,
            cache=cache,
        )
        if type(crossatt_mask) is list:
            crossatt_mask = crossatt_mask[self.decoder_block.tmix.layer_idx]
        if self.crossatt is not None:
            x = x + self.crossatt(
                x,
                k=encoder_output,
                text_freqs=text_freqs,
                mask=crossatt_mask,
                cache=cache,
            )

        return x


@register_decoder("simple_gla")
class SimpleGLADecoder(nn.Module):
    config = SimpleGLADecoderConfig

    def __init__(self, cfg: SimpleGLADecoderConfig):
        super().__init__()

        assert cfg.dim % cfg.num_heads == 0, "num_heads should divide dim"
        assert cfg.blind_crossatt + (cfg.listen_read_crossatt is not None) < 2, (
            "at most one specialized cross-attention"
        )

        self.head_dim = cfg.dim // cfg.num_heads
        self.num_heads = cfg.num_heads

        def simple_gla_block(i):
            conv_layers = [] if cfg.conv_layers is None else cfg.conv_layers
            if i in conv_layers:
                return ShortConvBlock(
                    dim=cfg.dim,
                    kernel_size=4,
                    ffn_expansion_factor=cfg.ffn_expansion_factor,
                    layer_idx=i,
                    use_fast_conv1d=True,
                )

            else:
                return SimpleGLABlock(
                    dim=cfg.dim,
                    num_heads=cfg.num_heads,
                    layer_idx=i,
                    expand_k=cfg.expand_k,
                    expand_v=cfg.expand_v,
                    use_short_conv=cfg.use_short_conv,
                    ffn_expansion_factor=cfg.ffn_expansion_factor,
                )

        def crossatt_block(i):
            if i in cfg.crossatt_layer_idx:
                return CrossAttention(
                    dim=cfg.dim,
                    num_heads=cfg.crossatt_num_heads,
                    dropout=cfg.crossatt_dropout,
                    layer_idx=i,
                )
            else:
                return None

        self.decoder_layers = nn.ModuleList(
            [
                DecoderBlockWithOptionalCrossAttention(
                    simple_gla_block(i),
                    crossatt_block(i),
                )
                for i in range(cfg.num_layers)
            ]
        )

    def forward(
        self,
        encoder_output: torch.Tensor,
        decoder_input: torch.Tensor,
        crossatt_mask: torch.Tensor | list[torch.Tensor] | None = None,
        text_ids: torch.Tensor | None = None,
        cache: FLACache | None = None,
    ):
        x = decoder_input
        text_freqs = None

        for layer in self.decoder_layers:
            x = maybe_grad_ckpt(layer)(
                x,
                encoder_output,
                text_freqs=text_freqs,
                cache=cache,
                crossatt_mask=crossatt_mask,
            )
        return x

    def init_cache(self, max_seq_len, device):
        return FLACache(num_states=len(self.decoder_layers) + 1)

    def init_initial_state(self, batch_size=1, scale=1e-2, device="cpu"):
        return tuple(
            nn.Parameter(
                torch.randn(
                    batch_size,
                    self.num_heads,
                    self.head_dim,
                    self.head_dim,
                    device=device,
                )
                * scale
            )
            for _ in range(len(self.decoder_layers))
        )

    def init_initial_state_lora(self, lora: int = 1, batch_size: int = 1, scale: float = 1e-2, device: str = "cpu"):
        return tuple(
            (
                nn.Parameter(
                    torch.randn(
                        batch_size,
                        self.num_heads,
                        self.head_dim,
                        lora,
                        device=device,
                    )
                    * scale
                ),
                nn.Parameter(
                    torch.randn(
                        batch_size,
                        self.num_heads,
                        lora,
                        self.head_dim,
                        device=device,
                    )
                    * scale
                )
            )
            for _ in range(len(self.decoder_layers))
        )

    def _get_query(self, audio_inputs: torch.Tensor, layer_idx: int):
        assert self.decoder_layers[layer_idx].crossatt is not None
        x = audio_inputs
        for _, layer in zip(range(layer_idx - 1), self.decoder_layers):
            x = layer(x, None)
        return self.decoder_layers[layer_idx].crossatt._query(x)

    def forward_first_n_layers(
        self,
        encoder_output: torch.Tensor,
        decoder_input: torch.Tensor,
        n_first_layers: int,
        crossatt_mask: torch.Tensor | None = None,
        cache: FLACache | None = None,
    ):
        x = decoder_input
        if self.text_freqs_embd is not None:
            text_freqs = torch.arange(encoder_output.shape[1], device=x.device)[None, :]
            text_freqs = self.text_freqs_embd(text_freqs)
        else:
            text_freqs = None

        for layer in self.decoder_layers[:n_first_layers]:
            x = maybe_grad_ckpt(layer)(
                x,
                encoder_output,
                text_freqs=text_freqs,
                cache=cache,
                crossatt_mask=crossatt_mask,
            )
        return x

    def prefill(
        self,
        encoder_output: torch.Tensor,
        decoder_input: torch.Tensor,
        crossatt_mask: torch.Tensor | None = None,
        cache: FLACache | None = None,
    ):
        return self(encoder_output, decoder_input, cache=cache, crossatt_mask=crossatt_mask)

    def decode_one(
        self,
        encoder_output: torch.Tensor,
        decoder_input: torch.Tensor,
        cache: Cache,
        text_freqs: torch.Tensor | None = None,
        crossatt_mask: torch.Tensor | None = None,
    ):
        x = decoder_input
        for layer in self.decoder_layers:
            x = layer(
                x,
                encoder_output,
                text_freqs=text_freqs,
                cache=cache,
                crossatt_mask=crossatt_mask,
            )
        return x
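
As a side note, the backup keeps the GRAD_CKPT-gated maybe_grad_ckpt wrapper used around each decoder layer. A minimal, self-contained sketch of the same pattern on a toy module (illustrative only, not part of this repo):

# Illustrative only: recompute activations in the backward pass instead of
# storing them, toggled by the GRAD_CKPT environment variable as above.
import os
import torch
import torch.utils.checkpoint
from torch import nn

def maybe_grad_ckpt(f):
    if "GRAD_CKPT" not in os.environ:
        return f
    def grad_ckpt_f(*args, **kwargs):
        return torch.utils.checkpoint.checkpoint(f, *args, **kwargs, use_reentrant=False)
    return grad_ckpt_f

layer = nn.Sequential(nn.Linear(16, 16), nn.GELU(), nn.Linear(16, 16))
x = torch.randn(2, 8, 16, requires_grad=True)
y = maybe_grad_ckpt(layer)(x)  # same output; lower activation memory when GRAD_CKPT is set
y.sum().backward()
print(x.grad.shape)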