Commit: added watch

app.py CHANGED
@@ -8,6 +8,14 @@ import spaces
 # FLA: force convolutions onto the PyTorch backend (no Triton)
 os.environ.setdefault("FLA_CONV_BACKEND", "torch")
 os.environ.setdefault("FLA_USE_FAST_OPS", "0")
+
+# Better FP32 performance on compatible GPUs
+torch.backends.cuda.matmul.allow_tf32 = True
+try:
+    torch.set_float32_matmul_precision("high")
+except Exception:
+    pass
+
 from huggingface_hub import login
 from pardi_speech import PardiSpeech, VelocityHeadSamplingParams  # present in this repo
 
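Note on the new TF32 block: torch.set_float32_matmul_precision only exists in PyTorch >= 1.12, which is why the commit wraps it in try/except, and TF32 itself only takes effect on Ampere-class or newer GPUs. A minimal sketch for reading back the active settings at runtime (standalone, not part of this Space):

import torch

# Read back the two knobs the commit sets; "high" enables TF32 matmuls
print("allow_tf32:", torch.backends.cuda.matmul.allow_tf32)
print("matmul precision:", torch.get_float32_matmul_precision())  # "highest" | "high" | "medium"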
@@ -24,17 +32,20 @@ if HF_TOKEN:
 _pardi = None
 _sampling_rate = 24000
 
+
 def _normalize_text(s: str, lang_hint: str = "fr") -> str:
     s = (s or "").strip().lower()
     try:
         import re
         from num2words import num2words
+
         def repl(m): return num2words(int(m.group()), lang=lang_hint)
         s = re.sub(r"\d+", repl, s)
     except Exception:
         pass
     return s
 
+
 def _load_model(device: str = "cuda"):
     global _pardi, _sampling_rate
     if _pardi is None:
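For context, _normalize_text lowercases the input and spells digits out with num2words so the model never sees raw numerals, silently skipping that step when num2words is unavailable. A quick illustration of the behaviour it relies on (the example string is hypothetical):

import re
from num2words import num2words

s = "rendez-vous à 15 heures"
s = re.sub(r"\d+", lambda m: num2words(int(m.group()), lang="fr"), s)
print(s)  # expected: "rendez-vous à quinze heures"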
@@ -43,15 +54,18 @@ def _load_model(device: str = "cuda"):
         print(f"✅ PardiSpeech loaded on {device} (sr={_sampling_rate}).")
     return _pardi
 
+
 def _to_mono_float32(arr: np.ndarray) -> np.ndarray:
     arr = arr.astype(np.float32)
     if arr.ndim == 2:
         arr = arr.mean(axis=1)
     return arr
 
-
+
+@spaces.GPU(duration=200)  # 200s per call so other users aren't starved (can be raised if needed)
 def synthesize(
     text: str,
+    debug: bool,
     ref_audio,
     ref_text: str,
     steps: int,
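The new @spaces.GPU(duration=200) decorator is what attaches a GPU on ZeroGPU hardware: CUDA is only guaranteed inside the decorated function, and duration raises the per-call time budget above the default. A reduced sketch of the pattern (function name and body are illustrative):

import spaces
import torch

@spaces.GPU(duration=200)  # request up to ~200 s of GPU time per call
def run_on_gpu(x: torch.Tensor) -> torch.Tensor:
    # On ZeroGPU Spaces, CUDA should be available inside the decorated call
    return x.to("cuda").sum().cpu()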
@@ -60,83 +74,137 @@ def synthesize(
     temperature: float,
     max_seq_len: int,
     seed: int,
-    lang_hint: str
+    lang_hint: str,
+    progress=gr.Progress(track_tqdm=True),
 ):
-
-
-
-
-
-
+    import io
+    import time
+    import traceback
+    from contextlib import redirect_stdout, redirect_stderr
+
+    # --- capture logs for the UI ---
+    logbuf = io.StringIO()
+    t0 = time.perf_counter()
+
+    # Watchdog: raise a readable error before a possible ZeroGPU kill
+    MAX_WALLTIME_S = 110
+
+    def maybe_timeout_checkpoint(stage: str):
+        dur = time.perf_counter() - t0
+        print(f"[debug] stage={stage} t={dur:.2f}s")
+        if dur > MAX_WALLTIME_S:
+            raise TimeoutError(f"Watchdog: dépassement {dur:.1f}s avant kill ZeroGPU (étape: {stage})")
+
+    with redirect_stdout(logbuf), redirect_stderr(logbuf):
+        try:
+            progress(0.02, desc="Init")
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            torch.manual_seed(int(seed))
+
+            # Synchronous CUDA launches so errors surface where they occur
+            os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")
+
+            maybe_timeout_checkpoint("load_model")
+            progress(0.08, desc="Chargement du modèle")
+            pardi = _load_model(device)
+            if device == "cuda":
+                torch.cuda.synchronize()
+
+            maybe_timeout_checkpoint("normalize")
+            progress(0.12, desc="Préparation du texte")
+            txt = _normalize_text(text, lang_hint=lang_hint)
+
+            # Clamp to bound the runtime
+            steps = int(min(max(1, steps), 16))
+            max_seq_len = int(min(max(50, max_seq_len), 600))
+
+            progress(0.16, desc="Paramètres sampling")
+            # IMPORTANT: VelocityHeadSamplingParams signature
+            try:
+                vel_params = VelocityHeadSamplingParams(
+                    cfg_ref=float(cfg_ref),
+                    cfg=float(cfg),
+                    num_steps=int(steps)
+                )
+            except TypeError:
+                vel_params = VelocityHeadSamplingParams(
+                    cfg_ref=float(cfg_ref),
+                    cfg=float(cfg),
+                    num_steps=int(steps),
+                    temperature=float(temperature)
+                )
+
+            # Optional prefix
+            maybe_timeout_checkpoint("prefix")
+            progress(0.22, desc="Prefix (optionnel)")
+            prefix = None
+            if ref_audio is not None:
+                if isinstance(ref_audio, str):
+                    wav, sr = sf.read(ref_audio)
+                else:
+                    sr, wav = ref_audio
+                wav = _to_mono_float32(np.array(wav))
+                wav_t = torch.from_numpy(wav).to(device)
+                import torchaudio
+                if sr != pardi.sampling_rate:
+                    wav_t = torchaudio.functional.resample(wav_t, sr, pardi.sampling_rate)
+                wav_t = wav_t.unsqueeze(0)
+                with torch.inference_mode():
+                    prefix_tokens = pardi.patchvae.encode(wav_t)
+                prefix = (ref_text or "", prefix_tokens[0])
+
+            print(f"[debug] has_prefix={prefix is not None}, steps={steps}, cfg={cfg}, cfg_ref={cfg_ref}, T={temperature}, max_seq_len={max_seq_len}, seed={seed}")
+            maybe_timeout_checkpoint("tts_start")
+            progress(0.28, desc="Synthèse…")
+
+            if device == "cuda":
+                torch.cuda.synchronize()
+
+            with torch.inference_mode():
+                # No cache passed (keep the "safe" GLA path on the model side)
+                wavs, _ = pardi.text_to_speech(
+                    [txt],
+                    prefix,
+                    max_seq_len=int(max_seq_len),
+                    velocity_head_sampling_params=vel_params,
+                )
+
+            if device == "cuda":
+                torch.cuda.synchronize()
+            progress(0.96, desc="Finalisation")
+
+            wav = wavs[0].detach().cpu().numpy()
+            logs = logbuf.getvalue() if debug else ""
+            print(f"[debug] synthesize walltime = {time.perf_counter()-t0:.2f}s")
+            return (_sampling_rate, wav), logs
+
+        except Exception as e:
+            import traceback as _tb
+            dur = time.perf_counter() - t0
+            msg = f"{type(e).__name__}: {e}\n\n[walltime={dur:.1f}s]\n"
+            logs = msg + logbuf.getvalue() + "\n" + _tb.format_exc()
+            if debug:
+                # Return the trace in the UI (textbox) without raising a Gradio exception
+                return None, logs
+            raise gr.Error(msg)
 
 
-    # --- IMPORTANT: VelocityHeadSamplingParams signature ---
-    # In the inference notebook, the class expects (cfg_ref, cfg, num_steps) WITHOUT 'temperature'.
-    # Try without temperature first, then fall back if the class accepts one.
-    try:
-        vel_params = VelocityHeadSamplingParams(
-            cfg_ref=float(cfg_ref),
-            cfg=float(cfg),
-            num_steps=int(steps)
-        )
-    except TypeError:
-        vel_params = VelocityHeadSamplingParams(
-            cfg_ref=float(cfg_ref),
-            cfg=float(cfg),
-            num_steps=int(steps),
-            temperature=float(temperature)
-        )
-
-    # Optional prefix
-    prefix = None
-    if ref_audio is not None:
-        if isinstance(ref_audio, str):
-            wav, sr = sf.read(ref_audio)
-        else:
-            sr, wav = ref_audio
-        wav = _to_mono_float32(np.array(wav))
-        wav_t = torch.from_numpy(wav).to(device)
-        import torchaudio
-        if sr != pardi.sampling_rate:
-            wav_t = torchaudio.functional.resample(wav_t, sr, pardi.sampling_rate)
-        wav_t = wav_t.unsqueeze(0)
-        with torch.inference_mode():
-            prefix_tokens = pardi.patchvae.encode(wav_t)
-        prefix = (ref_text or "", prefix_tokens[0])
-
-    print(f"[debug] has_prefix={prefix is not None}, steps={steps}, cfg={cfg}, cfg_ref={cfg_ref}, T={temperature}, max_seq_len={max_seq_len}, seed={seed}")
-
-    try:
-        with torch.inference_mode():
-            wavs, _ = pardi.text_to_speech(
-                [txt],
-                prefix,
-                max_seq_len=int(max_seq_len),
-                velocity_head_sampling_params=vel_params,
-
-            )
-    except Exception as e:
-        import traceback, sys
-        print("❌ text_to_speech failed:", e, file=sys.stderr)
-        traceback.print_exc()
-        raise gr.Error(f"Synthèse échouée: {type(e).__name__}: {e}")
-
-    wav = wavs[0].detach().cpu().numpy()
-    return (_sampling_rate, wav)
-
 def build_demo():
     with gr.Blocks(title="Lina-speech / pardi-speech Demo") as demo:
         gr.Markdown(
-            "## Lina-speech (pardi-speech) – Démo TTS
-            "Génère de l'audio à partir de texte, avec ou sans *prefix* (audio de référence)
+            "## Lina-speech (pardi-speech) – Démo TTS\n"
+            "Génère de l'audio à partir de texte, avec ou sans *prefix* (audio de référence).\n"
             "Paramètres avancés: *num_steps*, *CFG*, *température*, *max_seq_len*, *seed*."
         )
 
         with gr.Row():
             text = gr.Textbox(label="Texte à synthétiser", lines=4, placeholder="Tape ton texte ici…")
+            debug = gr.Checkbox(value=False, label="Mode debug (afficher la stacktrace)")
+
         with gr.Accordion("Prefix (optionnel)", open=False):
             ref_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Audio de référence")
-            ref_text
+            ref_text = gr.Textbox(label="Texte du prefix (si connu)", placeholder="Transcription du prefix (optionnel)")
+
         with gr.Accordion("Options avancées", open=False):
             with gr.Row():
                 steps = gr.Slider(1, 50, value=10, step=1, label="num_steps")
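The try/except TypeError above probes whether this build of VelocityHeadSamplingParams accepts a temperature keyword by simply attempting the construction. An equivalent one-shot probe can use inspect.signature; a sketch under the assumption of an ordinary __init__ (the helper name is hypothetical):

import inspect

def filtered_kwargs(cls, **kwargs):
    # Keep only keyword arguments the constructor actually declares
    accepted = set(inspect.signature(cls).parameters)
    return {k: v for k, v in kwargs.items() if k in accepted}

# vel_params = VelocityHeadSamplingParams(**filtered_kwargs(
#     VelocityHeadSamplingParams, cfg_ref=2.0, cfg=1.5, num_steps=10, temperature=1.0))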
@@ -150,19 +218,18 @@ def build_demo():
 
         btn = gr.Button("Synthétiser")
         out_audio = gr.Audio(label="Sortie audio", type="numpy")
+        logs_box = gr.Textbox(label="Logs (debug)", lines=10)
 
         demo.queue(default_concurrency_limit=1, max_size=32)
 
         btn.click(
             fn=synthesize,
-            inputs=[text, ref_audio, ref_text, steps, cfg, cfg_ref, temperature, max_seq_len, seed, lang_hint],
-            outputs=[out_audio]
+            inputs=[text, debug, ref_audio, ref_text, steps, cfg, cfg_ref, temperature, max_seq_len, seed, lang_hint],
+            outputs=[out_audio, logs_box],
         )
     return demo
 
+
 if __name__ == "__main__":
     demo = build_demo()
     demo.launch()
-    # retrigger 2025-10-29T16:27:55+01:00
-    # retrigger 2025-10-29T17:44:57+01:00
-    # retrigger 2025-10-29T18:59:12+01:00
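The watchdog added to synthesize exists because ZeroGPU terminates a call that exceeds its GPU budget with an opaque error; raising TimeoutError at checkpoints (110 s here, comfortably below the 200 s decorator budget) keeps the failure readable. The pattern in isolation (names are illustrative):

import time

T0 = time.perf_counter()
MAX_WALLTIME_S = 110  # stay below the @spaces.GPU duration

def checkpoint(stage: str) -> None:
    # Fail loudly and early instead of being killed mid-kernel
    elapsed = time.perf_counter() - T0
    if elapsed > MAX_WALLTIME_S:
        raise TimeoutError(f"{stage}: exceeded {MAX_WALLTIME_S}s ({elapsed:.1f}s)")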
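Prints from inside the GPU call can be awkward to surface in the UI, so the commit funnels stdout and stderr into a StringIO and ships the text back through a Gradio textbox. The capture idiom on its own, using only the standard library:

import io
from contextlib import redirect_stdout, redirect_stderr

logbuf = io.StringIO()
with redirect_stdout(logbuf), redirect_stderr(logbuf):
    print("everything printed here lands in logbuf")
logs = logbuf.getvalue()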